From 401bfcb0cd133d43f550dbe69275faf28663606b Mon Sep 17 00:00:00 2001
From: Udo Hoffmann
Date: Tue, 29 Sep 2020 14:12:04 +0200
Subject: [PATCH] Add new URL parser to replace inaccurate regex detection

Fix inaccurate URL detection and enhance URL detection for non-English character.

Fix #3912, fix #3353, fix #4643, fix #5029, fix #6155, fix #7791, fix #8634, close #8921
---
 PowerEditor/gcc/makefile | 2 +-
 PowerEditor/src/Notepad_plus.cpp | 434 ++++++++++++++++++++-
 PowerEditor/src/Notepad_plus.h | 2 -
 PowerEditor/visual.net/notepadPlus.vcxproj | 8 +-
 4 files changed, 424 insertions(+), 22 deletions(-)

diff --git a/PowerEditor/gcc/makefile b/PowerEditor/gcc/makefile
index a8418f0d..b043233a 100644
--- a/PowerEditor/gcc/makefile
+++ b/PowerEditor/gcc/makefile
@@ -250,7 +250,7 @@ CXX = $(CROSS_COMPILE)g++
 CXXFLAGS = $(INCLUDESPECIAL) -DTIXML_USE_STL -DTIXMLA_USE_STL $(UNICODE) -std=c++17 -fpermissive
 INCLUDES = $(patsubst %,-I%,$(DIRS)) -I./include
 LDFLAGS = -Wl,--subsystem,windows -municode -mwindows
-LIBS = -lcomdlg32 -lcomctl32 -lgdi32 -lole32 -loleacc -lshell32 -lshlwapi -ldbghelp -lversion -lcrypt32 -lsensapi -lwintrust -lwinmm -luuid
+LIBS = -lcomdlg32 -lcomctl32 -lgdi32 -lole32 -loleacc -lshell32 -lshlwapi -ldbghelp -lversion -lcrypt32 -lsensapi -lwininet -lwintrust -lwinmm -luuid
 
 RC = $(CROSS_COMPILE)windres
 
diff --git a/PowerEditor/src/Notepad_plus.cpp b/PowerEditor/src/Notepad_plus.cpp
index 988a0506..4a226af5 100644
--- a/PowerEditor/src/Notepad_plus.cpp
+++ b/PowerEditor/src/Notepad_plus.cpp
@@ -27,6 +27,7 @@
 
 #include
 #include
+#include <wininet.h>
 #include "Notepad_plus.h"
 #include "Notepad_plus_Window.h"
 #include "FileDialog.h"
@@ -2514,6 +2515,394 @@ void Notepad_plus::setUniModeText()
 	_statusBar.setText(uniModeTextString.c_str(), STATUSBAR_UNICODE_TYPE);
 }
 
+bool isUrlSchemeStartChar(TCHAR const c) // true for the ASCII letters A-Z / a-z, the only characters accepted inside a scheme
+{
+	return ((c >= 'A') && (c <= 'Z'))
+		|| ((c >= 'a') && (c <= 'z'));
+}
+
+bool isUrlSchemeDelimiter(TCHAR const c) // characters allowed immediately before scheme: anything except ASCII letters, digits and '_'
+{
+	return ! (((c >= '0') && (c <= '9'))
+		|| ((c >= 'A') && (c <= 'Z'))
+		|| ((c >= 'a') && (c <= 'z'))
+		|| (c == '_'));
+}
+
+bool isUrlTextChar(TCHAR const c) // false for everything up to and including ' ', for DEL and for the characters " # ' < > ?
+{
+	if (c <= ' ') return false;
+	switch (c)
+	{
+		case '"':
+		case '#':
+		case '\'':
+		case '<':
+		case '>':
+		case '?':
+		case '\x7f': // DEL; the former '\0x7f' was a multi-character literal ('\0','x','7','f'), not the code 0x7f
+			return false;
+	}
+	return true;
+}
+
+bool isUrlQueryDelimiter(TCHAR const c) // true for the separators '&', '+', '=' and ';' between query values
+{
+	switch(c)
+	{
+		case '&':
+		case '+':
+		case '=':
+		case ';':
+			return true;
+	}
+	return false;
+}
+
+bool isUrlSchemeSupported(INTERNET_SCHEME s) // true for the ftp, http, https, mailto and file schemes only
+{
+	switch (s)
+	{
+		case INTERNET_SCHEME_FTP:
+		case INTERNET_SCHEME_HTTP:
+		case INTERNET_SCHEME_HTTPS:
+		case INTERNET_SCHEME_MAILTO:
+		case INTERNET_SCHEME_FILE:
+			return true;
+	}
+	return false;
+}
+
+// scanToUrlStart searches for a possible URL in text, beginning at index start and stopping at textLen.
+// If a possible URL is found, then:
+// - True is returned.
+// - The number of characters between start and the beginning of the URL candidate is stored in *distance.
+// - The length of the URL scheme (its letters plus the ':') is stored in *schemeLength.
+// If no URL is found, then:
+// - False is returned.
+// - The number of characters between start and the end of text is stored in *distance, and *schemeLength is set to 0.
+bool scanToUrlStart(TCHAR *text, int textLen, int start, int* distance, int* schemeLength)
+{
+	int p = start;
+	int p0 = 0; // index at which the current scheme candidate began
+	enum {sUnknown, sScheme} s = sUnknown;
+	while (p < textLen)
+	{
+		switch (s)
+		{
+			case sUnknown: // looking for a letter that is at index 0 or preceded by a scheme delimiter
+				if (isUrlSchemeStartChar(text [p]) && ((p == 0) || isUrlSchemeDelimiter(text [p - 1])))
+				{
+					p0 = p;
+					s = sScheme;
+				}
+				break;
+
+			case sScheme: // inside a run of letters: ':' completes the scheme, any other non-letter abandons it
+				if (text [p] == ':')
+				{
+					*distance = p0 - start;
+					*schemeLength = p - p0 + 1;
+					return true;
+				}
+				if (!isUrlSchemeStartChar(text [p]))
+					s = sUnknown;
+				break;
+		}
+		p++;
+	}
+	*schemeLength = 0;
+	*distance = p - start;
+	return false;
+}
+
+// scanToUrlEnd searches for the end of a URL, coarsely parsing its main parts HostAndPath, Query and Fragment.
+//
+// In the query part, a simple pattern is enforced, to avoid that everything goes through as a query.
+// The pattern is kept simple, since there seem to be many different forms of queries used in the world.
+// The objective here is not to detect whether or not a query is malformed. The objective is, to let through
+// most of the real world's queries, and to sort out what is certainly not a query.
+//
+// The approach is:
+// - A query begins with '?', followed by any number of values,
+//   which are separated by a single delimiter character '&', '+', '=' or ';'.
+// - Each value may be enclosed in single or double quotes.
+//
+// The query pattern going through looks like this:
+// - ?abc;def;fgh="i j k"&'l m n'+opq
+//
+void scanToUrlEnd(TCHAR *text, int textLen, int start, int* distance) // *distance receives the number of characters from start to the detected end
+{
+	int p = start;
+	TCHAR q = 0; // quote character that opened the current quoted query value
+	enum {sHostAndPath, sQuery, sQueryAfterDelimiter, sQueryQuotes, sQueryAfterQuotes, sFragment} s = sHostAndPath;
+	while (p < textLen)
+	{
+		switch (s)
+		{
+			case sHostAndPath: // everything before the first '?' or '#'
+				if (text [p] == '?')
+					s = sQuery;
+				else if (text [p] == '#')
+					s = sFragment;
+				else if (!isUrlTextChar (text [p]))
+				{
+					*distance = p - start;
+					return;
+				}
+				break;
+
+			case sQuery: // inside an unquoted query value
+				if (text [p] == '#')
+					s = sFragment;
+				else if (isUrlQueryDelimiter (text [p]))
+					s = sQueryAfterDelimiter;
+				else if (!isUrlTextChar(text [p]))
+				{
+					*distance = p - start;
+					return;
+				}
+				break;
+
+			case sQueryAfterDelimiter: // directly after a query delimiter: the only place where a quoted value may start
+				if ((text [p] == '\'') || (text [p] == '"'))
+				{
+					q = text [p];
+					s = sQueryQuotes;
+				}
+				else if (isUrlTextChar(text [p]))
+					s = sQuery;
+				else
+				{
+					*distance = p - start;
+					return;
+				}
+				break;
+
+			case sQueryQuotes: // inside a quoted value, until the same quote character q is seen again
+				if (text [p] < ' ') // characters below ' ' end the URL even inside quotes
+				{
+					*distance = p - start;
+					return;
+				}
+				if (text [p] == q)
+					s = sQueryAfterQuotes;
+				break;
+
+			case sQueryAfterQuotes: // directly after the closing quote: only a query delimiter may follow
+				if (isUrlQueryDelimiter (text [p]))
+					s = sQueryAfterDelimiter;
+				else
+				{
+					*distance = p - start;
+					return;
+				}
+				break;
+
+			case sFragment: // after '#', up to the first character rejected by isUrlTextChar
+				if (!isUrlTextChar(text [p]))
+				{
+					*distance = p - start;
+					return;
+				}
+				break;
+		}
+		p++;
+	}
+	*distance = p - start; // reached textLen without finding a terminating character
+}
+
+// 
removeUnwantedTrailingCharFromUrl removes a single unwanted trailing character from a URL of *length characters at text.
+// It has to be called repeatedly, until it returns false, meaning that all unwanted characters are gone.
+bool removeUnwantedTrailingCharFromUrl (TCHAR const *text, int* length)
+{
+	int l = *length - 1; // index of the last character
+	if (l <= 0) return false; // a URL of one character (or less) is never shortened
+	{ // remove unwanted single characters
+		const TCHAR *singleChars = TEXT(".,:;?!#");
+		for (int i = 0; singleChars [i]; i++)
+			if (text [l] == singleChars [i])
+			{
+				*length = l;
+				return true;
+			}
+	}
+	{ // remove unwanted closing parenthesis
+		const TCHAR *closingParenthesis = TEXT(")]}>");
+		const TCHAR *openingParenthesis = TEXT("([{<");
+		for (int i = 0; closingParenthesis [i]; i++)
+			if (text [l] == closingParenthesis [i])
+			{
+				int count = 1; // the trailing one, plus closing minus opening ones of the same kind before it
+				for (int j = l - 1; j >= 0; j--)
+				{
+					if (text [j] == closingParenthesis [i])
+						count++;
+					if (text [j] == openingParenthesis [i])
+						count--;
+				}
+				if (count == 0) // exactly balanced inside the URL: keep it
+					return false;
+				*length = l;
+				return true;
+			}
+	}
+	{ // remove unwanted quotes
+		const TCHAR *quotes = TEXT("\"'`");
+		for (int i = 0; quotes [i]; i++)
+		{
+			if (text [l] == quotes [i])
+			{
+				int count = 0; // number of the same quote character before the trailing one
+				for (int j = l - 1; j >= 0; j--)
+					if (text [j] == quotes [i])
+						count++;
+
+				if (count & 1) // odd: the trailing quote closes a pair inside the URL, keep it
+					return false;
+				*length = l;
+				return true;
+			}
+		}
+	}
+	return false;
+}
+
+bool isSlashOrBackslash(TCHAR const c) // true for '/' and '\\'
+{
+	return (c == '/') || (c == '\\');
+}
+
+bool isFilenameChar(TCHAR const c, bool const quoted) // false below ' ', for ' ' unless quoted, for DEL and for " % * / < > : ? | and backslash
+{
+	if (c < ' ')
+		return false;
+
+	if ((c == ' ') && (!quoted))
+		return false;
+
+	switch (c)
+	{
+		case '"':
+		case '%':
+		case '*':
+		case '/':
+		case '<':
+		case '>':
+		case ':':
+		case '?':
+		case '|':
+		case '\\':
+		case '\x7f': // DEL; the former '\0x7f' was a multi-character literal ('\0','x','7','f'), not the code 0x7f
+			return false;
+	}
+	return true;
+}
+
+// scanToFileEnd searches for the end of a filename, coarsely parsing it into prefix and name.
+// The prefix parsing is done to avoid multiple colons.
+// The parameter quoted specifies, whether spaces are allowed.
+void scanToFileEnd(TCHAR *text, int textLen, int start, bool quoted, int* distance) // *distance receives the number of characters from start to the detected end
+{
+	int p = start;
+	enum {sStart, sPrefix, sColon, sName} s = sStart;
+	while (p < textLen)
+	{
+		switch (s)
+		{
+			case sStart: // leading slashes and backslashes; the first name character is tested with quoted == false, so it cannot be a space
+				if (isFilenameChar (text [p], false))
+					s = sPrefix;
+				else if (!isSlashOrBackslash(text [p]))
+				{
+					*distance = p - start;
+					return;
+				}
+				break;
+
+			case sPrefix: // directly after the first name character: the only position where a ':' is accepted
+				if (isSlashOrBackslash(text [p]) || isFilenameChar(text [p], quoted))
+					s = sName;
+				else if (text [p] == ':')
+					s = sColon;
+				else
+				{
+					*distance = p - start;
+					return;
+				}
+				break;
+
+			case sColon: // a ':' has to be followed by a slash or backslash
+				if (isSlashOrBackslash(text[p]))
+					s = sName;
+				else
+				{
+					*distance = p - start;
+					return;
+				}
+				break;
+
+			case sName: // remainder: slashes, backslashes and filename characters
+				if (! (isSlashOrBackslash(text [p]) || isFilenameChar(text [p], quoted)))
+				{
+					*distance = p - start;
+					return;
+				}
+				break;
+		}
+		p++;
+	}
+	*distance = p - start; // reached textLen without finding a terminating character
+}
+
+// isUrl checks, whether there is a valid URL at index start of text.
+// If yes:
+// - True is returned.
+// - The length of the URL is stored in *segmentLen.
+// If no:
+// - False is returned.
+// - The number of characters between start and the next URL candidate is stored in *segmentLen; if the candidate at start itself is rejected, the length of its leading run of scheme letters is stored instead.
+// - If no URL is found at all, then the number of characters between start and the end of text is stored in *segmentLen.
+bool isUrl(TCHAR * text, int textLen, int start, int* segmentLen)
+{
+	int dist = 0, schemeLen = 0;
+	if (scanToUrlStart(text, textLen, start, & dist, & schemeLen))
+	{
+		if (dist) // the candidate begins later: report the non-URL gap in front of it
+		{
+			*segmentLen = dist;
+			return false;
+		}
+		int len = 0;
+		scanToUrlEnd (text, textLen, start + schemeLen, & len);
+		if (len) // something follows the scheme
+		{
+			len += schemeLen;
+			URL_COMPONENTS url;
+			memset (& url, 0, sizeof(url));
+			url.dwStructSize = sizeof(url);
+			bool r = InternetCrackUrl(& text [start], len, 0, & url) && isUrlSchemeSupported(url.nScheme);
+			if (r)
+			{
+				while (removeUnwantedTrailingCharFromUrl (& text [start], & len)); // strip trailing characters one at a time
+				if (url.nScheme == INTERNET_SCHEME_FILE)
+				{
+					scanToFileEnd (text, textLen, start + schemeLen, (start > 0) && (text [start - 1] == '"'), & len); // file URLs get their length from the filename scanner; spaces allowed only after an opening '"'
+					len += schemeLen;
+				}
+				*segmentLen = len;
+				return true;
+			}
+		}
+		len = 1; // rejected candidate: skip its first character and the letters following it
+		int lMax = textLen - start;
+		while ((len < lMax) && isUrlSchemeStartChar(text[start+len])) len++; // bound is checked first, so text [textLen] is never read
+		*segmentLen = len;
+		return false;
+	}
+	*segmentLen = dist;
+	return false;
+}
 
 void Notepad_plus::addHotSpot(ScintillaEditView* view)
 {
@@ -2547,23 +2936,38 @@ void Notepad_plus::addHotSpot(ScintillaEditView* view)
 	LRESULT indicFore = pView->execute(SCI_STYLEGETFORE, STYLE_DEFAULT);
 	pView->execute(SCI_SETINDICATORVALUE, indicFore);
 
-	pView->execute(SCI_SETSEARCHFLAGS, SCFIND_REGEXP|SCFIND_POSIX);
-	pView->execute(SCI_SETTARGETRANGE, startPos, endPos);
-	int posFound = static_cast(pView->execute(SCI_SEARCHINTARGET, strlen(URL_REG_EXPR), reinterpret_cast(URL_REG_EXPR)));
-
-	while (posFound != -1 && posFound != -2)
+	UINT cp = static_cast(pView->execute(SCI_GETCODEPAGE));
+	char *encodedText = new char[endPos - startPos + 1];
+	pView->getText(encodedText, startPos, endPos);
+	TCHAR *wideText = new TCHAR[endPos - startPos + 1];
+	int wideTextLen = MultiByteToWideChar(cp, 0, encodedText, endPos - startPos + 1, (LPWSTR) wideText, endPos - startPos + 1) - 1;
+	delete[] encodedText;
+	if (wideTextLen > 0)
 	{
-		int end = 
int(pView->execute(SCI_GETTARGETEND)); - int foundTextLen = end - posFound; - if (posFound > startPos) - pView->execute(SCI_INDICATORCLEARRANGE, startPos, posFound - startPos); - pView->execute(SCI_INDICATORFILLRANGE, posFound, foundTextLen); - startPos = posFound + foundTextLen; - pView->execute(SCI_SETTARGETRANGE, startPos, endPos); - posFound = static_cast(pView->execute(SCI_SEARCHINTARGET, strlen(URL_REG_EXPR), reinterpret_cast(URL_REG_EXPR))); + int startWide = 0; + int lenWide = 0; + int startEncoded = 0; + int lenEncoded = 0; + while (true) + { + bool r = isUrl(wideText, wideTextLen, startWide, & lenWide); + if (lenWide <= 0) + break; + assert ((startWide + lenWide) <= wideTextLen); + lenEncoded = WideCharToMultiByte(cp, 0, & wideText [startWide], lenWide, NULL, 0, NULL, NULL); + if (r) + pView->execute(SCI_INDICATORFILLRANGE, startEncoded + startPos, lenEncoded); + else + pView->execute(SCI_INDICATORCLEARRANGE, startEncoded + startPos, lenEncoded); + startWide += lenWide; + startEncoded += lenEncoded; + if ((startWide >= wideTextLen) || ((startEncoded + startPos) >= endPos)) + break; + } + assert ((startEncoded + startPos) == endPos); + assert (startWide == wideTextLen); } - if (endPos > startPos) - pView->execute(SCI_INDICATORCLEARRANGE, startPos, endPos - startPos); + delete[] wideText; } bool Notepad_plus::isConditionExprLine(int lineNumber) diff --git a/PowerEditor/src/Notepad_plus.h b/PowerEditor/src/Notepad_plus.h index 8ac98ade..7917361c 100644 --- a/PowerEditor/src/Notepad_plus.h +++ b/PowerEditor/src/Notepad_plus.h @@ -61,8 +61,6 @@ #define MENU 0x01 #define TOOLBAR 0x02 -#define URL_REG_EXPR "[A-Za-z]+://[A-Za-z0-9_\\-\\+~.:?&@=/%#,;\\{\\}\\(\\)\\[\\]\\|\\*\\!\\\\]+" - enum FileTransferMode { TransferClone = 0x01, TransferMove = 0x02 diff --git a/PowerEditor/visual.net/notepadPlus.vcxproj b/PowerEditor/visual.net/notepadPlus.vcxproj index f801b02a..939fdc45 100755 --- a/PowerEditor/visual.net/notepadPlus.vcxproj +++ 
b/PowerEditor/visual.net/notepadPlus.vcxproj @@ -111,7 +111,7 @@ /fixed:no %(AdditionalOptions) - comctl32.lib;shlwapi.lib;shell32.lib;Oleacc.lib;Dbghelp.lib;Version.lib;Crypt32.lib;wintrust.lib;Sensapi.lib;winmm.lib;%(AdditionalDependencies) + comctl32.lib;shlwapi.lib;shell32.lib;Oleacc.lib;Dbghelp.lib;Version.lib;Crypt32.lib;wintrust.lib;Sensapi.lib;winmm.lib;wininet.lib;%(AdditionalDependencies) LinkVerboseLib $(OutDir)notepad++.exe 1.0 @@ -149,7 +149,7 @@ /fixed:no %(AdditionalOptions) - comctl32.lib;shlwapi.lib;shell32.lib;Oleacc.lib;Dbghelp.lib;Version.lib;Crypt32.lib;wintrust.lib;Sensapi.lib;winmm.lib;%(AdditionalDependencies) + comctl32.lib;shlwapi.lib;shell32.lib;Oleacc.lib;Dbghelp.lib;Version.lib;Crypt32.lib;wintrust.lib;Sensapi.lib;winmm.lib;wininet.lib;%(AdditionalDependencies) LinkVerboseLib $(OutDir)notepad++.exe 1.0 @@ -192,7 +192,7 @@ true - comctl32.lib;shlwapi.lib;shell32.lib;Oleacc.lib;Dbghelp.lib;Version.lib;Crypt32.lib;wintrust.lib;Sensapi.lib;winmm.lib;%(AdditionalDependencies) + comctl32.lib;shlwapi.lib;shell32.lib;Oleacc.lib;Dbghelp.lib;Version.lib;Crypt32.lib;wintrust.lib;Sensapi.lib;winmm.lib;wininet.lib;%(AdditionalDependencies) LinkVerboseLib $(OutDir)notepad++.exe 1.0 @@ -244,7 +244,7 @@ copy ..\src\contextMenu.xml ..\bin\contextMenu.xml true - comctl32.lib;shlwapi.lib;shell32.lib;Oleacc.lib;Dbghelp.lib;Version.lib;Crypt32.lib;wintrust.lib;Sensapi.lib;winmm.lib;%(AdditionalDependencies) + comctl32.lib;shlwapi.lib;shell32.lib;Oleacc.lib;Dbghelp.lib;Version.lib;Crypt32.lib;wintrust.lib;Sensapi.lib;winmm.lib;wininet.lib;%(AdditionalDependencies) LinkVerboseLib $(OutDir)notepad++.exe 1.0