notepad-plus-plus-legacy/scintilla/boostregex/BoostRegExSearch.cxx

/**
 * Copyright (c) since 2009 Simon Steele - http://untidy.net/
 * Based on the work of Simon Steele for Programmer's Notepad 2 (http://untidy.net)
 * Converted from boost::xpressive to boost::regex and performance improvements 
 * (principally caching the compiled regex), and support for UTF8 encoded text
 * (c) 2012 Dave Brotherstone - Changes for boost::regex
 *
 * 
 */
#include <stdlib.h>
#include <string.h>
#include <iterator> 
#include <stdexcept>
#include "scintilla.h"
#include "Platform.h"
#include "SplitVector.h"
#include "Partitioning.h"
#include "RunStyles.h"
#include "CellBuffer.h"
#include "CharClassify.h"
#include "Decoration.h"
#include "ILexer.h"
#include "Document.h"
#include "UniConversion.h"
#include "UTF8DocumentIterator.h"
#include "AnsiDocumentIterator.h"
#include "BoostRegexSearch.h"
#include <boost/regex.hpp>
#define CP_UTF8 65001
#define SC_CP_UTF8 65001


#ifdef SCI_NAMESPACE
using namespace Scintilla;
#endif

using namespace boost;


typedef basic_regex<char> charregex_t;
typedef boost::wregex wcharregex_t;
// , std::vector<boost::sub_match<DocumentIterator> >::allocator_type
typedef match_results<UTF8DocumentIterator> utf8match_t;
typedef match_results<AnsiDocumentIterator> ansimatch_t;

/**
 * Regular expression search engine built on boost::regex.
 *
 * Two compiled expressions are cached: a wide-character one used when the
 * document's code page is UTF-8 and a narrow one used otherwise. The result
 * of the most recent search is kept so that SubstituteByPosition() can expand
 * a replacement template against it.
 */
class BoostRegexSearch : public RegexSearchBase
{
public:
	BoostRegexSearch() : substituted(NULL), lastCompileFlags(-1) {}
	
	virtual ~BoostRegexSearch()
	{
		// delete[] on a null pointer is a no-op, so no guard is required.
		delete [] substituted;
		substituted = NULL;
	}

	
	/// Search the document for the regular expression `s`; see the definition for details.
	virtual long FindText(Document* doc, int minPos, int maxPos, const char *s,
                        bool caseSensitive, bool word, bool wordStart, int flags, int *length);
	

	/// Expand the replacement template `text` against the last match found by FindText().
	virtual const char *SubstituteByPosition(Document* doc, const char *text, int *length);

private:
	// Not copyable: the object owns the `substituted` buffer, so a member-wise
	// copy would end with the same buffer being released twice.
	// Declared but intentionally left undefined.
	BoostRegexSearch(const BoostRegexSearch&);
	BoostRegexSearch& operator=(const BoostRegexSearch&);

	/// Returns a new[]-allocated wide copy of a UTF-8 string; the caller releases it with delete[].
	wchar_t *utf8ToWchar(const char *utf8);
	/// Returns a new[]-allocated UTF-8 copy of a wide string; the caller releases it with delete[].
	char    *wcharToUtf8(const wchar_t *w);

	charregex_t m_charre;      ///< Cached expression for non-UTF-8 documents.
	wcharregex_t m_wcharre;    ///< Cached expression for UTF-8 documents.
	
	utf8match_t m_utf8match;   ///< Result of the last search in a UTF-8 document.
	ansimatch_t m_ansimatch;   ///< Result of the last search in a non-UTF-8 document.
	
	char *substituted;                  ///< Buffer handed out by SubstituteByPosition(); owned by this object.
	std::string m_lastRegexString;      ///< Pattern text last compiled into m_charre.
	std::string m_lastRegexUtf8string;  ///< Pattern text last compiled into m_wcharre.
	int lastCompileFlags;               ///< Flags used for the last compile; -1 until something has been compiled.
};

#ifdef SCI_NAMESPACE
namespace Scintilla
{
#endif

/**
 * Factory for the regex search engine: allocates a BoostRegexSearch and
 * returns it through the RegexSearchBase interface.
 * The character classification table argument is not used by this engine.
 * The object is allocated with new; presumably the caller is responsible
 * for deleting it (not visible from this file).
 */
RegexSearchBase *CreateRegexSearch(CharClassify* /* charClassTable */)
{
	RegexSearchBase *engine = new BoostRegexSearch();
	return engine;
}

#ifdef SCI_NAMESPACE
}
#endif

/**
 * Find text in document, supporting both forward and backward
 * searches (just pass minPos > maxPos to do a backward search)
 */
 

long BoostRegexSearch::FindText(Document* doc, int minPos, int maxPos, const char *s,
                        bool caseSensitive, bool /*word*/, bool /*wordStart*/, int searchFlags, int *length) 
{
	int startPos, endPos, increment;

	if (minPos > maxPos)
	{
		startPos = maxPos;
		endPos = minPos;
		increment = -1;
	}
	else
	{
		startPos = minPos;
		endPos = maxPos;
		increment = 1;
	}

	// Range endpoints should not be inside DBCS characters, but just in case, move them.
	startPos = doc->MovePositionOutsideChar(startPos, 1, false);
	endPos = doc->MovePositionOutsideChar(endPos, 1, false);

	
	int compileFlags(regex_constants::ECMAScript);
	if (!caseSensitive)
	{
		compileFlags |= regex_constants::icase;
	}
	bool isUtf8 = (doc->CodePage() == SC_CP_UTF8);
	
	try
	{
		
		if (compileFlags != lastCompileFlags  
			|| (isUtf8 && m_lastRegexUtf8string != s)
			|| (!isUtf8 && m_lastRegexString != s)) // Test to see if we're called with the same
			                      // regex as last time, if we are, then we don't need to recompile it
		{
			if (isUtf8)
			{
				const wchar_t* wchars = utf8ToWchar(s);
				m_wcharre = wcharregex_t(wchars, static_cast<regex_constants::syntax_option_type>(compileFlags));
				delete [] wchars;
				m_lastRegexUtf8string = s;
			}
			else
			{ // Ansi
				m_charre = charregex_t(s, static_cast<regex_constants::syntax_option_type>(compileFlags));
				m_lastRegexString = s;
			}
			lastCompileFlags = compileFlags;
		}
	}
	
	catch(regex_error& /*ex*/)
	{
		// -1 is normally used for not found, -2 is used here for invalid regex
		return -2;
	}

	// Work out the range of lines we're searching across, moving beyond an empty end-of-line
	int lineRangeStart = doc->LineFromPosition(startPos);
	int lineRangeEnd = doc->LineFromPosition(endPos);
	if ((increment == 1) &&
		(startPos >= doc->LineEnd(lineRangeStart)) &&
		(lineRangeStart < lineRangeEnd)) 
	{
		// the start position is at end of line or between line end characters.
		lineRangeStart++;
		startPos = doc->LineStart(lineRangeStart);
	}

	regex_constants::match_flag_type flags(regex_constants::match_default);

	
	// Work out the flags:
	if (startPos != doc->LineStart(lineRangeStart))
	{
		flags |= regex_constants::match_not_bol;
	}

	if (endPos != doc->LineEnd(lineRangeEnd))
	{
		flags |= regex_constants::match_not_eol;
	}

	if (0 == (searchFlags & SCFIND_REGEXP_DOTMATCHESNL))
	{
		flags |= regex_constants::match_not_dot_newline;
	}
	
	int pos(-1);
	int lenRet(0);
	
	
	if (doc->CodePage() == SC_CP_UTF8)
	{
		UTF8DocumentIterator end(doc, endPos, endPos);
		bool success = boost::regex_search(UTF8DocumentIterator(doc, startPos, endPos), end, m_utf8match, m_wcharre, flags);
		if (success)
		{
			pos = m_utf8match[0].first.pos();
			lenRet = m_utf8match[0].second.pos() - pos;
			
			if (increment == -1)
			{
				// Check for the last match on this line.
				int repetitions = 100;	// Break out of infinite loop
				int previousPos = pos;
				while (success && ((pos + lenRet) <= endPos)) 
				{
					if (previousPos >= pos && 0 >= (--repetitions))
						break;
					previousPos = pos;
					success = regex_search(UTF8DocumentIterator(doc, pos + 1, endPos), end, m_utf8match, m_wcharre, flags);
					// success = regex_search(DocumentIterator(doc, pos + 1, endPos), end, match, re, static_cast<regex_constants::match_flag_type>(flags));
					if (success) 
					{
						if ((pos + lenRet) <= minPos) 
						{
							pos = m_utf8match[0].first.pos();
							lenRet = m_utf8match[0].second.pos() - pos;
						} 
						else 
						{
							success = 0;
						}
					}
				}
			}
			
			*length = lenRet;
		}
	}
	else
	{
		AnsiDocumentIterator end(doc, endPos, endPos);
		
		bool success = boost::regex_search(AnsiDocumentIterator(doc, startPos, endPos), end, m_ansimatch, m_charre, flags);
		if (success)
		{
			pos = m_ansimatch[0].first.pos();
			lenRet = m_ansimatch.length();
			
			if (increment == -1)
			{
				// Check for the last match on this line.
				int repetitions = 100;	// Break out of infinite loop
				int previousPos = pos;
				while (success && ((pos + lenRet) <= endPos)) 
				{
					if (previousPos >= pos && 0 >= (--repetitions))
						break;
					previousPos = pos;
					success = regex_search(AnsiDocumentIterator(doc, pos + 1, endPos), end, m_ansimatch, m_charre, flags);
					// success = regex_search(DocumentIterator(doc, pos + 1, endPos), end, match, re, static_cast<regex_constants::match_flag_type>(flags));
					if (success) 
					{
						if ((pos + lenRet) <= minPos) 
						{
							pos = m_ansimatch[0].first.pos();
							lenRet = m_ansimatch[0].length();
						} 
						else 
						{
							success = 0;
						}
					}
				}
			}
			
			*length = lenRet;
		}
	}
	
	return pos;
}


/**
 * Expand the replacement template `text` against the match stored by the
 * last search and return the resulting text.
 *
 * @param doc     Document the match was found in; its code page selects
 *                between the UTF-8 and the non-UTF-8 match result.
 * @param text    Replacement template, expanded with boost::format_all.
 * @param length  Receives the number of bytes in the returned text,
 *                excluding the terminating NUL.
 * @return Buffer owned by this object; it is released on the next call
 *         and by the destructor.
 */
const char *BoostRegexSearch::SubstituteByPosition(Document* doc, const char *text, int *length) {
	delete []substituted;
	substituted = NULL;
	if (doc->CodePage() == SC_CP_UTF8)
	{
		const wchar_t* wtext = utf8ToWchar(text);
		std::wstring replaced;
		try
		{
			replaced = m_utf8match.format(wtext, boost::format_all);
		}
		catch (...)
		{
			// Release the converted template before the exception propagates.
			delete[] wtext;
			throw;
		}
		delete[] wtext;
		substituted = wcharToUtf8(replaced.c_str());
		*length = strlen(substituted);
	}
	else
	{
		const std::string replaced = m_ansimatch.format(text, boost::format_all);
		const size_t size = replaced.size();
		char *buffer = new char[size + 1];
		// Copy by size rather than with strcpy: the expanded text may contain
		// NUL bytes, and every byte covered by *length must be initialised.
		memcpy(buffer, replaced.data(), size);
		buffer[size] = '\0';
		substituted = buffer;
		*length = static_cast<int>(size);
	}
	return substituted;
}

/**
 * Convert a NUL-terminated UTF-8 string to a NUL-terminated wide string.
 * The result is allocated with new[]; the caller releases it with delete[].
 */
wchar_t *BoostRegexSearch::utf8ToWchar(const char *utf8)
{
	const int byteCount = strlen(utf8);
	const int unitCount = UTF16Length(utf8, byteCount);
	const int capacity = unitCount + 1;	// one extra slot for the terminator

	wchar_t *wide = new wchar_t[capacity];
	UTF16FromUTF8(utf8, byteCount, wide, capacity);
	wide[unitCount] = 0;
	return wide;
}

/**
 * Convert a NUL-terminated wide string to a NUL-terminated UTF-8 string.
 * The result is allocated with new[]; the caller releases it with delete[].
 */
char* BoostRegexSearch::wcharToUtf8(const wchar_t *w)
{
	const int unitCount = wcslen(w);
	const int byteCount = UTF8Length(w, unitCount);

	char *narrow = new char[byteCount + 1];	// one extra byte for the terminator
	UTF8FromUTF16(w, unitCount, narrow, byteCount);
	narrow[byteCount] = 0;
	return narrow;
}
[NEW_FEATURE] (Author: Dave Brotherstone) Add PCRE (Perl Compatible Regular Expressions) support. git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@863 f5eea248-9336-0410-98b8-ebc06183d4e3 2012-02-13 01:45:05 +00:00			`/**`
			`* Copyright (c) since 2009 Simon Steele - http://untidy.net/`
			`* Based on the work of Simon Steele for Programmer's Notepad 2 (http://untidy.net)`
			`* Converted from boost::xpressive to boost::regex and performance improvements`
			`* (principally caching the compiled regex), and support for UTF8 encoded text`
			`* (c) 2012 Dave Brotherstone - Changes for boost::regex`
			`*`
			`*`
			`*/`
			`#include <stdlib.h>`
			`#include <iterator>`
			`#include "scintilla.h"`
			`#include "Platform.h"`
			`#include "SplitVector.h"`
			`#include "Partitioning.h"`
			`#include "RunStyles.h"`
			`#include "CellBuffer.h"`
			`#include "CharClassify.h"`
			`#include "Decoration.h"`
			`#include "ILexer.h"`
			`#include "Document.h"`
			`#include "UniConversion.h"`
			`#include "UTF8DocumentIterator.h"`
			`#include "AnsiDocumentIterator.h"`
			`#include "BoostRegexSearch.h"`
			`#include <boost/regex.hpp>`
			`#define CP_UTF8 65001`
			`#define SC_CP_UTF8 65001`



			`#ifdef SCI_NAMESPACE`
			`using namespace Scintilla;`
			`#endif`

			`using namespace boost;`


			`typedef basic_regex<char> charregex_t;`
			`typedef boost::wregex wcharregex_t;`
			`// , std::vector<boost::sub_match<DocumentIterator> >::allocator_type`
			`typedef match_results<UTF8DocumentIterator> utf8match_t;`
			`typedef match_results<AnsiDocumentIterator> ansimatch_t;`

			`class BoostRegexSearch : public RegexSearchBase`
			`{`
			`public:`
			`BoostRegexSearch() : substituted(NULL), lastCompileFlags(-1) {}`

			`virtual ~BoostRegexSearch()`
			`{`
			`if (substituted)`
			`{`
			`delete [] substituted;`
			`substituted = NULL;`
			`}`
			`}`


			`virtual long FindText(Document* doc, int minPos, int maxPos, const char *s,`
			`bool caseSensitive, bool word, bool wordStart, int flags, int *length);`



			`virtual const char SubstituteByPosition(Document doc, const char text, int length);`

			`private:`
			`wchar_t utf8ToWchar(const char utf8);`
			`char wcharToUtf8(const wchar_t w);`

			`charregex_t m_charre;`
			`wcharregex_t m_wcharre;`

			`utf8match_t m_utf8match;`
			`ansimatch_t m_ansimatch;`

			`char *substituted;`
			`std::string m_lastRegexString;`
			`std::string m_lastRegexUtf8string;`
			`int lastCompileFlags;`
			`};`

			`#ifdef SCI_NAMESPACE`
			`namespace Scintilla`
			`{`
			`#endif`

			`RegexSearchBase CreateRegexSearch(CharClassify /* charClassTable */)`
			`{`
			`return new BoostRegexSearch();`
			`}`

			`#ifdef SCI_NAMESPACE`
			`}`
			`#endif`

			`/**`
			`* Find text in document, supporting both forward and backward`
			`* searches (just pass minPos > maxPos to do a backward search)`
			`*/`


			`long BoostRegexSearch::FindText(Document* doc, int minPos, int maxPos, const char *s,`
			`bool caseSensitive, bool /word/, bool /wordStart/, int searchFlags, int *length)`
			`{`
			`int startPos, endPos, increment;`

			`if (minPos > maxPos)`
			`{`
			`startPos = maxPos;`
			`endPos = minPos;`
			`increment = -1;`
			`}`
			`else`
			`{`
			`startPos = minPos;`
			`endPos = maxPos;`
			`increment = 1;`
			`}`

			`// Range endpoints should not be inside DBCS characters, but just in case, move them.`
			`startPos = doc->MovePositionOutsideChar(startPos, 1, false);`
			`endPos = doc->MovePositionOutsideChar(endPos, 1, false);`


			`int compileFlags(regex_constants::ECMAScript);`
			`if (!caseSensitive)`
			`{`
			`compileFlags \|= regex_constants::icase;`
			`}`
			`bool isUtf8 = (doc->CodePage() == SC_CP_UTF8);`

			`try`
			`{`

			`if (compileFlags != lastCompileFlags`
			`\|\| (isUtf8 && m_lastRegexUtf8string != s)`
			`\|\| (!isUtf8 && m_lastRegexString != s)) // Test to see if we're called with the same`
			`// regex as last time, if we are, then we don't need to recompile it`
			`{`
			`if (isUtf8)`
			`{`
			`const wchar_t* wchars = utf8ToWchar(s);`
			`m_wcharre = wcharregex_t(wchars, static_cast<regex_constants::syntax_option_type>(compileFlags));`
			`delete [] wchars;`
			`m_lastRegexUtf8string = s;`
			`}`
			`else`
			`{ // Ansi`
			`m_charre = charregex_t(s, static_cast<regex_constants::syntax_option_type>(compileFlags));`
			`m_lastRegexString = s;`
			`}`
			`lastCompileFlags = compileFlags;`
			`}`
			`}`

			`catch(regex_error& /ex/)`
			`{`
			`// -1 is normally used for not found, -2 is used here for invalid regex`
			`return -2;`
			`}`

			`// Work out the range of lines we're searching across, moving beyond an empty end-of-line`
			`int lineRangeStart = doc->LineFromPosition(startPos);`
			`int lineRangeEnd = doc->LineFromPosition(endPos);`
			`if ((increment == 1) &&`
			`(startPos >= doc->LineEnd(lineRangeStart)) &&`
			`(lineRangeStart < lineRangeEnd))`
			`{`
			`// the start position is at end of line or between line end characters.`
			`lineRangeStart++;`
			`startPos = doc->LineStart(lineRangeStart);`
			`}`

			`regex_constants::match_flag_type flags(regex_constants::match_default);`



			`// Work out the flags:`
			`if (startPos != doc->LineStart(lineRangeStart))`
			`{`
			`flags \|= regex_constants::match_not_bol;`
			`}`

			`if (endPos != doc->LineEnd(lineRangeEnd))`
			`{`
			`flags \|= regex_constants::match_not_eol;`
			`}`

			`if (0 == (searchFlags & SCFIND_REGEXP_DOTMATCHESNL))`
			`{`
			`flags \|= regex_constants::match_not_dot_newline;`
			`}`

			`int pos(-1);`
			`int lenRet(0);`


			`if (doc->CodePage() == SC_CP_UTF8)`
			`{`
			`UTF8DocumentIterator end(doc, endPos, endPos);`
			`bool success = boost::regex_search(UTF8DocumentIterator(doc, startPos, endPos), end, m_utf8match, m_wcharre, flags);`
			`if (success)`
			`{`
			`pos = m_utf8match[0].first.pos();`
			`lenRet = m_utf8match[0].second.pos() - pos;`

			`if (increment == -1)`
			`{`
			`// Check for the last match on this line.`
			`int repetitions = 100; // Break out of infinite loop`
			`int previousPos = pos;`
			`while (success && ((pos + lenRet) <= endPos))`
			`{`
			`if (previousPos >= pos && 0 >= (--repetitions))`
			`break;`
			`previousPos = pos;`
			`success = regex_search(UTF8DocumentIterator(doc, pos + 1, endPos), end, m_utf8match, m_wcharre, flags);`
			`// success = regex_search(DocumentIterator(doc, pos + 1, endPos), end, match, re, static_cast<regex_constants::match_flag_type>(flags));`
			`if (success)`
			`{`
			`if ((pos + lenRet) <= minPos)`
			`{`
			`pos = m_utf8match[0].first.pos();`
			`lenRet = m_utf8match[0].second.pos() - pos;`
			`}`
			`else`
			`{`
			`success = 0;`
			`}`
			`}`
			`}`
			`}`

			`*length = lenRet;`
			`}`
			`}`
			`else`
			`{`
			`AnsiDocumentIterator end(doc, endPos, endPos);`

			`bool success = boost::regex_search(AnsiDocumentIterator(doc, startPos, endPos), end, m_ansimatch, m_charre, flags);`
			`if (success)`
			`{`
			`pos = m_ansimatch[0].first.pos();`
			`lenRet = m_ansimatch.length();`

			`if (increment == -1)`
			`{`
			`// Check for the last match on this line.`
			`int repetitions = 100; // Break out of infinite loop`
			`int previousPos = pos;`
			`while (success && ((pos + lenRet) <= endPos))`
			`{`
			`if (previousPos >= pos && 0 >= (--repetitions))`
			`break;`
			`previousPos = pos;`
			`success = regex_search(AnsiDocumentIterator(doc, pos + 1, endPos), end, m_ansimatch, m_charre, flags);`
			`// success = regex_search(DocumentIterator(doc, pos + 1, endPos), end, match, re, static_cast<regex_constants::match_flag_type>(flags));`
			`if (success)`
			`{`
			`if ((pos + lenRet) <= minPos)`
			`{`
			`pos = m_ansimatch[0].first.pos();`
			`lenRet = m_ansimatch[0].length();`
			`}`
			`else`
			`{`
			`success = 0;`
			`}`
			`}`
			`}`
			`}`

			`*length = lenRet;`
			`}`
			`}`

			`return pos;`
			`}`


			`const char BoostRegexSearch::SubstituteByPosition(Document doc, const char text, int length) {`
			`delete []substituted;`
			`substituted = NULL;`
			`if (doc->CodePage() == SC_CP_UTF8)`
			`{`
			`const wchar_t* wtext = utf8ToWchar(text);`
			`std::wstring replaced = m_utf8match.format(wtext, boost::format_all);`
			`delete[] wtext;`
			`substituted = wcharToUtf8(replaced.c_str());`
			`*length = strlen(substituted);`
			`}`
			`else`
			`{`
			`std::string replaced = m_ansimatch.format(text, boost::format_all);`
			`*length = replaced.size();`
			`substituted = new char[*length + 1];`
			`strcpy(substituted, replaced.c_str());`
			`}`
			`return substituted;`
			`}`

			`wchar_t BoostRegexSearch::utf8ToWchar(const char utf8)`
			`{`
			`int utf8Size = strlen(utf8);`
			`int wcharSize = UTF16Length(utf8, utf8Size);`
			`wchar_t *w = new wchar_t[wcharSize + 1];`
			`UTF16FromUTF8(utf8, utf8Size, w, wcharSize + 1);`
			`w[wcharSize] = 0;`

			`return w;`
			`}`

			`char* BoostRegexSearch::wcharToUtf8(const wchar_t *w)`
			`{`
			`int wcharSize = wcslen(w);`
			`int charSize = UTF8Length(w, wcharSize);`
			`char *c = new char[charSize + 1];`
			`UTF8FromUTF16(w, wcharSize, c, charSize);`
			`c[charSize] = 0;`
			`return c;`
			`}`