Add language auto-detection for php, xml, html and bash

Check the beginning of the file content to determine which programming language
the content is written in.
The detected lang type will be applied only if php, xml, html or bash is
detected.
The language type is first determined via the file extension; if the file
extension is unknown or the determined lang type is different from
the detected value, then the lang type detected via the file content
will be used.
This commit is contained in:
Don Ho 2015-09-16 09:08:53 +02:00
parent 69a57e5e8d
commit 9b91480ecf
3 changed files with 73 additions and 6 deletions

BIN
PowerEditor/bin/Nppold.exe Normal file

Binary file not shown.

View File

@ -26,6 +26,7 @@
// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#include <deque> #include <deque>
#include <algorithm>
#include <time.h> #include <time.h>
#include <sys/stat.h> #include <sys/stat.h>
#include "Buffer.h" #include "Buffer.h"
@ -592,8 +593,8 @@ BufferID FileManager::loadFile(const TCHAR * filename, Document doc, int encodin
char data[blockSize + 8]; // +8 for incomplete multibyte char char data[blockSize + 8]; // +8 for incomplete multibyte char
FormatType bkformat = FormatType::unknown; FormatType bkformat = FormatType::unknown;
LangType detectedLang = L_TEXT;
bool res = loadFileData(doc, backupFileName?backupFileName:fullpath, data, &UnicodeConvertor, L_TEXT, encoding, &bkformat); bool res = loadFileData(doc, backupFileName ? backupFileName : fullpath, data, &UnicodeConvertor, detectedLang, encoding, &bkformat);
if (res) if (res)
{ {
Buffer* newBuf = new Buffer(this, _nextBufferID, doc, DOC_REGULAR, fullpath); Buffer* newBuf = new Buffer(this, _nextBufferID, doc, DOC_REGULAR, fullpath);
@ -620,6 +621,11 @@ BufferID FileManager::loadFile(const TCHAR * filename, Document doc, int encodin
buf->setUnicodeMode(ndds._unicodeMode); buf->setUnicodeMode(ndds._unicodeMode);
buf->setEncoding(-1); buf->setEncoding(-1);
// if a language has been detected, and the detected value is different from the file extension,
// we use the detected value
if (detectedLang != L_TEXT && detectedLang != buf->getLangType())
buf->setLangType(detectedLang);
if (encoding == -1) if (encoding == -1)
{ {
// 3 formats : WIN_FORMAT, UNIX_FORMAT and MAC_FORMAT // 3 formats : WIN_FORMAT, UNIX_FORMAT and MAC_FORMAT
@ -667,8 +673,9 @@ bool FileManager::reloadBuffer(BufferID id)
int encoding = buf->getEncoding(); int encoding = buf->getEncoding();
char data[blockSize + 8]; // +8 for incomplete multibyte char char data[blockSize + 8]; // +8 for incomplete multibyte char
FormatType bkformat; FormatType bkformat;
LangType lang = buf->getLangType();
bool res = loadFileData(doc, buf->getFullPathName(), data, &UnicodeConvertor, buf->getLangType(), encoding, &bkformat); bool res = loadFileData(doc, buf->getFullPathName(), data, &UnicodeConvertor, lang, encoding, &bkformat);
buf->_canNotify = true; buf->_canNotify = true;
if (res) if (res)
@ -1245,8 +1252,60 @@ int FileManager::detectCodepage(char* buf, size_t len)
return codepage; return codepage;
} }
// Guesses the language of a file from the first non-blank bytes of its content.
//
// Leading spaces, tabs, CR and LF are skipped, then the following bytes are
// compared (case-sensitively) against a fixed set of well-known headers:
//   "#!/bin/sh"        -> L_BASH
//   "<?php "           -> L_PHP
//   "<?xml "           -> L_XML
//   "<!DOCTYPE html>"  -> L_HTML
//   "<html>"           -> L_HTML
//
// data    : beginning of the file content (not required to be NUL-terminated).
// dataLen : number of valid bytes in data.
// Returns the detected language, or L_TEXT when no header matches (including
// empty, all-blank or null input).
LangType FileManager::detectLanguageFromTextBegining(const unsigned char *data, unsigned int dataLen)
{
	if (data == nullptr)
		return L_TEXT;

	const std::string bashHeader = "#!/bin/sh";        // length : 9
	const std::string phpHeader = "<?php ";            // length : 6
	const std::string xmlHeader = "<?xml ";            // length : 6
	const std::string htmlHeader1 = "<!DOCTYPE html>"; // length : 15
	const std::string htmlHeader2 = "<html>";          // length : 6
	const size_t longestLength = htmlHeader1.length(); // the html doctype is the longest header

	// Skip the leading blank characters.
	size_t i = 0;
	while (i < dataLen && (data[i] == ' ' || data[i] == '\t' || data[i] == '\n' || data[i] == '\r'))
		++i;

	// Never copy more bytes than the caller actually provided: the remaining
	// content may be shorter than the longest header (or even empty).
	// (A plain conditional is used instead of std::min to stay clear of the
	// min/max macros that Windows headers may define.)
	const size_t remaining = dataLen - i;
	const size_t testLen = remaining < longestLength ? remaining : longestLength;
	const std::string buf2Test(reinterpret_cast<const char *>(data) + i, testLen);

	// True when buf2Test begins with header. compare() clamps the range to the
	// size of buf2Test, so a too-short buffer simply does not match.
	auto startsWith = [&buf2Test](const std::string & header) -> bool
	{
		return buf2Test.compare(0, header.length(), header) == 0;
	};

	if (startsWith(bashHeader))
		return L_BASH;

	if (startsWith(phpHeader))
		return L_PHP;

	if (startsWith(xmlHeader))
		return L_XML;

	if (startsWith(htmlHeader1) || startsWith(htmlHeader2))
		return L_HTML;

	return L_TEXT;
}
inline bool FileManager::loadFileData(Document doc, const TCHAR * filename, char* data, Utf8_16_Read * UnicodeConvertor, inline bool FileManager::loadFileData(Document doc, const TCHAR * filename, char* data, Utf8_16_Read * UnicodeConvertor,
LangType language, int & encoding, FormatType* pFormat) LangType & language, int & encoding, FormatType* pFormat)
{ {
FILE *fp = generic_fopen(filename, TEXT("rb")); FILE *fp = generic_fopen(filename, TEXT("rb"));
if (!fp) if (!fp)
@ -1319,9 +1378,9 @@ inline bool FileManager::loadFileData(Document doc, const TCHAR * filename, char
lenFile = fread(data+incompleteMultibyteChar, 1, blockSize-incompleteMultibyteChar, fp) + incompleteMultibyteChar; lenFile = fread(data+incompleteMultibyteChar, 1, blockSize-incompleteMultibyteChar, fp) + incompleteMultibyteChar;
if (lenFile == 0) break; if (lenFile == 0) break;
// check if file contain any BOM
if (isFirstTime) if (isFirstTime)
{ {
// check if file contain any BOM
if (Utf8_16_Read::determineEncoding((unsigned char *)data, lenFile) != uni8Bit) if (Utf8_16_Read::determineEncoding((unsigned char *)data, lenFile) != uni8Bit)
{ {
// if file contains any BOM, then encoding will be erased, // if file contains any BOM, then encoding will be erased,
@ -1333,6 +1392,13 @@ inline bool FileManager::loadFileData(Document doc, const TCHAR * filename, char
if (NppParameters::getInstance()->getNppGUI()._detectEncoding) if (NppParameters::getInstance()->getNppGUI()._detectEncoding)
encoding = detectCodepage(data, lenFile); encoding = detectCodepage(data, lenFile);
} }
if (language == L_TEXT)
{
// detect the language of the file from the beginning of its content
language = detectLanguageFromTextBegining((unsigned char *)data, lenFile);
}
isFirstTime = false; isFirstTime = false;
} }

View File

@ -111,7 +111,8 @@ public:
private: private:
~FileManager(); ~FileManager();
int detectCodepage(char* buf, size_t len); int detectCodepage(char* buf, size_t len);
bool loadFileData(Document doc, const TCHAR* filename, char* buffer, Utf8_16_Read* UnicodeConvertor, LangType language, int& encoding, FormatType* pFormat = nullptr); bool loadFileData(Document doc, const TCHAR* filename, char* buffer, Utf8_16_Read* UnicodeConvertor, LangType & language, int& encoding, FormatType* pFormat = nullptr);
LangType detectLanguageFromTextBegining(const unsigned char *data, unsigned int dataLen);
private: private: