Add language auto-detection for php, xml, html and bash
Check the beginning of the file content to detect which programming language it is written in. The detected language type is applied only if PHP, XML, HTML or bash is detected. The language type is first determined from the file extension; if the file extension is unknown, or the language type determined from it differs from the detected value, then the language type detected from the file content is used.
This commit is contained in:
parent
69a57e5e8d
commit
9b91480ecf
BIN
PowerEditor/bin/Nppold.exe
Normal file
BIN
PowerEditor/bin/Nppold.exe
Normal file
Binary file not shown.
@ -26,6 +26,7 @@
|
||||
// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
|
||||
#include <deque>
|
||||
#include <algorithm>
|
||||
#include <time.h>
|
||||
#include <sys/stat.h>
|
||||
#include "Buffer.h"
|
||||
@ -592,8 +593,8 @@ BufferID FileManager::loadFile(const TCHAR * filename, Document doc, int encodin
|
||||
|
||||
char data[blockSize + 8]; // +8 for incomplete multibyte char
|
||||
FormatType bkformat = FormatType::unknown;
|
||||
|
||||
bool res = loadFileData(doc, backupFileName?backupFileName:fullpath, data, &UnicodeConvertor, L_TEXT, encoding, &bkformat);
|
||||
LangType detectedLang = L_TEXT;
|
||||
bool res = loadFileData(doc, backupFileName ? backupFileName : fullpath, data, &UnicodeConvertor, detectedLang, encoding, &bkformat);
|
||||
if (res)
|
||||
{
|
||||
Buffer* newBuf = new Buffer(this, _nextBufferID, doc, DOC_REGULAR, fullpath);
|
||||
@ -620,6 +621,11 @@ BufferID FileManager::loadFile(const TCHAR * filename, Document doc, int encodin
|
||||
buf->setUnicodeMode(ndds._unicodeMode);
|
||||
buf->setEncoding(-1);
|
||||
|
||||
// if a language has been detected, and the detected value is different from the file extension,
|
||||
// we use the detected value
|
||||
if (detectedLang != L_TEXT && detectedLang != buf->getLangType())
|
||||
buf->setLangType(detectedLang);
|
||||
|
||||
if (encoding == -1)
|
||||
{
|
||||
// 3 formats : WIN_FORMAT, UNIX_FORMAT and MAC_FORMAT
|
||||
@ -667,8 +673,9 @@ bool FileManager::reloadBuffer(BufferID id)
|
||||
int encoding = buf->getEncoding();
|
||||
char data[blockSize + 8]; // +8 for incomplete multibyte char
|
||||
FormatType bkformat;
|
||||
LangType lang = buf->getLangType();
|
||||
|
||||
bool res = loadFileData(doc, buf->getFullPathName(), data, &UnicodeConvertor, buf->getLangType(), encoding, &bkformat);
|
||||
bool res = loadFileData(doc, buf->getFullPathName(), data, &UnicodeConvertor, lang, encoding, &bkformat);
|
||||
buf->_canNotify = true;
|
||||
|
||||
if (res)
|
||||
@ -1245,8 +1252,60 @@ int FileManager::detectCodepage(char* buf, size_t len)
|
||||
return codepage;
|
||||
}
|
||||
|
||||
// Guesses the language of a file from the first non-blank bytes of its content.
//
// Leading spaces, tabs, line feeds and carriage returns are skipped, then the
// remaining bytes are compared (case-sensitively) against a few known headers:
//   "#!/bin/sh"                   -> L_BASH
//   "<?php "                      -> L_PHP
//   "<?xml "                      -> L_XML
//   "<!DOCTYPE html>" or "<html>" -> L_HTML
//
// data    : buffer holding the beginning of the file content.
// dataLen : number of valid bytes in data; no byte at or beyond data + dataLen is read.
//
// Returns the detected language, or L_TEXT when no known header is found
// (including when data is null, empty, or made only of blanks).
LangType FileManager::detectLanguageFromTextBegining(const unsigned char *data, unsigned int dataLen)
{
	if (data == nullptr || dataLen == 0)
		return L_TEXT;

	const std::string bashHeader = "#!/bin/sh";
	const std::string phpHeader = "<?php ";
	const std::string xmlHeader = "<?xml ";
	const std::string htmlHeader1 = "<!DOCTYPE html>";
	const std::string htmlHeader2 = "<html>";

	// Skip the blanks which may precede the header
	size_t i = 0;
	while (i < dataLen && (data[i] == ' ' || data[i] == '\t' || data[i] == '\n' || data[i] == '\r'))
		++i;

	// Only the bytes in [begin, begin + available) belong to the caller's data
	const char *begin = reinterpret_cast<const char *>(data) + i;
	const size_t available = dataLen - i;

	// A header longer than the remaining data cannot match; testing the length
	// first guarantees that the comparison never reads past the end of the data.
	auto startsWith = [begin, available](const std::string & header) -> bool
	{
		return header.length() <= available &&
			header.compare(0, std::string::npos, begin, header.length()) == 0;
	};

	if (startsWith(bashHeader))
		return L_BASH;

	if (startsWith(phpHeader))
		return L_PHP;

	if (startsWith(xmlHeader))
		return L_XML;

	if (startsWith(htmlHeader1) || startsWith(htmlHeader2))
		return L_HTML;

	return L_TEXT;
}
|
||||
|
||||
inline bool FileManager::loadFileData(Document doc, const TCHAR * filename, char* data, Utf8_16_Read * UnicodeConvertor,
|
||||
LangType language, int & encoding, FormatType* pFormat)
|
||||
LangType & language, int & encoding, FormatType* pFormat)
|
||||
{
|
||||
FILE *fp = generic_fopen(filename, TEXT("rb"));
|
||||
if (!fp)
|
||||
@ -1319,9 +1378,9 @@ inline bool FileManager::loadFileData(Document doc, const TCHAR * filename, char
|
||||
lenFile = fread(data+incompleteMultibyteChar, 1, blockSize-incompleteMultibyteChar, fp) + incompleteMultibyteChar;
|
||||
if (lenFile == 0) break;
|
||||
|
||||
// check if file contain any BOM
|
||||
if (isFirstTime)
|
||||
{
|
||||
// check if file contain any BOM
|
||||
if (Utf8_16_Read::determineEncoding((unsigned char *)data, lenFile) != uni8Bit)
|
||||
{
|
||||
// if file contains any BOM, then encoding will be erased,
|
||||
@ -1333,6 +1392,13 @@ inline bool FileManager::loadFileData(Document doc, const TCHAR * filename, char
|
||||
if (NppParameters::getInstance()->getNppGUI()._detectEncoding)
|
||||
encoding = detectCodepage(data, lenFile);
|
||||
}
|
||||
|
||||
if (language == L_TEXT)
|
||||
{
|
||||
// check the language of the file
|
||||
language = detectLanguageFromTextBegining((unsigned char *)data, lenFile);
|
||||
}
|
||||
|
||||
isFirstTime = false;
|
||||
}
|
||||
|
||||
|
@ -111,7 +111,8 @@ public:
|
||||
private:
|
||||
~FileManager();
|
||||
int detectCodepage(char* buf, size_t len);
|
||||
bool loadFileData(Document doc, const TCHAR* filename, char* buffer, Utf8_16_Read* UnicodeConvertor, LangType language, int& encoding, FormatType* pFormat = nullptr);
|
||||
bool loadFileData(Document doc, const TCHAR* filename, char* buffer, Utf8_16_Read* UnicodeConvertor, LangType & language, int& encoding, FormatType* pFormat = nullptr);
|
||||
LangType detectLanguageFromTextBegining(const unsigned char *data, unsigned int dataLen);
|
||||
|
||||
|
||||
private:
|
||||
|
Loading…
Reference in New Issue
Block a user