"Proper" HTML parsing for whitelists and interpretation

This commit is contained in:
Fierelier 2021-03-19 19:14:02 +01:00
parent ad811cc9fc
commit a6bb0964cd
1 changed file with 21 additions and 20 deletions

View File

@ -23,37 +23,38 @@ class browserDoc(QTextBrowser):
# Initialise the underlying HTMLParser and the state the handler methods use.
def __init__(self):
HTMLParser.__init__(self)
# Accumulates the rewritten HTML text produced by the handler methods.
self.output = ""
# Tag names that handle_starttag / handle_endtag never push onto or pop from tagDir.
self.voidElements = ["area","base","br","col","hr","img","input","link","meta","param","command","keygen","source"]
# Tag names checked by handle_starttag and handle_data to suppress output.
self.blackList = ["img","script","style"]
# Stack of currently open non-void tags; the innermost tag is the last item.
self.tagDir = []
def handle_starttag(self, tag, attrs):
    """Append the sanitized form of an opening tag to ``self.output``.

    ``<img>`` is rewritten as a text link (``img:<a href=...>``) pointing at
    its ``src`` and labelled with the ``alt`` text or, when that is missing
    or empty, the last path segment of the URL.  An ``<img>`` without a
    usable ``src`` produces no output.

    Every other non-void tag is pushed onto ``self.tagDir``.  Blacklisted
    tags are then dropped; the rest are re-emitted exactly once with the
    tag name, attribute names and attribute values HTML-escaped.

    Args:
        tag: Name of the tag being opened.
        attrs: List of ``(name, value)`` pairs; ``value`` is ``None`` for
            an attribute written without a value.
    """
    if tag == "img":
        # Only the first occurrence of each attribute is considered.
        alt_text = next((value for name, value in attrs if name == "alt"), None)
        url = next((value for name, value in attrs if name == "src"), None)
        if url is None:
            # Missing or valueless src: there is nothing to link to.
            return
        if not alt_text:
            alt_text = url.rsplit("/", 1)[-1]
        # Escape exactly once here; escaping the alt text when it is read
        # and again on output would turn "&" into "&amp;amp;".
        self.output += (
            'img:<a href="' + html.escape(url) + '">'
            + html.escape(alt_text) + '</a>'
        )
        return

    # Track the tag before the blacklist check so handle_data can tell that
    # it is inside a blacklisted element.
    if tag not in self.voidElements:
        self.tagDir.append(tag)
    if tag in self.blackList:
        return

    parts = ["<" + html.escape(tag)]
    for name, value in attrs:
        if value is None:
            # Valueless attribute such as <input disabled>.
            parts.append(" " + html.escape(name))
        else:
            parts.append(" " + html.escape(name) + '="' + html.escape(value) + '"')
    parts.append(">")
    self.output += "".join(parts)
def handle_endtag(self, tag):
    """Close ``tag``: unwind the open-tag stack and emit the end tag.

    For a non-void tag, ``self.tagDir`` is searched from the innermost
    entry outwards for the nearest matching open tag.  That entry and
    everything opened inside it are removed; the enclosing tags stay on
    the stack.  If no entry matches, the stack is left untouched.

    The escaped end tag is appended to ``self.output`` unless the tag is
    ``img`` or is blacklisted, because handle_starttag emits no opening tag
    for those.

    Args:
        tag: Name of the tag being closed.
    """
    if tag not in self.voidElements:
        for index in range(len(self.tagDir) - 1, -1, -1):
            if self.tagDir[index] == tag:
                # Keep the ancestors (everything before the match); slicing
                # from index + 1 would keep the descendants instead.
                self.tagDir = self.tagDir[:index]
                break

    if tag == "img" or tag in self.blackList:
        return
    self.output += "</" + html.escape(tag) + ">"
def handle_data(self, data):
    """Append a run of text to ``self.output`` unless it is blacklisted.

    Text is dropped when the innermost open tag (last entry of
    ``self.tagDir``) is in ``self.blackList``.  Otherwise it is appended
    HTML-escaped.  The parser is initialised with HTMLParser's default
    arguments, which on Python 3.5+ means character references arrive here
    already decoded; writing them back verbatim would turn
    ``&lt;script&gt;`` in the input into live markup in the output.

    Args:
        data: Text content reported by the parser.
    """
    current_tag = self.tagDir[-1] if self.tagDir else ""
    if current_tag in self.blackList:
        return
    if current_tag in ("script", "style"):
        # Only reachable if these tags are removed from the blacklist.
        # HTMLParser does not decode character references inside them, so
        # the text is passed through unchanged.
        self.output += data
        return
    # quote=False: quotes are harmless in text content.
    self.output += html.escape(data, quote=False)
def cRenderHtml(self,htm):