"Proper" HTML parsing for whitelists and interpretation

2021-03-19 19:14:02 +01:00 · 2021-03-19 19:14:02 +01:00 · a6bb0964cd
commit a6bb0964cd
parent ad811cc9fc
1 changed files with 21 additions and 20 deletions
--- a/addons/0.documentViewer.QTextBrowser.py
+++ b/addons/0.documentViewer.QTextBrowser.py
@ -23,37 +23,38 @@ class browserDoc(QTextBrowser):
 		def __init__(self):
 			HTMLParser.__init__(self)
 			self.output = ""
+			self.voidElements = ["area","base","br","col","hr","img","input","link","meta","param","command","keygen","source"]
+			self.blackList = ["img","script","style"]
+			self.tagDir = []
 		
 		def handle_starttag(self,tag,attrs):
-			if tag == "img":
-				altText = False
-				for attr in attrs:
-					if attr[0] == "alt":
-						altText = html.escape(attr[1])
-						break
-				
-				for attr in attrs:
-					if attr[0] == "src":
-						url = attr[1]
-						if not altText: altText = url.rsplit("/")[-1]
-						self.output += 'img:<a href="' +html.escape(url)+ '">' +html.escape(altText)+ '</a>'
-						return
-				
-				return
+			if not tag in self.voidElements:
+				self.tagDir.append(tag)
 			
-			self.output += "<" +tag
+			if tag in self.blackList: return
+			
+			self.output += "<" +html.escape(tag)
 			for attr in attrs:
+				self.output += " " +html.escape(attr[0])
 				if attr[1] != None:
-					self.output += ' ' +html.escape(attr[0])+ '="' +html.escape(attr[1])+ '"'
-				else:
-					self.output += ' ' +html.escape(attr[0])
+					self.output += '="' +html.escape(attr[1])+ '"'
 			self.output += ">"
 		
 		def handle_endtag(self,tag):
-			if tag in ["img"]: return
+			if not tag in self.voidElements:
+				index = len(self.tagDir) - 1
+				while index >= 0:
+					if self.tagDir[index] == tag:
+						self.tagDir = self.tagDir[index + 1:]
+						break
+					index -= 1
 			self.output += "</" +html.escape(tag)+ ">"
 		
 		def handle_data(self,data):
+			curTag = ""
+			if len(self.tagDir) > 0:
+				curTag = self.tagDir[-1]
+			if curTag in self.blackList: return
 			self.output += data
 	
 	def cRenderHtml(self,htm):