Support for Content-Type and different encodings, no longer trying to render non-text files

2021-03-25 20:34:16 +01:00 · 2021-03-25 20:34:16 +01:00 · 3f0b844aa9
commit 3f0b844aa9
parent 5e2f94678c
4 changed files with 61 additions and 3 deletions
--- a/BirdyNet.py
+++ b/BirdyNet.py
@@ -185,7 +185,19 @@ class browserWindow(QMainWindow):
 		self.cStatusBar.showMessage("Rendering...")
 		self.cStatusBar.repaint()
 		start = time.time()
-		self.cDoc.cRenderHtml(response["body"].decode("utf-8",errors="ignore"))
+		
+		htm = response["body"]
+		contentType, contentTypeArguments = getContentType(response["headers"],"text")
+		if not "charset" in contentTypeArguments: contentTypeArguments["charset"] = "utf-8"
+		print("content-type: " +contentType+ "\n" +prettyJson(contentTypeArguments))
+		
+		try:
+			htm = htm.decode(contentTypeArguments["charset"],errors="ignore")
+		except Exception as e:
+			print("decoding html as '" +contentTypeArguments["charset"]+ "' failed, trying utf-8...")
+			htm = htm.decode("utf-8",errors="ignore")
+		
+		self.cDoc.cRenderHtml(htm,contentType)
 		end = time.time()
 		print("Rendering page: " +str(end - start))
 		self.cStatusBar.showMessage("Ready")
--- a/addons/0.documentViewer.QTextBrowser.py
+++ b/addons/0.documentViewer.QTextBrowser.py
@@ -54,7 +54,13 @@ class browserDoc(QTextBrowser):
 			if curTag in self.blackList: return
 			self.output += html.escape(data)
 	
-	def cRenderHtml(self,htm):
+	def cRenderHtml(self,htm,contentType):
+		if contentType != "text/html":
+			self.clear()
+			self.insertHtml("<html><body><pre>" +html.escape(htm)+ "</pre></body></html>")
+			self.update()
+			return
+		
 		parser = self.cHtmlParser()
 		parser.feed(htm)
 		
--- a/addons/0.pageDownloader.py
+++ b/addons/0.pageDownloader.py
@@ -25,12 +25,21 @@ def downloadPage(window,downloadId,url,headers = False):
 	
 	try:
 		requestHandler = opener.open(url)
+		
+		# Check whether to transfer the page to the viewer, or if to open downloader
 		response["headers"] = requestHandler.getheaders()
+		contentType, contentTypeArguments = getContentType(response["headers"],"application/octet-stream")
+		if not contentType.startswith("text/"): # Make the fileDownloader handle the request instead
+			return
+		
 		response["body"] = requestHandler.read()
+		requestHandler.close()
 	except urllib.error.HTTPError as e:
 		response["body"] = e.read()
+		requestHandler.close()
 	except Exception as e:
-		response["body"] = html.escape(str(e)).encode("utf-8")
+		response["body"] = str(e).encode("utf-8")
+		response["headers"] = [["content-type","text; charset=utf-8"]]
 	
 	browserWindowsLock.acquire()
 	if not window in browserWindows:
--- a/addons/0.utils.py
+++ b/addons/0.utils.py
@@ -102,6 +102,37 @@ def urlJoin(*args):
 	
 	return outUrl

+global getContentType
+def getContentType(headers,fallback):
+	contentType = fallback
+	for header in headers:
+		if header[0].lower() == "content-type":
+			contentType = header[1].lower()
+	
+	contentTypeSplit = contentType.split(";")
+	index = 0
+	length = len(contentTypeSplit)
+	while index < length:
+		s = contentTypeSplit[index]
+		while len(s) > 0 and s[0] == " ": s = s[1:]
+		while len(s) > 0 and s[-1] == " ": s = s[:-1]
+		contentTypeSplit[index] = s
+		index += 1
+	
+	contentType = contentTypeSplit.pop(0)
+	contentTypeArguments = {}
+	for arg in contentTypeSplit:
+		argSplit = arg.split("=",1)
+		if len(argSplit) < 2:
+			argSplit.append("")
+		while len(argSplit[0]) > 0 and argSplit[0][0] == " ": argSplit[0] = argSplit[0][1:]
+		while len(argSplit[0]) > 0 and argSplit[0][-1] == " ": argSplit[0] = argSplit[0][:-1]
+		while len(argSplit[1]) > 0 and argSplit[1][0] == " ": argSplit[1] = argSplit[1][1:]
+		while len(argSplit[1]) > 0 and argSplit[1][-1] == " ": argSplit[1] = argSplit[1][:-1]
+		contentTypeArguments[argSplit[0]] = argSplit[1]
+	
+	return contentType, contentTypeArguments
+
 global infoFetcher
 def infoFetcher(info):
 	''' if "Content-Base" in info["headers"]: