global unparseUrl
def unparseUrl(parsedUrl):
    '''Rebuild a URL string from a dict shaped like the result of parseUrl.

    parsedUrl must provide the keys "protocol", "domain", "path",
    "parameters" and "anchor". "parameters" is a sequence of
    [key, value] pairs; a value of None produces a bare key with no "=".

    The domain is only written when a protocol is present, matching what
    parseUrl produces (it never fills "domain" without "protocol").

    Returns the assembled URL as a string.
    '''
    url = ""
    if parsedUrl["protocol"] != "":
        url = parsedUrl["protocol"] + "://" + parsedUrl["domain"]

    if parsedUrl["path"] != "":
        if url != "":
            # parseUrl strips the slash separating domain and path.
            url += "/" + parsedUrl["path"]
        else:
            url = parsedUrl["path"]

    # Build "key" or "key=value" items, then join them once. The query
    # string is introduced by "?" and its items are separated by "&".
    query = []
    for parameter in parsedUrl["parameters"]:
        if parameter[1] is not None:
            query.append(parameter[0] + "=" + parameter[1])
        else:
            query.append(parameter[0])
    if query:
        url += "?" + "&".join(query)

    if parsedUrl["anchor"] != "":
        url += "#" + parsedUrl["anchor"]
    return url


global parseUrl
def parseUrl(url):
    '''Split a URL string into its components.

    Returns a dict with the keys:
      "protocol"   - text before the first ":" when that colon appears
                     before any "/", otherwise "".
      "domain"     - first path segment after the protocol, "" when there
                     is no protocol.
      "path"       - remainder after the domain (without the separating
                     "/"), or the whole remaining URL when there is no
                     protocol.
      "parameters" - list of [key, value] lists taken from the query
                     string; value is None when the item has no "=".
      "anchor"     - text after the first "#", "" when absent.
    '''
    out = {
        "protocol": "",
        "domain": "",
        "path": "",
        "parameters": [],
        "anchor": ""
    }

    # The anchor is removed first so that "?" or ":" inside it is ignored.
    url, _, out["anchor"] = url.partition("#")

    # Query string: everything after the first "?".
    url, hasQuery, query = url.partition("?")
    if hasQuery:
        for arg in query.split("&"):
            argKey, hasValue, argValue = arg.partition("=")
            out["parameters"].append([argKey, argValue if hasValue else None])

    # A protocol is present only when a ":" occurs before the first "/".
    colon = url.find(":")
    slash = url.find("/")
    hasProtocol = colon != -1 and (slash == -1 or colon < slash)

    if not hasProtocol:
        out["path"] = url
        return out

    out["protocol"] = url[:colon]
    # Drop the slashes that follow "protocol:" before reading the domain.
    rest = url[colon + 1:].lstrip("/")
    out["domain"], _, out["path"] = rest.partition("/")
    return out


global urlJoin
def urlJoin(*args):
    '''Join URL fragments with single "/" separators.

    The first argument is used verbatim. Every following argument has all
    of its leading and trailing "/" characters removed and is appended
    after a "/". Called without arguments, returns "".
    '''
    if not args:
        return ""
    outUrl = args[0]
    for arg in args[1:]:
        outUrl = outUrl + "/" + arg.strip("/")
    return outUrl


global infoFetcher
def infoFetcher(info):
    # NOTE(review): the statements below are carried over unchanged; only the
    # layout was restored. The first string literal is disabled code.
    ''' if "Content-Base" in info["headers"]: info["baseUrl"] = info["headers"]["Content-Base"] return elif "Content-Location" in info["headers"]: info["baseUrl"] = "/" return '''
    ''' 
https://www.w3.org/TR/WD-html40-970917/htmlweb.html User agents should calculate the base URL for resolving relative URLs according to the [RFC1808]. The following is a summary of how [RFC1808] applies to HTML. User agents should calculate the base URL according to the following precedences (highest priority to lowest): 1. The base URL is set by the BASE element. (TO BE IMPLEMENTED) 2. The base URL is given by an HTTP header (see [RFC2068]). (TO BE IMPLEMENTED) 3. By default, the base URL is that of the current document. (TO BE IMPLEMENTED) Additionally, the OBJECT and APPLET elements define attributes that take precedence over the value set by the BASE element. Please consult the definitions of these elements for more information about URL issues specific to them. '''
    ''' https://tools.ietf.org/html/rfc2068 14.11 Content-Base The Content-Base entity-header field may be used to specify the base URI for resolving relative URLs within the entity. This header field is described as Base in RFC 1808, which is expected to be revised. Content-Base = "Content-Base" ":" absoluteURI If no Content-Base field is present, the base URI of an entity is defined either by its Content-Location (if that Content-Location URI is an absolute URI) or the URI used to initiate the request, in that order of precedence. Note, however, that the base URI of the contents within the entity-body may be redefined within that entity-body. 14.15 Content-Location The Content-Location entity-header field may be used to supply the resource location for the entity enclosed in the message. In the case where a resource has multiple entities associated with it, and those entities actually have separate locations by which they might be individually accessed, the server should provide a Content-Location for the particular variant which is returned. In addition, a server SHOULD provide a Content-Location for the resource corresponding to the response entity. 
Content-Location = "Content-Location" ":" ( absoluteURI | relativeURI ) If no Content-Base header field is present, the value of Content- Location also defines the base URL for the entity (see section 14.11). The Content-Location value is not a replacement for the original requested URI; it is only a statement of the location of the resource corresponding to this particular entity at the time of the request. Future requests MAY use the Content-Location URI if the desire is to identify the source of that particular entity. A cache cannot assume that an entity with a Content-Location different from the URI used to retrieve it can be used to respond to later requests on that Content-Location URI. However, the Content- Location can be used to differentiate between multiple entities retrieved from a single requested resource, as described in section 13.6. If the Content-Location is a relative URI, the URI is interpreted relative to any Content-Base URI provided in the response. If no Content-Base is provided, the relative URI is interpreted relative to the Request-URI. '''