# Python source file: 174 lines, 5.2 KiB
global unparseUrl


def unparseUrl(parsedUrl):
    """Rebuild a URL string from the dict layout produced by ``parseUrl``.

    Args:
        parsedUrl: Mapping with the keys ``"protocol"``, ``"domain"``,
            ``"path"`` and ``"anchor"`` (strings, empty when absent) and
            ``"parameters"`` (an iterable of ``[key, value]`` pairs, where
            ``value`` is ``None`` for a key written without ``=``).

    Returns:
        The assembled URL as a string. Empty (or otherwise falsy) fields
        are skipped.

    Note:
        ``"domain"`` is only emitted together with a non-empty
        ``"protocol"``; a domain without a protocol is not written out.
    """
    pieces = []

    if parsedUrl["protocol"]:
        pieces.append(parsedUrl["protocol"] + "://" + parsedUrl["domain"])

    if parsedUrl["path"]:
        # Only separate with "/" when something (protocol + domain) precedes
        # the path; a bare path is emitted unchanged.
        if pieces:
            pieces.append("/")
        pieces.append(parsedUrl["path"])

    # The first parameter is introduced by "?", every later one by "&".
    separator = "?"
    for parameter in parsedUrl["parameters"] or ():
        pieces.append(separator + parameter[0])
        # None means "key only"; an empty string still yields "key=".
        if parameter[1] is not None:
            pieces.append("=" + parameter[1])
        separator = "&"

    if parsedUrl["anchor"]:
        pieces.append("#" + parsedUrl["anchor"])

    return "".join(pieces)
global parseUrl


def parseUrl(url):
    """Split a URL string into protocol, domain, path, parameters and anchor.

    Args:
        url: The URL (absolute or relative) as a string.

    Returns:
        A dict with the keys:

        * ``"protocol"`` - text before the first ``:``, or ``""``.
        * ``"domain"`` - text between the protocol and the next ``/``,
          or ``""`` when there is no protocol.
        * ``"path"`` - remainder, without a leading ``/`` when a protocol
          is present; the whole remaining string otherwise.
        * ``"parameters"`` - list of ``[key, value]`` pairs from the query
          string; ``value`` is ``None`` for a key written without ``=``.
        * ``"anchor"`` - text after the first ``#``, or ``""``.

    Note:
        A protocol is recognised only when a ``:`` appears before any ``/``
        and is not the very first character. All slashes directly after the
        ``:`` are stripped, so ``"file:///etc"`` yields domain ``"etc"``.
    """
    out = {
        "protocol": "",
        "domain": "",
        "path": "",
        "parameters": [],
        "anchor": "",
    }

    # Anchor: everything after the first "#".
    url, _, out["anchor"] = url.partition("#")

    # Parameters: everything after the first "?", split on "&".
    url, hasQuery, query = url.partition("?")
    if hasQuery:
        for arg in query.split("&"):
            argKey, hasValue, argValue = arg.partition("=")
            out["parameters"].append([argKey, argValue if hasValue else None])

    # Protocol: a ":" counts only if it comes before the first "/". It must
    # also not be the first character: an empty protocol with a non-empty
    # domain cannot be written back by unparseUrl and would lose the domain.
    colonAt = url.find(":")
    slashAt = url.find("/")
    hasProtocol = colonAt > 0 and (slashAt == -1 or colonAt < slashAt)

    if not hasProtocol:
        out["path"] = url
        return out

    out["protocol"] = url[:colonAt]
    # Drop the "//" (or any run of slashes) that follows the colon.
    remainder = url[colonAt + 1:].lstrip("/")
    out["domain"], _, out["path"] = remainder.partition("/")
    return out
global urlJoin


def urlJoin(*args):
    """Join URL fragments with a single "/" between each pair.

    The first fragment is used exactly as given. Every later fragment has
    all of its leading and trailing "/" characters removed before it is
    appended after a "/" separator.

    Args:
        *args: URL fragments as strings.

    Returns:
        The joined string; ``""`` when called without arguments.
    """
    if not args:
        return ""

    outUrl = args[0]
    for arg in args[1:]:
        # Strip surrounding slashes so the explicit separator is the only one.
        outUrl = outUrl + "/" + arg.strip("/")
    return outUrl
global infoFetcher


def infoFetcher(info):
    """Placeholder for base-URL discovery; currently does nothing.

    The body contains no executable statements, so ``info`` is neither read
    nor modified.

    Args:
        info: Unused for now. Presumably a dict with a ``"headers"`` mapping
            that should receive a ``"baseUrl"`` entry, judging by the
            disabled draft below - TODO confirm against callers.

    Returns:
        None.
    """
    # Disabled draft of the header-based lookup (kept for reference):
    #
    #     if "Content-Base" in info["headers"]:
    #         info["baseUrl"] = info["headers"]["Content-Base"]
    #         return
    #     elif "Content-Location" in info["headers"]:
    #         info["baseUrl"] = "/"
    #         return

    # https://www.w3.org/TR/WD-html40-970917/htmlweb.html
    #
    # User agents should calculate the base URL for resolving relative URLs
    # according to the [RFC1808]. The following is a summary of how [RFC1808]
    # applies to HTML. User agents should calculate the base URL according to
    # the following precedences (highest priority to lowest):
    #
    # 1. The base URL is set by the BASE element. (TO BE IMPLEMENTED)
    # 2. The base URL is given by an HTTP header (see [RFC2068]).
    #    (TO BE IMPLEMENTED)
    # 3. By default, the base URL is that of the current document.
    #    (TO BE IMPLEMENTED)
    #
    # Additionally, the OBJECT and APPLET elements define attributes that
    # take precedence over the value set by the BASE element. Please consult
    # the definitions of these elements for more information about URL issues
    # specific to them.

    # https://tools.ietf.org/html/rfc2068
    #
    # 14.11 Content-Base
    #
    # The Content-Base entity-header field may be used to specify the base
    # URI for resolving relative URLs within the entity. This header field
    # is described as Base in RFC 1808, which is expected to be revised.
    #
    #     Content-Base = "Content-Base" ":" absoluteURI
    #
    # If no Content-Base field is present, the base URI of an entity is
    # defined either by its Content-Location (if that Content-Location URI
    # is an absolute URI) or the URI used to initiate the request, in that
    # order of precedence. Note, however, that the base URI of the contents
    # within the entity-body may be redefined within that entity-body.
    #
    # 14.15 Content-Location
    #
    # The Content-Location entity-header field may be used to supply the
    # resource location for the entity enclosed in the message. In the case
    # where a resource has multiple entities associated with it, and those
    # entities actually have separate locations by which they might be
    # individually accessed, the server should provide a Content-Location
    # for the particular variant which is returned. In addition, a server
    # SHOULD provide a Content-Location for the resource corresponding to
    # the response entity.
    #
    #     Content-Location = "Content-Location" ":"
    #                        ( absoluteURI | relativeURI )
    #
    # If no Content-Base header field is present, the value of Content-
    # Location also defines the base URL for the entity (see section
    # 14.11).
    #
    # The Content-Location value is not a replacement for the original
    # requested URI; it is only a statement of the location of the resource
    # corresponding to this particular entity at the time of the request.
    # Future requests MAY use the Content-Location URI if the desire is to
    # identify the source of that particular entity.
    #
    # A cache cannot assume that an entity with a Content-Location
    # different from the URI used to retrieve it can be used to respond to
    # later requests on that Content-Location URI. However, the Content-
    # Location can be used to differentiate between multiple entities
    # retrieved from a single requested resource, as described in section
    # 13.6.
    #
    # If the Content-Location is a relative URI, the URI is interpreted
    # relative to any Content-Base URI provided in the response. If no
    # Content-Base is provided, the relative URI is interpreted relative to
    # the Request-URI.
    return None