- 01
- 02
- 03
- 04
- 05
- 06
- 07
- 08
- 09
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
def normalize_url(url, preserve_fragment=False):
    """Normalize a URL for deduplication/comparison (Python 2 only).

    Canonicalizes an http/https URL: adds a missing ``http://`` scheme,
    lowercases and IDNA-encodes the host, re-quotes the path and query with
    consistent percent-escaping, strips common tracking parameters, and
    drops the fragment unless asked to keep it.

    :param url: URL as a byte string or ``unicode`` object.
    :param preserve_fragment: keep the ``#fragment`` part when True;
        by default it is discarded.
    :returns: the normalized URL string; non-http(s) URLs (e.g. ``mailto:``,
        ``ftp:``) are returned unchanged apart from the initial ``strip()``.
    """
    url = url.strip()
    # No "scheme:" prefix at all -> assume http.  lstrip('/') also folds
    # protocol-relative inputs like "//example.com/x" into "http://example.com/x".
    if not re.search(r'^\w+:', url):
        url = 'http://' + url.lstrip('/')
    # Anything that is not http/https after the fix-up (mailto:, ftp:, or a
    # case-variant like "HTTP:") is passed through untouched.
    if not (url.startswith('http:') or url.startswith('https:')):
        return url
    # Mutable 5-item list: [scheme, netloc, path, query, fragment].
    url = list(urlparse.urlsplit(url))
    # Defensive scheme coercion.  NOTE(review): unreachable given the
    # startswith guard above — by this point the scheme is already
    # "http" or "https".
    if url[0] not in ('http', 'https'):
        url[0] = 'http'
    # Lowercase the host and IDNA-encode it (unicode -> ASCII bytes).
    # NOTE(review): this can raise for unusual netlocs (e.g. empty host);
    # confirm callers only pass well-formed hosts.
    url[1] = url[1].lower().encode('idna')
    # --- Path normalization: get a consistent byte string, then re-quote. ---
    # Best effort: downcast a unicode path to ASCII bytes when possible so
    # that unquote() operates on bytes.
    if type(url[2]) == unicode:
        try:
            url[2] = url[2].encode('ascii')
        except UnicodeEncodeError:
            pass
    # Decode existing %XX escapes, then...
    url[2] = urllib.unquote(url[2])
    # ...if the path is still unicode (non-ASCII), serialize it as UTF-8
    # before re-quoting.
    if type(url[2]) == unicode:
        url[2] = url[2].encode('utf-8')
    # Re-quote uniformly, keeping '/' literal.  This canonicalizes escape
    # case (%2f -> %2F) and escapes previously-raw reserved characters.
    url[2] = urllib.quote(url[2], '/')
    # --- Query normalization. ---
    # Same best-effort ASCII downcast as for the path.
    if type(url[3]) == unicode:
        try:
            url[3] = url[3].encode('ascii')
        except UnicodeEncodeError:
            pass
    # Tracking/analytics parameters removed from the query entirely.
    cut_params = ('utm_source', 'utm_medium', 'utm_term',
                  'utm_content', 'utm_campaign',
                  'yclid', 'gclid', 'ref')
    new_qsl = []
    # Re-encode each &-separated token individually.
    for tag in url[3].split('&'):
        if '=' in tag:
            # key=value pair: unquote both halves, drop tracking keys,
            # then re-quote each half separately.
            param, value = tag.split('=', 1)
            param = urllib.unquote(param)
            value = urllib.unquote(value)
            if param in cut_params:
                continue
            if type(value) == unicode:
                value = value.encode('utf-8')
            new_tag = "%s=%s" % (urllib.quote(param), urllib.quote(value))
        else:
            # Bare token with no '=': round-trip through unquote /
            # quote_plus (spaces become '+' here, unlike the pair branch).
            new_tag = urllib.unquote(tag)
            if type(new_tag) == unicode:
                new_tag = new_tag.encode('utf-8')
            new_tag = urllib.quote_plus(new_tag)
        new_qsl.append(new_tag)
    url[3] = '&'.join(new_qsl)
    # Fragments are dropped by default — they never reach the server.
    if not preserve_fragment:
        url[4] = ''
    return urlparse.urlunsplit(url)