1
0
Fork 0

fix urlencoding of wget path

This commit is contained in:
Nick Sweeting 2018-04-17 09:44:07 -04:00
parent 82fc49caee
commit 8526906779

18
util.py
View file

@ -403,8 +403,10 @@ def wget_output_path(link, look_in=None):
See docs on wget --adjust-extension (-E) See docs on wget --adjust-extension (-E)
""" """
urlencode = lambda s: quote(s, encoding='utf-8', errors='replace')
if link['type'] in ('PDF', 'image'): if link['type'] in ('PDF', 'image'):
return quote(link['base_url']) return urlencode(link['base_url'])
# Since the wget algorithm for -E (appending .html) is incredibly complex # Since the wget algorithm for -E (appending .html) is incredibly complex
# instead of trying to emulate it here, we just look in the output folder # instead of trying to emulate it here, we just look in the output folder
@ -418,7 +420,7 @@ def wget_output_path(link, look_in=None):
if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", f, re.I | re.M) if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", f, re.I | re.M)
] ]
if html_files: if html_files:
return quote(os.path.join(wget_folder, html_files[0])) return urlencode(os.path.join(wget_folder, html_files[0]))
# If finding the actual output file didn't work, fall back to the buggy # If finding the actual output file didn't work, fall back to the buggy
# implementation of the wget .html appending algorithm # implementation of the wget .html appending algorithm
@ -427,20 +429,20 @@ def wget_output_path(link, look_in=None):
if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M): if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M):
# already ends in .html # already ends in .html
return quote(link['base_url']) return urlencode(link['base_url'])
else: else:
# .html needs to be appended # .html needs to be appended
without_scheme = split_url[0].split('://', 1)[-1].split('?', 1)[0] without_scheme = split_url[0].split('://', 1)[-1].split('?', 1)[0]
if without_scheme.endswith('/'): if without_scheme.endswith('/'):
if query: if query:
return quote('#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]])) return urlencode('#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]]))
return quote('#'.join([without_scheme + 'index.html', *split_url[1:]])) return urlencode('#'.join([without_scheme + 'index.html', *split_url[1:]]))
else: else:
if query: if query:
return quote('#'.join([without_scheme + '/index.html' + query + '.html', *split_url[1:]])) return urlencode('#'.join([without_scheme + '/index.html' + query + '.html', *split_url[1:]]))
elif '/' in without_scheme: elif '/' in without_scheme:
return quote('#'.join([without_scheme + '.html', *split_url[1:]])) return urlencode('#'.join([without_scheme + '.html', *split_url[1:]]))
return quote(link['base_url'] + '/index.html') return urlencode(link['base_url'] + '/index.html')
def derived_link_info(link): def derived_link_info(link):