fix urlencoding of wget path
commit 8526906779
parent 82fc49caee
1 changed file with 10 additions and 8 deletions

util.py | 18
@@ -403,8 +403,10 @@ def wget_output_path(link, look_in=None):
     See docs on wget --adjust-extension (-E)
     """
 
+    urlencode = lambda s: quote(s, encoding='utf-8', errors='replace')
+
     if link['type'] in ('PDF', 'image'):
-        return quote(link['base_url'])
+        return urlencode(link['base_url'])
 
     # Since the wget algorithm to for -E (appending .html) is incredibly complex
     # instead of trying to emulate it here, we just look in the output folder
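For reference, a minimal standalone sketch (not part of the commit) of what the new urlencode helper does, assuming quote here is urllib.parse.quote as imported elsewhere in util.py; the sample path is illustrative:

    from urllib.parse import quote

    # Same shape as the helper added above: percent-encode the output path,
    # substituting un-encodable characters instead of raising an error.
    urlencode = lambda s: quote(s, encoding='utf-8', errors='replace')

    print(urlencode('archive/1545678/page with spaces&q=1.html'))
    # -> archive/1545678/page%20with%20spaces%26q%3D1.html  ('/' stays unescaped by default)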
@@ -418,7 +420,7 @@ def wget_output_path(link, look_in=None):
             if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", f, re.I | re.M)
         ]
         if html_files:
-            return quote(os.path.join(wget_folder, html_files[0]))
+            return urlencode(os.path.join(wget_folder, html_files[0]))
 
     # If finding the actual output file didn't work, fall back to the buggy
     # implementation of the wget .html appending algorithm
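The folder-scanning code that produces wget_folder and html_files sits between the two hunks and is not shown here; a rough sketch of that "look in the output folder" approach, reusing the regex from the context lines above (the helper name and error handling are illustrative, not the actual util.py code):

    import os
    import re
    from urllib.parse import quote

    def find_wget_html(wget_folder):
        # List whatever wget wrote into the domain folder and keep only files
        # that already end in .htm/.html (same pattern as the diff context).
        try:
            files = os.listdir(wget_folder)
        except FileNotFoundError:
            return None
        html_files = [f for f in files if re.search(r'.+\.[Hh][Tt][Mm][Ll]?$', f, re.I | re.M)]
        if html_files:
            # Percent-encode the joined path, mirroring the urlencode helper above.
            return quote(os.path.join(wget_folder, html_files[0]), encoding='utf-8', errors='replace')
        return None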
@@ -427,20 +429,20 @@ def wget_output_path(link, look_in=None):
 
     if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M):
         # already ends in .html
-        return quote(link['base_url'])
+        return urlencode(link['base_url'])
     else:
         # .html needs to be appended
         without_scheme = split_url[0].split('://', 1)[-1].split('?', 1)[0]
         if without_scheme.endswith('/'):
             if query:
-                return quote('#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]]))
-            return quote('#'.join([without_scheme + 'index.html', *split_url[1:]]))
+                return urlencode('#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]]))
+            return urlencode('#'.join([without_scheme + 'index.html', *split_url[1:]]))
         else:
             if query:
-                return quote('#'.join([without_scheme + '/index.html' + query + '.html', *split_url[1:]]))
+                return urlencode('#'.join([without_scheme + '/index.html' + query + '.html', *split_url[1:]]))
             elif '/' in without_scheme:
-                return quote('#'.join([without_scheme + '.html', *split_url[1:]]))
-            return quote(link['base_url'] + '/index.html')
+                return urlencode('#'.join([without_scheme + '.html', *split_url[1:]]))
+            return urlencode(link['base_url'] + '/index.html')
 
 
 def derived_link_info(link):
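To make the fallback branches concrete, a small worked example of the trailing-slash-plus-query case (illustrative only: split_url and query are assigned earlier in the function, outside this diff, so their values here are assumptions):

    from urllib.parse import quote

    urlencode = lambda s: quote(s, encoding='utf-8', errors='replace')

    # Assumed inputs: split_url is the URL split on '#', query is the raw '?...' suffix.
    url = 'https://example.com/docs/?lang=en'
    split_url = url.split('#', 1)                                        # ['https://example.com/docs/?lang=en']
    query = '?lang=en'
    without_scheme = split_url[0].split('://', 1)[-1].split('?', 1)[0]   # 'example.com/docs/'

    # Trailing slash plus a query string -> 'index.html' + query + '.html' is appended:
    print(urlencode('#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]])))
    # -> example.com/docs/index.html%3Flang%3Den.html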