readability url and title fixes
This commit is contained in:
parent
843d989382
commit
92f7b399ca
1 changed file with 8 additions and 6 deletions
14
archive.py
14
archive.py
|
@@ -35,14 +35,16 @@ def parse_pocket_export(html):
|
||||||
for line in html:
|
for line in html:
|
||||||
match = pattern.search(line)
|
match = pattern.search(line)
|
||||||
if match:
|
if match:
|
||||||
|
fixed_url = match.group(1).replace('http://www.readability.com/read?url=', '') # remove old readability prefixes to get original url
|
||||||
|
without_scheme = fixed_url.replace('http://', '').replace('https://', '')
|
||||||
yield {
|
yield {
|
||||||
'url': match.group(1).replace('http://www.readability.com/read?url=', ''),
|
'url': fixed_url,
|
||||||
'domain': match.group(1).replace('http://', '').replace('https://', '').split('/')[0],
|
'domain': without_scheme.split('/')[0], # without pathname
|
||||||
'base_url': match.group(1).replace('https://', '').replace('http://', '').split('?')[0],
|
'base_url': without_scheme.split('?')[0], # without query args
|
||||||
'time': datetime.fromtimestamp(int(match.group(2))),
|
'time': datetime.fromtimestamp(int(match.group(2))),
|
||||||
'timestamp': match.group(2),
|
'timestamp': match.group(2),
|
||||||
'tags': match.group(3),
|
'tags': match.group(3),
|
||||||
'title': match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', ''),
|
'title': match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '') or without_scheme,
|
||||||
}
|
}
|
||||||
|
|
||||||
def parse_pinboard_export(html):
|
def parse_pinboard_export(html):
|
||||||
|
@@ -51,13 +53,13 @@ def parse_pinboard_export(html):
|
||||||
if line:
|
if line:
|
||||||
erg = line
|
erg = line
|
||||||
yield {
|
yield {
|
||||||
'url': erg['href'].replace('http://www.readability.com/read?url=', ''),
|
'url': erg['href'],
|
||||||
'domain': erg['href'].replace('http://', '').replace('https://', '').split('/')[0],
|
'domain': erg['href'].replace('http://', '').replace('https://', '').split('/')[0],
|
||||||
'base_url': erg['href'].replace('https://', '').replace('http://', '').split('?')[0],
|
'base_url': erg['href'].replace('https://', '').replace('http://', '').split('?')[0],
|
||||||
'time': datetime.fromtimestamp(time.mktime(time.strptime(erg['time'].split(',')[0],'%Y-%m-%dT%H:%M:%SZ'))),
|
'time': datetime.fromtimestamp(time.mktime(time.strptime(erg['time'].split(',')[0],'%Y-%m-%dT%H:%M:%SZ'))),
|
||||||
'timestamp': time.mktime(time.strptime(erg['time'].split(',')[0],'%Y-%m-%dT%H:%M:%SZ')),
|
'timestamp': time.mktime(time.strptime(erg['time'].split(',')[0],'%Y-%m-%dT%H:%M:%SZ')),
|
||||||
'tags': erg['tags'],
|
'tags': erg['tags'],
|
||||||
'title': erg['description'].replace(' — Readability', '').replace('http://www.readability.com/read?url=', ''),
|
'title': erg['description'].replace(' — Readability', ''),
|
||||||
}
|
}
|
||||||
|
|
||||||
def dump_index(links, service):
|
def dump_index(links, service):
|
||||||
|
|
Loading…
Reference in a new issue