fixes unstable sorting between consecutive runs
This commit is contained in:
parent
8a23358fc8
commit
5450afd18b
1 changed file with 4 additions and 3 deletions
|
@@ -34,6 +34,7 @@ Link {
|
||||||
|
|
||||||
import datetime
|
import datetime
|
||||||
from html import unescape
|
from html import unescape
|
||||||
|
from collections import OrderedDict
|
||||||
|
|
||||||
from util import (
|
from util import (
|
||||||
domain,
|
domain,
|
||||||
|
@@ -87,7 +88,7 @@ def uniquefied_links(sorted_links):
|
||||||
ensures that all non-duplicate links have monotonically increasing timestamps
|
ensures that all non-duplicate links have monotonically increasing timestamps
|
||||||
"""
|
"""
|
||||||
|
|
||||||
unique_urls = {}
|
unique_urls = OrderedDict()
|
||||||
|
|
||||||
lower = lambda url: url.lower().strip()
|
lower = lambda url: url.lower().strip()
|
||||||
without_www = lambda url: url.replace('://www.', '://', 1)
|
without_www = lambda url: url.replace('://www.', '://', 1)
|
||||||
|
@@ -100,7 +101,7 @@ def uniquefied_links(sorted_links):
|
||||||
link = merge_links(unique_urls[fuzzy_url], link)
|
link = merge_links(unique_urls[fuzzy_url], link)
|
||||||
unique_urls[fuzzy_url] = link
|
unique_urls[fuzzy_url] = link
|
||||||
|
|
||||||
unique_timestamps = {}
|
unique_timestamps = OrderedDict()
|
||||||
for link in unique_urls.values():
|
for link in unique_urls.values():
|
||||||
link['timestamp'] = lowest_uniq_timestamp(unique_timestamps, link['timestamp'])
|
link['timestamp'] = lowest_uniq_timestamp(unique_timestamps, link['timestamp'])
|
||||||
unique_timestamps[link['timestamp']] = link
|
unique_timestamps[link['timestamp']] = link
|
||||||
|
@@ -108,7 +109,7 @@ def uniquefied_links(sorted_links):
|
||||||
return unique_timestamps.values()
|
return unique_timestamps.values()
|
||||||
|
|
||||||
def sorted_links(links):
|
def sorted_links(links):
|
||||||
sort_func = lambda link: (link['timestamp'], link['url'])
|
sort_func = lambda link: (link['timestamp'].split('.', 1)[0], link['url'])
|
||||||
return sorted(links, key=sort_func, reverse=True)
|
return sorted(links, key=sort_func, reverse=True)
|
||||||
|
|
||||||
def links_after_timestamp(links, timestamp=None):
|
def links_after_timestamp(links, timestamp=None):
|
||||||
|
|
Loading…
Add table
Reference in a new issue