diff --git a/README.md b/README.md
index b8e135a7..f768c9bd 100644
--- a/README.md
+++ b/README.md
@@ -391,6 +391,10 @@ Not all sites can be effectively archived with each method, that's why it's best
 
 If it seems like more than 10-20% of sites in the archive are broken, open an [issue](https://github.com/pirate/bookmark-archiver/issues) with some of the URLs that failed to be archived and I'll investigate.
 
+**Removing unwanted links from the index:**
+
+If you accidentally added a lot of unwanted links to the index and they slow down your archiving, you can use the `bin/purge` script to remove them. It removes every link whose URL matches one of the Python regexes you pass to it, e.g. `bin/purge -r 'amazon\.com' -r 'google\.com'`. It prompts before removing anything from the index, but for extra safety you may want to back up `index.json` first (or put it under version control).
+
 ### Hosting the Archive
 
 If you're having issues trying to host the archive via nginx, make sure you already have nginx running with SSL.
diff --git a/archiver/purge.py b/archiver/purge.py
new file mode 100755
index 00000000..55ba6fb6
--- /dev/null
+++ b/archiver/purge.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python3
+import argparse
+import re
+from typing import List
+
+from archive import parse_json_link_index
+from config import OUTPUT_DIR
+from index import write_json_links_index
+
+
+def cleanup_index(patterns: List[str], yes=False):
+    regexes = [re.compile(p) for p in patterns]
+
+    index = parse_json_link_index(OUTPUT_DIR)
+    links = index['links']
+
+    filtered = []
+    remaining = []
+    for link in links:
+        url = link['url']
+        for regex in regexes:
+            if regex.search(url):
+                filtered.append((link, regex))
+                break
+        else:
+            # no regex matched this url (loop finished without break), keep the link
+            remaining.append(link)
+
+    print("Filtered out {}/{} urls:".format(len(filtered), len(links)))
+    for link, regex in filtered:
+        print("  {url} via {regex}".format(url=link['url'], regex=regex.pattern))
+
+    if yes:
+        proceed = True
+    else:
+        res = input("Remove {} entries from index? [y/n] ".format(len(filtered)))
+        proceed = res.strip().lower() in ('y', 'yes')
+
+    if proceed:
+        write_json_links_index(OUTPUT_DIR, remaining)
+    else:
+        exit('Aborting, index was not modified.')
+
+
+if __name__ == '__main__':
+    p = argparse.ArgumentParser(description='Index purging tool')
+    p.add_argument('--regex', '-r', action='append', required=True,
+                   help='Python regex of URLs to remove (can be passed multiple times)')
+    p.add_argument('--yes', action='store_true', default=False,
+                   help='Do not prompt for confirmation')
+
+    args = p.parse_args()
+    cleanup_index(args.regex, yes=args.yes)
diff --git a/bin/purge b/bin/purge
new file mode 120000
index 00000000..ad99fab3
--- /dev/null
+++ b/bin/purge
@@ -0,0 +1 @@
+../archiver/purge.py
\ No newline at end of file
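
For reference, here's roughly what a session with the new tool looks like. The URLs and counts below are made-up examples; the output lines follow the `print` and `input` calls in `purge.py` above:

```shell
# Run interactively first to review what would be removed:
$ bin/purge -r 'amazon\.com'
Filtered out 2/100 urls:
  https://www.amazon.com/gp/product/example via amazon\.com
  https://amazon.com/deals via amazon\.com
Remove 2 entries from index? [y/n] n
Aborting, index was not modified.

# In scripts, pass --yes to skip the confirmation prompt:
$ bin/purge -r 'amazon\.com' -r 'google\.com' --yes
```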