Add script to remove entries from index
This commit is contained in:
parent
3ef4fa2387
commit
75c062f33e
3 changed files with 60 additions and 1 deletions
|
@ -388,9 +388,13 @@ Open an [issue](https://github.com/pirate/bookmark-archiver/issues) with a descr
|
|||
**Lots of broken links from the index:**
|
||||
|
||||
Not all sites can be effectively archived with each method, that's why it's best to use a combination of `wget`, PDFs, and screenshots.
|
||||
If it seems like more than 10-20% of sites in the archive are broken, open an [issue](https://github.com/pirate/bookmark-archiver/issues)
|
||||
If it seems like more than 10-20% of sites in the archive are broken, open an [issue](https://github.com/pirate/bookmark-archiver/issues)
|
||||
with some of the URLs that failed to be archived and I'll investigate.
|
||||
|
||||
**Removing unwanted links from the index:**
|
||||
|
||||
If you accidentally added lots of unwanted links into the index and they slow down your archiving, you can use the `bin/purge` script to remove them. It removes every link whose URL matches any of the Python regexes you pass to it, e.g.: `bin/purge -r 'amazon\.com' -r 'google\.com'`. It prompts before removing links from the index, but for extra safety you might want to back up `index.json` first (or put it under version control).
|
||||
|
||||
### Hosting the Archive
|
||||
|
||||
If you're having issues trying to host the archive via nginx, make sure you already have nginx running with SSL.
|
||||
|
|
54
archiver/purge.py
Executable file
54
archiver/purge.py
Executable file
|
@ -0,0 +1,54 @@
|
|||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import re
|
||||
from typing import List
|
||||
|
||||
from archive import parse_json_link_index
|
||||
from config import OUTPUT_DIR
|
||||
from index import write_json_links_index
|
||||
|
||||
|
||||
def cleanup_index(patterns: List[str], yes=False):
    """Remove links whose URL matches any of *patterns* from the JSON index.

    Loads the link index from ``OUTPUT_DIR``, reports every link whose
    ``url`` matches at least one pattern (``re.search``, so a match anywhere
    in the URL counts), asks for confirmation unless *yes* is true, and then
    writes the remaining links back to the index.

    Args:
        patterns: Python regular expressions to match against each link's
            URL. ``None`` or an empty list matches nothing.
        yes: When true, skip the interactive confirmation prompt.

    Raises:
        re.error: If one of *patterns* is not a valid regular expression.
        SystemExit: If the user declines the confirmation prompt.
    """
    # argparse hands over None when no --regex was given; treat as "no patterns".
    regexes = [re.compile(p) for p in patterns or ()]

    index = parse_json_link_index(OUTPUT_DIR)
    links = index['links']

    filtered = []
    remaining = []
    for link in links:
        url = link['url']
        # Keep the first regex that matched so the report can show why the
        # link was selected for removal.
        matched = next((r for r in regexes if r.search(url)), None)
        if matched is None:
            remaining.append(link)
        else:
            filtered.append((link, matched))

    print("Filtered out {}/{} urls:".format(len(filtered), len(links)))
    for link, regex in filtered:
        print(" {url} via {regex}".format(url=link['url'], regex=regex.pattern))

    if not filtered:
        # Nothing matched: do not prompt and do not rewrite an unchanged index.
        return

    if not yes:
        res = input("Remove {} entries from index? [y/n] ".format(len(filtered)))
        if res.strip().lower() not in ('y', 'yes'):
            # SystemExit instead of the site-provided exit() helper, which is
            # not guaranteed to be available in every interpreter setup.
            raise SystemExit('aborting')

    write_json_links_index(OUTPUT_DIR, remaining)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Command-line entry point: collect the regexes and pass them to
    # cleanup_index().
    # The text is passed as description=; the first positional argument of
    # ArgumentParser is prog, which would put it into the usage line.
    p = argparse.ArgumentParser(description='Index purging tool')
    # At least one regex is required; without it there is nothing to purge.
    p.add_argument('--regex', '-r', action='append', required=True,
                   help='Python regex to filter out')
    p.add_argument('--yes', action='store_true', default=False,
                   help='Do not prompt for confirmation')

    args = p.parse_args()
    regexes = args.regex
    cleanup_index(regexes, yes=args.yes)
|
1
bin/purge
Symbolic link
1
bin/purge
Symbolic link
|
@ -0,0 +1 @@
|
|||
../archiver/purge.py
|
Loading…
Reference in a new issue