
feat: Remove index.json and index.html generation from the regular process

Authored by Cristian on 2020-10-08 11:02:26 -05:00, committed by Cristian Vargas
parent 494af5f2e1
commit ae1484b8bf
5 changed files with 25 additions and 30 deletions
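
In short: the main index is now written only to index.sqlite3, and index.json / index.html are no longer regenerated on every add/remove/update. As the updated tests below show, the equivalent static files can still be produced on demand with `archivebox list`. A minimal sketch of that workflow, assuming it runs inside an existing ArchiveBox data directory (whether `--html` emits a complete page or just the table markup isn't pinned down by these tests, so treat this as a sketch, not the blessed export path):

    import subprocess

    # Regenerate the static JSON index on demand, mirroring the updated
    # test_orphaned_folders test: `archivebox list --json --with-headers`
    # prints the main index to stdout.
    list_process = subprocess.run(
        ["archivebox", "list", "--json", "--with-headers"],
        capture_output=True,
    )
    with open("index.json", "wb") as f:
        f.write(list_process.stdout)

    # The HTML flavor works the same way (used by the title test below).
    html_process = subprocess.run(["archivebox", "list", "--html"], capture_output=True)
    with open("index.html", "wb") as f:
        f.write(html_process.stdout)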

View file

@@ -225,7 +225,7 @@ def timed_index_update(out_path: Path):

 @enforce_types
 def write_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR, finished: bool=False) -> None:
-    """create index.html file for a given list of links"""
+    """Writes links to sqlite3 file for a given list of links"""

     log_indexing_process_started(len(links))
@@ -234,8 +234,6 @@ def write_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR, finished: bool
         write_sql_main_index(links, out_dir=out_dir)
         os.chmod(out_dir / SQL_INDEX_FILENAME, int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes
-        if finished:
-            write_static_index(links, out_dir=out_dir)
     except (KeyboardInterrupt, SystemExit):
         stderr('[!] Warning: Still writing index to disk...', color='lightyellow')
         stderr('    Run archivebox init to fix any inconsisntencies from an ungraceful exit.')
@@ -246,13 +244,6 @@ def write_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR, finished: bool
     log_indexing_process_finished()

-@enforce_types
-def write_static_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None:
-    with timed_index_update(out_dir / JSON_INDEX_FILENAME):
-        write_json_main_index(links)
-    with timed_index_update(out_dir / HTML_INDEX_FILENAME):
-        write_html_main_index(links, out_dir=out_dir, finished=True)

 @enforce_types
 def get_empty_snapshot_queryset(out_dir: Path=OUTPUT_DIR):
     setup_django(out_dir, check_db=True)
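
With write_static_index gone from this module, index.sqlite3 becomes the single artifact that write_main_index maintains. A minimal sketch of reading it back directly, patterned on the updated test_depth_flag test below (assumes the working directory is an ArchiveBox data directory):

    import sqlite3

    # Snapshot URLs now live in the core_snapshot table of index.sqlite3.
    conn = sqlite3.connect("index.sqlite3")
    try:
        rows = conn.execute("SELECT url FROM core_snapshot").fetchall()
    finally:
        conn.close()

    urls = [row[0] for row in rows]  # each fetched row is a 1-tuple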

View file

@@ -31,7 +31,6 @@ from .index import (
     parse_links_from_source,
     dedupe_links,
     write_main_index,
-    write_static_index,
     snapshot_filter,
     get_indexed_folders,
     get_archived_folders,
@@ -561,10 +560,7 @@ def add(urls: Union[str, List[str]],
         archive_links(imported_links, overwrite=True, out_dir=out_dir)
     elif new_links:
         archive_links(new_links, overwrite=False, out_dir=out_dir)
-    else:
-        return all_links
-    write_static_index([link.as_link_with_details() for link in all_links], out_dir=out_dir)

     return all_links

 @enforce_types
@@ -641,7 +637,6 @@ def remove(filter_str: Optional[str]=None,
     remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir)
     all_snapshots = load_main_index(out_dir=out_dir)
-    write_static_index([link.as_link_with_details() for link in all_snapshots], out_dir=out_dir)
     log_removal_finished(all_snapshots.count(), to_remove)

     return all_snapshots
@@ -698,7 +693,6 @@ def update(resume: Optional[float]=None,
     # Step 4: Re-write links index with updated titles, icons, and resources
     all_links = load_main_index(out_dir=out_dir)
-    write_static_index([link.as_link_with_details() for link in all_links], out_dir=out_dir)

     return all_links

 @enforce_types
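
The upshot for callers of the Python API: add(), remove(), and update() now return after refreshing only the SQL index. A hedged sketch of what that means in a script (the import path and the out_dir keyword are assumptions inferred from the signatures in this diff, not verified against the full module):

    from pathlib import Path
    from archivebox.main import add  # assumption: this file is archivebox/main.py

    data_dir = Path('.')  # assumption: current directory is the data dir

    # After this commit, add() refreshes only index.sqlite3; index.json and
    # index.html are no longer rewritten as a side effect of the call.
    all_links = add('https://example.com', out_dir=data_dir)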

View file

@@ -1,5 +1,6 @@
 import subprocess
 import json
+import sqlite3

 from .fixtures import *
@@ -43,11 +44,16 @@ def test_depth_flag_1_crawls_the_page_AND_links(tmp_path, process, disable_extra
         capture_output=True,
         env=disable_extractors_dict,
     )
-    with open(tmp_path / "index.json", "r") as f:
-        archive_file = f.read()
-    assert "http://127.0.0.1:8080/static/example.com.html" in archive_file
-    assert "http://127.0.0.1:8080/static/iana.org.html" in archive_file
+    conn = sqlite3.connect("index.sqlite3")
+    c = conn.cursor()
+    urls = c.execute("SELECT url from core_snapshot").fetchall()
+    conn.commit()
+    conn.close()
+
+    urls = list(map(lambda x: x[0], urls))
+
+    assert "http://127.0.0.1:8080/static/example.com.html" in urls
+    assert "http://127.0.0.1:8080/static/iana.org.html" in urls


 def test_overwrite_flag_is_accepted(process, disable_extractors_dict):
@@ -71,6 +77,8 @@ def test_add_updates_history_json_index(tmp_path, process, disable_extractors_di
         env=disable_extractors_dict,
     )
-    with open(tmp_path / "index.json", "r") as f:
+    archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
+
+    with open(archived_item_path / "index.json", "r") as f:
         output_json = json.load(f)
-    assert output_json["links"][0]["history"] != {}
+    assert output_json["history"] != {}
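
Note the shape change in that last hunk: the test now reads the per-snapshot index.json under archive/<timestamp>/, whose top level is the snapshot record itself, so output_json["links"][0]["history"] becomes output_json["history"]. A small sketch of reading it, where the single-snapshot assumption mirrors the test:

    import json
    from pathlib import Path

    data_dir = Path('.')  # assumption: an ArchiveBox data directory
    snapshot_dir = list(data_dir.glob('archive/**/*'))[0]  # same glob as the test

    with open(snapshot_dir / 'index.json', 'r') as f:
        snapshot = json.load(f)

    # history maps extractor names to their results for this one snapshot
    assert snapshot['history'] != {}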

View file

@@ -32,10 +32,11 @@ def test_add_link(tmp_path, process, disable_extractors_dict):
         output_json = json.load(f)
     assert "Example Domain" == output_json['history']['title'][0]['output']

-    with open(tmp_path / "index.html", "r") as f:
+    with open(archived_item_path / "index.html", "r") as f:
         output_html = f.read()
     assert "Example Domain" in output_html

 def test_add_link_support_stdin(tmp_path, process, disable_extractors_dict):
     disable_extractors_dict.update({"USE_WGET": "true"})
     os.chdir(tmp_path)
@@ -51,7 +52,7 @@ def test_add_link_support_stdin(tmp_path, process, disable_extractors_dict):
     assert "Example Domain" == output_json['history']['title'][0]['output']

 def test_correct_permissions_output_folder(tmp_path, process):
-    index_files = ['index.json', 'index.html', 'index.sqlite3', 'archive']
+    index_files = ['index.sqlite3', 'archive']
     for file in index_files:
         file_path = tmp_path / file
         assert oct(file_path.stat().st_mode)[-3:] == OUTPUT_PERMISSIONS
@@ -113,6 +114,9 @@ def test_orphaned_folders(tmp_path, process, disable_extractors_dict):
     os.chdir(tmp_path)
     subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True,
                    env=disable_extractors_dict)
+    list_process = subprocess.run(["archivebox", "list", "--json", "--with-headers"], capture_output=True)
+    with open(tmp_path / "index.json", "wb") as f:
+        f.write(list_process.stdout)
     conn = sqlite3.connect("index.sqlite3")
     c = conn.cursor()
     c.execute("DELETE from core_snapshot")

View file

@@ -6,10 +6,8 @@ def test_title_is_htmlencoded_in_index_html(tmp_path, process, disable_extractor
     Unencoded content should not be rendered as it facilitates xss injections
     and breaks the layout.
     """
-    add_process = subprocess.run(['archivebox', 'add', 'http://localhost:8080/static/title_with_html.com.html'],
-                                 capture_output=True, env=disable_extractors_dict)
+    subprocess.run(['archivebox', 'add', 'http://localhost:8080/static/title_with_html.com.html'],
+                   capture_output=True, env=disable_extractors_dict)
+    list_process = subprocess.run(["archivebox", "list", "--html"], capture_output=True)

-    with open(tmp_path / "index.html", "r") as f:
-        output_html = f.read()
-    assert "<textarea>" not in output_html
+    assert "<textarea>" not in list_process.stdout.decode("utf-8")