1
0
Fork 0

new version handling and absolute imports

This commit is contained in:
Nick Sweeting 2019-03-27 15:35:13 -04:00
parent bc1bc9fe02
commit 93216a3c3e
9 changed files with 58 additions and 61 deletions

View file

@@ -13,34 +13,37 @@ __package__ = 'archivebox'
import os
import sys
from typing import List, Optional
from schema import Link
from links import links_after_timestamp
from index import write_links_index, load_links_index
from archive_methods import archive_link
from config import (
from .schema import Link
from .links import links_after_timestamp
from .index import write_links_index, load_links_index
from .archive_methods import archive_link
from .config import (
ONLY_NEW,
OUTPUT_DIR,
GIT_SHA,
PYTHON_DIR,
VERSION,
)
from util import (
from .util import (
enforce_types,
save_remote_source,
save_stdin_source,
)
from logs import (
from .logs import (
log_archiving_started,
log_archiving_paused,
log_archiving_finished,
)
__AUTHOR__ = 'Nick Sweeting <git@nicksweeting.com>'
__VERSION__ = GIT_SHA[:9]
__VERSION__ = VERSION
__DESCRIPTION__ = 'ArchiveBox: The self-hosted internet archive.'
__DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki'
def print_help():
print('ArchiveBox: The self-hosted internet archive.\n')
print("Documentation:")

View file

@@ -4,13 +4,13 @@ from typing import Dict, List, Tuple
from collections import defaultdict
from datetime import datetime
from schema import Link, ArchiveResult, ArchiveError
from index import (
from .schema import Link, ArchiveResult, ArchiveError
from .index import (
write_link_index,
patch_links_index,
load_json_link_index,
)
from config import (
from .config import (
CURL_BINARY,
GIT_BINARY,
WGET_BINARY,
@@ -31,7 +31,7 @@ from config import (
ANSI,
OUTPUT_DIR,
GIT_DOMAINS,
GIT_SHA,
VERSION,
WGET_USER_AGENT,
CHECK_SSL_VALIDITY,
COOKIES_FILE,
@@ -43,7 +43,7 @@ from config import (
ONLY_NEW,
WGET_AUTO_COMPRESSION,
)
from util import (
from .util import (
enforce_types,
domain,
extension,
@@ -58,7 +58,7 @@ from util import (
run, PIPE, DEVNULL,
Link,
)
from logs import (
from .logs import (
log_link_archiving_started,
log_link_archiving_finished,
log_archive_method_started,
@@ -123,6 +123,7 @@ def archive_link(link: Link, page=None) -> Link:
if was_changed:
patch_links_index(link)
log_link_archiving_finished(link.link_dir, link, is_new, stats)
except KeyboardInterrupt:
@@ -606,7 +607,7 @@ def archive_dot_org(link_dir: str, link: Link, timeout: int=TIMEOUT) -> ArchiveR
CURL_BINARY,
'--location',
'--head',
'--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(GIT_SHA), # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from
'--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(VERSION), # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from
'--max-time', str(timeout),
*(() if CHECK_SSL_VALIDITY else ('--insecure',)),
submit_url,

View file

@@ -40,7 +40,7 @@ SUBMIT_ARCHIVE_DOT_ORG = os.getenv('SUBMIT_ARCHIVE_DOT_ORG', 'True'
CHECK_SSL_VALIDITY = os.getenv('CHECK_SSL_VALIDITY', 'True' ).lower() == 'true'
RESOLUTION = os.getenv('RESOLUTION', '1440,2000' )
GIT_DOMAINS = os.getenv('GIT_DOMAINS', 'github.com,bitbucket.org,gitlab.com').split(',')
WGET_USER_AGENT = os.getenv('WGET_USER_AGENT', 'ArchiveBox/{GIT_SHA} (+https://github.com/pirate/ArchiveBox/) wget/{WGET_VERSION}')
WGET_USER_AGENT = os.getenv('WGET_USER_AGENT', 'ArchiveBox/{VERSION} (+https://github.com/pirate/ArchiveBox/) wget/{WGET_VERSION}')
COOKIES_FILE = os.getenv('COOKIES_FILE', None)
CHROME_USER_DATA_DIR = os.getenv('CHROME_USER_DATA_DIR', None)
CHROME_HEADLESS = os.getenv('CHROME_HEADLESS', 'True' ).lower() == 'true'
@@ -163,21 +163,13 @@ def find_chrome_data_dir() -> Optional[str]:
return None
def get_git_version() -> str:
"""get the git commit hash of the python code folder (aka code version)"""
try:
return run([GIT_BINARY, 'rev-list', '-1', 'HEAD', './'], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode()
except Exception:
print('[!] Warning: unable to determine git version, is git installed and in your $PATH?')
return 'unknown'
# ******************************************************************************
# ************************ Environment & Dependencies **************************
# ******************************************************************************
try:
GIT_SHA = get_git_version()
VERSION = open(os.path.join(PYTHON_DIR, 'VERSION'), 'r').read().strip()
GIT_SHA = VERSION.split('+')[1]
### Terminal Configuration
TERM_WIDTH = lambda: shutil.get_terminal_size((100, 10)).columns
@@ -234,7 +226,7 @@ try:
WGET_AUTO_COMPRESSION = not run([WGET_BINARY, "--compression=auto", "--help"], stdout=DEVNULL).returncode
WGET_USER_AGENT = WGET_USER_AGENT.format(
GIT_SHA=GIT_SHA[:9],
VERSION=VERSION,
WGET_VERSION=WGET_VERSION or '',
)

View file

@@ -6,15 +6,16 @@ from string import Template
from typing import List, Tuple, Iterator, Optional
from dataclasses import fields
from schema import Link, ArchiveIndex, ArchiveResult
from config import (
from .schema import Link, ArchiveResult
from .config import (
OUTPUT_DIR,
TEMPLATES_DIR,
VERSION,
GIT_SHA,
FOOTER_INFO,
TIMEOUT,
)
from util import (
from .util import (
merge_links,
chmod_file,
urlencode,
@@ -25,9 +26,9 @@ from util import (
TimedProgress,
copy_and_overwrite,
)
from parse import parse_links
from links import validate_links
from logs import (
from .parse import parse_links
from .links import validate_links
from .logs import (
log_indexing_process_started,
log_indexing_started,
log_indexing_finished,
@@ -178,8 +179,8 @@ def write_html_links_index(out_dir: str, links: List[Link], finished: bool=False
'date_updated': datetime.now().strftime('%Y-%m-%d'),
'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'),
'footer_info': FOOTER_INFO,
'version': VERSION,
'git_sha': GIT_SHA,
'short_git_sha': GIT_SHA[:8],
'rows': link_rows,
'status': 'finished' if finished else 'running',
}

View file

@@ -22,8 +22,8 @@ Link {
from typing import Iterable
from collections import OrderedDict
from schema import Link
from util import (
from .schema import Link
from .util import (
scheme,
fuzzy_url,
merge_links,

View file

@@ -24,8 +24,8 @@ from typing import Tuple, List, IO, Iterable
from datetime import datetime
import xml.etree.ElementTree as etree
from config import TIMEOUT
from util import (
from .config import TIMEOUT
from .util import (
htmldecode,
str_between,
URL_REGEX,

View file

@@ -108,60 +108,60 @@ class Link:
@property
def link_dir(self) -> str:
from config import ARCHIVE_DIR
from .config import ARCHIVE_DIR
return os.path.join(ARCHIVE_DIR, self.timestamp)
@property
def archive_path(self) -> str:
from config import ARCHIVE_DIR_NAME
from .config import ARCHIVE_DIR_NAME
return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp)
### URL Helpers
@property
def urlhash(self):
from util import hashurl
from .util import hashurl
return hashurl(self.url)
@property
def extension(self) -> str:
from util import extension
from .util import extension
return extension(self.url)
@property
def domain(self) -> str:
from util import domain
from .util import domain
return domain(self.url)
@property
def path(self) -> str:
from util import path
from .util import path
return path(self.url)
@property
def basename(self) -> str:
from util import basename
from .util import basename
return basename(self.url)
@property
def base_url(self) -> str:
from util import base_url
from .util import base_url
return base_url(self.url)
### Pretty Printing Helpers
@property
def bookmarked_date(self) -> Optional[str]:
from util import ts_to_date
from .util import ts_to_date
return ts_to_date(self.timestamp) if self.timestamp else None
@property
def updated_date(self) -> Optional[str]:
from util import ts_to_date
from .util import ts_to_date
return ts_to_date(self.updated) if self.updated else None
@property
def oldest_archive_date(self) -> Optional[datetime]:
from util import ts_to_date
from .util import ts_to_date
most_recent = min(
(ts_to_date(result.start_ts)
@@ -173,7 +173,7 @@ class Link:
@property
def newest_archive_date(self) -> Optional[datetime]:
from util import ts_to_date
from .util import ts_to_date
most_recent = max(
(ts_to_date(result.start_ts)
@@ -197,13 +197,13 @@ class Link:
@property
def is_static(self) -> bool:
from util import is_static_file
from .util import is_static_file
return is_static_file(self.url)
@property
def is_archived(self) -> bool:
from config import ARCHIVE_DIR
from util import domain
from .config import ARCHIVE_DIR
from .util import domain
return os.path.exists(os.path.join(
ARCHIVE_DIR,
@@ -240,7 +240,7 @@ class Link:
return latest
def canonical_outputs(self) -> Dict[str, Optional[str]]:
from util import wget_output_path
from .util import wget_output_path
canonical = {
'index_url': 'index.html',
'favicon_url': 'favicon.ico',

View file

@@ -209,7 +209,7 @@
<center>
<small>
Archive created using <a href="https://github.com/pirate/ArchiveBox" title="Github">ArchiveBox</a>
version <a href="https://github.com/pirate/ArchiveBox/commit/$git_sha" title="Git commit">$short_git_sha</a> &nbsp; | &nbsp;
version <a href="https://github.com/pirate/ArchiveBox/commit/$git_sha" title="Git commit">$version</a> &nbsp; | &nbsp;
Download index as <a href="index.json" title="JSON summary of archived links.">JSON</a>
<br/><br/>
$footer_info

View file

@@ -25,8 +25,8 @@ from subprocess import (
from base32_crockford import encode as base32_encode
from schema import Link
from config import (
from .schema import Link
from .config import (
ANSI,
TERM_WIDTH,
SOURCES_DIR,
@@ -37,9 +37,9 @@ from config import (
CHECK_SSL_VALIDITY,
WGET_USER_AGENT,
CHROME_OPTIONS,
PYTHON_PATH,
PYTHON_DIR,
)
from logs import pretty_path
from .logs import pretty_path
### Parsing Helpers
@@ -334,7 +334,7 @@ def wget_output_path(link: Link) -> Optional[str]:
@enforce_types
def read_js_script(script_name: str) -> str:
script_path = os.path.join(PYTHON_PATH, 'scripts', script_name)
script_path = os.path.join(PYTHON_DIR, 'scripts', script_name)
with open(script_path, 'r') as f:
return f.read().split('// INFO BELOW HERE')[0].strip()