
Merge branch 'dev' into search_index_extract_html_text

commit a680724367 by Nick Sweeting, 2023-10-27 23:09:28 -07:00 (committed by GitHub)
29 changed files with 3230 additions and 1654 deletions


@@ -5,16 +5,21 @@ __pycache__/
 .mypy_cache/
 .pytest_cache/
 .github/
+.git/
+.pdm-build/
+.pdm-python/
+.eggs/
 venv/
 .venv/
 .docker-venv/
+node_modules/
 build/
 dist/
+pip_dist/
+!pip_dist/archivebox.egg-info/requires.txt
 brew_dist/
 deb_dist/
-pip_dist/
 assets/
 data/


@@ -7,7 +7,7 @@ on:
 jobs:
   build:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-22.04
     steps:
       - uses: actions/checkout@v2
@@ -18,7 +18,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v1
         with:
-          python-version: 3.9
+          python-version: 3.11
           architecture: x64
       - name: Build Python Package

.gitignore

@@ -13,6 +13,8 @@ venv/
 node_modules/

 # Packaging artifacts
+.pdm-python
+.pdm-build
 archivebox.egg-info
 archivebox-*.tar.gz
 build/


@@ -12,19 +12,21 @@
 # docker buildx create --use
 # docker buildx build . --platform=linux/amd64,linux/arm64,linux/arm/v7 --push -t archivebox/archivebox:latest -t archivebox/archivebox:dev
 #
-# Read more about [developing
-# Archivebox](https://github.com/ArchiveBox/ArchiveBox#archivebox-development).
+# Read more about [developing Archivebox](https://github.com/ArchiveBox/ArchiveBox#archivebox-development).

-FROM python:3.11-slim-bullseye
+FROM debian:bookworm-backports
+# Debian 12 w/ faster package updates: https://packages.debian.org/bookworm-backports/

 LABEL name="archivebox" \
-    maintainer="Nick Sweeting <archivebox-docker@sweeting.me>" \
+    maintainer="Nick Sweeting <dockerfile@archivebox.io>" \
     description="All-in-one personal internet archiving container" \
     homepage="https://github.com/ArchiveBox/ArchiveBox" \
     documentation="https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#docker"

-# System-level base config
+######### Base System Setup ####################################
+
+# Global system-level config
 ENV TZ=UTC \
     LANGUAGE=en_US:en \
     LC_ALL=C.UTF-8 \
@@ -32,103 +34,156 @@ ENV TZ=UTC \
     PYTHONIOENCODING=UTF-8 \
     PYTHONUNBUFFERED=1 \
     DEBIAN_FRONTEND=noninteractive \
-    APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1
+    APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1 \
+    npm_config_loglevel=error

-# Application-level base config
+# Application-level config
 ENV CODE_DIR=/app \
-    VENV_PATH=/venv \
     DATA_DIR=/data \
-    NODE_DIR=/node \
+    GLOBAL_VENV=/venv \
+    APP_VENV=/app/.venv \
+    NODE_MODULES=/app/node_modules \
     ARCHIVEBOX_USER="archivebox"

+ENV PATH="$PATH:$GLOBAL_VENV/bin:$APP_VENV/bin:$NODE_MODULES/.bin"
+
+SHELL ["/bin/bash", "-c"]
+
+ARG TARGETPLATFORM
+ARG TARGETARCH
+ARG TARGETVARIANT
+RUN printf "[i] Building for TARGETPLATFORM=${TARGETPLATFORM}" \
+    && printf ", TARGETARCH=${TARGETARCH}" \
+    && printf ", TARGETVARIANT=${TARGETVARIANT} \n" \
+    && printf "uname -a : " && uname -a

 # Create non-privileged user for archivebox and chrome
-RUN groupadd --system $ARCHIVEBOX_USER \
-    && useradd --system --create-home --gid $ARCHIVEBOX_USER --groups audio,video $ARCHIVEBOX_USER
+RUN echo "[*] Setting up system environment..." \
+    && groupadd --system $ARCHIVEBOX_USER \
+    && useradd --system --create-home --gid $ARCHIVEBOX_USER --groups audio,video $ARCHIVEBOX_USER \
+    && mkdir -p /etc/apt/keyrings

-# Install system dependencies
-RUN apt-get update -qq \
-    && apt-get install -qq -y --no-install-recommends \
-        apt-transport-https ca-certificates gnupg2 zlib1g-dev \
-        dumb-init gosu cron unzip curl \
+# Install system apt dependencies (adding backports to access more recent apt updates)
+RUN echo "[+] Installing system dependencies..." \
+    && echo 'deb https://deb.debian.org/debian bookworm-backports main contrib non-free' >> /etc/apt/sources.list.d/backports.list \
+    && apt-get update -qq \
+    && apt-get install -qq -y \
+        apt-transport-https ca-certificates gnupg2 curl wget \
+        zlib1g-dev dumb-init gosu cron unzip \
+        # nano iputils-ping dnsutils htop procps \
+        # 1. packaging dependencies
+        # 2. docker and init system dependencies
+        # 3. frivolous CLI helpers to make debugging failed archiving easier
     && rm -rf /var/lib/apt/lists/*

-# Install apt dependencies
-RUN apt-get update -qq \
-    && apt-get install -qq -y --no-install-recommends \
-        wget curl chromium git ffmpeg youtube-dl ripgrep \
-        fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
-    && ln -s /usr/bin/chromium /usr/bin/chromium-browser \
-    && rm -rf /var/lib/apt/lists/*
+######### Language Environments ####################################

 # Install Node environment
-RUN curl -s https://deb.nodesource.com/gpgkey/nodesource.gpg.key | apt-key add - \
-    && echo 'deb https://deb.nodesource.com/node_18.x buster main' >> /etc/apt/sources.list \
+RUN echo "[+] Installing Node environment..." \
+    && echo 'deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_21.x nodistro main' >> /etc/apt/sources.list.d/nodejs.list \
+    && curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg \
     && apt-get update -qq \
-    && apt-get install -qq -y --no-install-recommends \
-        nodejs \
-    # && npm install -g npm \
+    && apt-get install -qq -y nodejs libatomic1 \
+    && npm i -g npm \
+    && node --version \
+    && npm --version
+
+# Install Python environment
+RUN echo "[+] Installing Python environment..." \
+    && apt-get update -qq \
+    && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
+        python3 python3-pip python3-venv python3-setuptools python3-wheel python-dev-is-python3 \
+        python3-ldap libldap2-dev libsasl2-dev libssl-dev python3-msgpack \
+    && rm /usr/lib/python3*/EXTERNALLY-MANAGED \
+    && python3 -m venv --system-site-packages --symlinks $GLOBAL_VENV \
+    && $GLOBAL_VENV/bin/pip install --upgrade pip pdm setuptools wheel python-ldap \
     && rm -rf /var/lib/apt/lists/*

+######### Extractor Dependencies ##################################
+
+# Install apt dependencies
+RUN echo "[+] Installing extractor APT dependencies..." \
+    && apt-get update -qq \
+    && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
+        curl wget git yt-dlp ffmpeg ripgrep \
+        # Packages we have also needed in the past:
+        # youtube-dl wget2 aria2 python3-pyxattr rtmpdump libfribidi-bin mpv \
+        # fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install chromium browser using playwright
+ENV PLAYWRIGHT_BROWSERS_PATH="/browsers"
+RUN echo "[+] Installing extractor Chromium dependency..." \
+    && apt-get update -qq \
+    && $GLOBAL_VENV/bin/pip install playwright \
+    && $GLOBAL_VENV/bin/playwright install --with-deps chromium \
+    && CHROME_BINARY="$($GLOBAL_VENV/bin/python -c 'from playwright.sync_api import sync_playwright; print(sync_playwright().start().chromium.executable_path)')" \
+    && ln -s "$CHROME_BINARY" /usr/bin/chromium-browser \
+    && mkdir -p "/home/${ARCHIVEBOX_USER}/.config/chromium/Crash Reports/pending/" \
+    && chown -R $ARCHIVEBOX_USER "/home/${ARCHIVEBOX_USER}/.config" \
+    || if [[ "$TARGETPLATFORM" == "linux/arm/v7" ]]; then exit 0; else exit 1; fi
+    # ignore failure for architectures where no playwright release is available yet

 # Install Node dependencies
-WORKDIR "$NODE_DIR"
-ENV PATH="${PATH}:$NODE_DIR/node_modules/.bin" \
-    npm_config_loglevel=error
-ADD ./package.json ./package.json
-ADD ./package-lock.json ./package-lock.json
-RUN npm ci
-
-# Install Python dependencies
 WORKDIR "$CODE_DIR"
-ENV PATH="${PATH}:$VENV_PATH/bin"
-RUN python -m venv --clear --symlinks "$VENV_PATH" \
-    && pip install --upgrade --quiet pip setuptools \
-    && mkdir -p "$CODE_DIR/archivebox"
-ADD "./setup.py" "$CODE_DIR/"
-ADD "./package.json" "$CODE_DIR/archivebox/"
-RUN apt-get update -qq \
-    && apt-get install -qq -y --no-install-recommends \
-        build-essential python-dev python3-dev libldap2-dev libsasl2-dev \
-    && echo 'empty placeholder for setup.py to use' > "$CODE_DIR/archivebox/README.md" \
-    && python3 -c 'from distutils.core import run_setup; result = run_setup("./setup.py", stop_after="init"); print("\n".join(result.install_requires + result.extras_require["sonic"]))' > /tmp/requirements.txt \
-    && pip install -r /tmp/requirements.txt \
-    && pip install --upgrade youtube-dl yt-dlp \
-    && apt-get purge -y build-essential python-dev python3-dev libldap2-dev libsasl2-dev \
-    && apt-get autoremove -y \
-    && rm -rf /var/lib/apt/lists/*
+COPY --chown=root:root --chmod=755 "package.json" "package-lock.json" "$CODE_DIR/"
+RUN echo "[+] Installing extractor Node dependencies..." \
+    && npm ci --prefer-offline --no-audit \
+    && npm version

-# Install apt development dependencies
-# RUN apt-get install -qq \
-#     && apt-get install -qq -y --no-install-recommends \
-#         python3 python3-dev python3-pip python3-venv python3-all \
-#         dh-python debhelper devscripts dput software-properties-common \
-#         python3-distutils python3-setuptools python3-wheel python3-stdeb
-# RUN python3 -c 'from distutils.core import run_setup; result = run_setup("./setup.py", stop_after="init"); print("\n".join(result.extras_require["dev"]))' > /tmp/dev_requirements.txt \
-#     && pip install --quiet -r /tmp/dev_requirements.txt
+######### Build Dependencies ####################################

-# Install ArchiveBox Python package and its dependencies
-WORKDIR "$CODE_DIR"
-ADD . "$CODE_DIR"
-RUN chown -R root:root . && chmod a+rX -R . && pip install -e .
+# # Building ArchiveBox from source with all pdm dev dependencies
+# WORKDIR "$CODE_DIR"
+# COPY --chown=root:root --chmod=755 "./pyproject.toml" "./pdm.lock" "$CODE_DIR/"
+# RUN echo "[+] Installing project Python dependencies..." \
+#     && apt-get update -qq \
+#     && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
+#         build-essential libssl-dev libldap2-dev libsasl2-dev \
+#     && pdm use -f $GLOBAL_VENV \
+#     && pdm install --fail-fast --no-lock --group :all --no-self \
+#     && pdm build \
+#     && apt-get purge -y \
+#         build-essential libssl-dev libldap2-dev libsasl2-dev \
+#     # these are only needed to build CPython libs, we discard after build phase to shrink layer size
+#     && apt-get autoremove -y \
+#     && rm -rf /var/lib/apt/lists/*
+
+# Install ArchiveBox Python package from source
+COPY --chown=root:root --chmod=755 "." "$CODE_DIR/"
+RUN echo "[*] Installing ArchiveBox package from /app..." \
+    && apt-get update -qq \
+    && $GLOBAL_VENV/bin/pip install -e "$CODE_DIR"[sonic,ldap]
+
+####################################################

 # Setup ArchiveBox runtime config
 WORKDIR "$DATA_DIR"
 ENV IN_DOCKER=True \
+    WGET_BINARY="wget" \
+    YOUTUBEDL_BINARY="yt-dlp" \
     CHROME_SANDBOX=False \
     CHROME_BINARY="/usr/bin/chromium-browser" \
     USE_SINGLEFILE=True \
-    SINGLEFILE_BINARY="$NODE_DIR/node_modules/.bin/single-file" \
+    SINGLEFILE_BINARY="$NODE_MODULES/.bin/single-file" \
     USE_READABILITY=True \
-    READABILITY_BINARY="$NODE_DIR/node_modules/.bin/readability-extractor" \
+    READABILITY_BINARY="$NODE_MODULES/.bin/readability-extractor" \
     USE_MERCURY=True \
-    MERCURY_BINARY="$NODE_DIR/node_modules/.bin/mercury-parser" \
-    YOUTUBEDL_BINARY="yt-dlp"
+    MERCURY_BINARY="$NODE_MODULES/.bin/postlight-parser"

 # Print version for nice docker finish summary
 # RUN archivebox version
-RUN /app/bin/docker_entrypoint.sh archivebox version
+RUN echo "[√] Finished Docker build succesfully. Saving build summary in: /version_info.txt" \
+    && uname -a | tee -a /version_info.txt \
+    && env --chdir="$NODE_DIR" npm version | tee -a /version_info.txt \
+    && env --chdir="$CODE_DIR" pdm info | tee -a /version_info.txt \
+    && "$CODE_DIR/bin/docker_entrypoint.sh" archivebox version 2>&1 | tee -a /version_info.txt
+
+####################################################

 # Open up the interfaces to the outside world
-VOLUME "$DATA_DIR"
+VOLUME "/data"
 EXPOSE 8000

 # Optional:


@@ -10,7 +10,7 @@
 <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community">Community</a> |
 <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap">Roadmap</a>

-<pre lang="bash"><code style="white-space: pre-line">"Your own personal internet archive" (网站存档 / 爬虫)
+<pre lang="bash" align="center"><code style="white-space: pre-line; text-align: center" align="center">"Your own personal internet archive" (网站存档 / 爬虫)
 curl -sSL 'https://get.archivebox.io' | sh
 </code></pre>

@@ -588,7 +588,8 @@ Each snapshot subfolder `./archive/<timestamp>/` includes a static `index.json`
 You can export the main index to browse it statically without needing to run a server.

-*Note about large exports: These exports are not paginated, exporting many URLs or the entire archive at once may be slow. Use the filtering CLI flags on the `archivebox list` command to export specific Snapshots or ranges.*
+> **Note**
+> These exports are not paginated, exporting many URLs or the entire archive at once may be slow. Use the filtering CLI flags on the `archivebox list` command to export specific Snapshots or ranges.

 ```bash
 # archivebox list --help
@@ -615,7 +616,7 @@ The paths in the static exports are relative, make sure to keep them next to you
 ### Archiving Private Content

-<a id="archiving-private-urls"/>
+<a id="archiving-private-urls"></a>

 If you're importing pages with private content or URLs containing secret tokens you don't want public (e.g Google Docs, paywalled content, unlisted videos, etc.), **you may want to disable some of the extractor methods to avoid leaking that content to 3rd party APIs or the public**.

@@ -985,6 +986,7 @@ archivebox init --setup
 <details><summary><i>Click to expand...</i></summary>

 Make sure to run this whenever you change things in `models.py`.
+
 ```bash
 cd archivebox/
 ./manage.py makemigrations
@@ -993,6 +995,7 @@ cd path/to/test/data/
 archivebox shell
 archivebox manage dbshell
 ```
+
 (uses `pytest -s`)
 https://stackoverflow.com/questions/1074212/how-can-i-see-the-raw-sql-queries-django-is-running

@@ -1000,7 +1003,9 @@
 #### Contributing a new extractor

-<details><summary><i>Click to expand...</i></summary><br/><br/>
+<details><summary><i>Click to expand...</i></summary>
+<br/><br/>

 ArchiveBox [`extractors`](https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/extractors/media.py) are external binaries or Python/Node scripts that ArchiveBox runs to archive content on a page.

SECURITY.md (new file)

@@ -0,0 +1,34 @@
+# Security Policy
+
+---
+
+## Security Information
+
+Please see this wiki page for important notices about ArchiveBox security, publishing your archives securely, and the dangers of executing archived JS:
+
+https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview
+
+Also see this section of the README about important caveats when running ArchiveBox:
+
+https://github.com/ArchiveBox/ArchiveBox?tab=readme-ov-file#caveats
+
+You can also read these pages for more information about ArchiveBox's internals, development environment, DB schema, and more:
+
+- https://github.com/ArchiveBox/ArchiveBox#archive-layout
+- https://github.com/ArchiveBox/ArchiveBox#archivebox-development
+- https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives
+- https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting
+
+---
+
+## Reporting a Vulnerability
+
+We use Github's built-in [Private Reporting](https://docs.github.com/en/code-security/security-advisories/guidance-on-reporting-and-writing-information-about-vulnerabilities/privately-reporting-a-security-vulnerability) feature to accept vulnerability reports.
+
+1. Go to the Security tab on our Github repo: https://github.com/ArchiveBox/ArchiveBox/security
+2. Click the ["Report a Vulnerability"](https://github.com/ArchiveBox/ArchiveBox/security/advisories/new) button
+3. Fill out the form to submit the details of the report and it will be securely sent to the maintainers
+
+You can also contact the maintainers via our public [Zulip Chat Server zulip.archivebox.io](https://zulip.archivebox.io) or [Twitter DMs @ArchiveBoxApp](https://twitter.com/ArchiveBoxApp).


@@ -90,8 +90,13 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'MEDIA_TIMEOUT': {'type': int, 'default': 3600},
         'OUTPUT_PERMISSIONS': {'type': str, 'default': '644'},
         'RESTRICT_FILE_NAMES': {'type': str, 'default': 'windows'},
-        'URL_BLACKLIST': {'type': str, 'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$'},  # to avoid downloading code assets as their own pages
-        'URL_WHITELIST': {'type': str, 'default': None},
+        'URL_DENYLIST': {'type': str, 'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$', 'aliases': ('URL_BLACKLIST',)},  # to avoid downloading code assets as their own pages
+        'URL_ALLOWLIST': {'type': str, 'default': None, 'aliases': ('URL_WHITELIST',)},
+
+        'ADMIN_USERNAME': {'type': str, 'default': None},
+        'ADMIN_PASSWORD': {'type': str, 'default': None},
+
         'ENFORCE_ATOMIC_WRITES': {'type': bool, 'default': True},
         'TAG_SEPARATOR_PATTERN': {'type': str, 'default': r'[,]'},
     },
@@ -143,6 +148,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'SAVE_GIT': {'type': bool, 'default': True, 'aliases': ('FETCH_GIT',)},
         'SAVE_MEDIA': {'type': bool, 'default': True, 'aliases': ('FETCH_MEDIA',)},
         'SAVE_ARCHIVE_DOT_ORG': {'type': bool, 'default': True, 'aliases': ('SUBMIT_ARCHIVE_DOT_ORG',)},
+        'SAVE_ALLOWLIST': {'type': dict, 'default': {},},
+        'SAVE_DENYLIST': {'type': dict, 'default': {},},
     },

     'ARCHIVE_METHOD_OPTIONS': {
@@ -231,12 +238,11 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'CURL_BINARY': {'type': str, 'default': 'curl'},
         'GIT_BINARY': {'type': str, 'default': 'git'},
-        'WGET_BINARY': {'type': str, 'default': 'wget'},
+        'WGET_BINARY': {'type': str, 'default': 'wget'},  # also can accept wget2
         'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')},
         'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')},
-        'MERCURY_BINARY': {'type': str, 'default': lambda c: bin_path('mercury-parser')},
-        #'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
-        'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'},
+        'MERCURY_BINARY': {'type': str, 'default': lambda c: bin_path('postlight-parser')},
+        'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'},  # also can accept youtube-dl
         'NODE_BINARY': {'type': str, 'default': 'node'},
         'RIPGREP_BINARY': {'type': str, 'default': 'rg'},
         'CHROME_BINARY': {'type': str, 'default': None},
@@ -374,6 +380,8 @@ def get_commit_hash(config):

 ############################## Derived Config ##################################

+ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE
+
 DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'TERM_WIDTH': {'default': lambda c: lambda: shutil.get_terminal_size((100, 10)).columns},
     'USER': {'default': lambda c: SYSTEM_USER},
@@ -390,8 +398,8 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'CONFIG_FILE': {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME},
     'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()},
     'CHROME_USER_DATA_DIR': {'default': lambda c: find_chrome_data_dir() if c['CHROME_USER_DATA_DIR'] is None else (Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None)},   # None means unset, so we autodetect it with find_chrome_Data_dir(), but emptystring '' means user manually set it to '', and we should store it as None
-    'URL_BLACKLIST_PTN': {'default': lambda c: c['URL_BLACKLIST'] and re.compile(c['URL_BLACKLIST'] or '', re.IGNORECASE | re.UNICODE | re.MULTILINE)},
-    'URL_WHITELIST_PTN': {'default': lambda c: c['URL_WHITELIST'] and re.compile(c['URL_WHITELIST'] or '', re.IGNORECASE | re.UNICODE | re.MULTILINE)},
+    'URL_DENYLIST_PTN': {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
+    'URL_ALLOWLIST_PTN': {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
     'DIR_OUTPUT_PERMISSIONS': {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')},
     'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0] or bin_path('archivebox')},
@@ -435,7 +443,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None},
     'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']},
-    'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None},  # mercury is unversioned
+    'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None},  # mercury doesnt expose version info until this is merged https://github.com/postlight/parser/pull/750
     'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
     'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
@@ -465,10 +473,11 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'EXTERNAL_LOCATIONS': {'default': lambda c: get_external_locations(c)},
     'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
     'CHROME_OPTIONS': {'default': lambda c: get_chrome_info(c)},
+    'SAVE_ALLOWLIST_PTN': {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_ALLOWLIST'].items()}},
+    'SAVE_DENYLIST_PTN': {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}},
 }

 ################################### Helpers ####################################
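To illustrate how the new allow/deny options are consumed (a minimal sketch, not ArchiveBox's actual code): `SAVE_ALLOWLIST`/`SAVE_DENYLIST` map regex strings to lists of extractor method names, and the derived `*_PTN` values are the same dicts with the keys pre-compiled using `ALLOWDENYLIST_REGEX_FLAGS`:

```python
import re

# mirrors ALLOWDENYLIST_REGEX_FLAGS from the diff above
FLAGS = re.IGNORECASE | re.UNICODE | re.MULTILINE

# hypothetical user config: regex pattern -> extractor methods to deny for matching URLs
SAVE_DENYLIST = {r'\.pdf$': ['media', 'singlefile']}

# same pre-compilation shape as SAVE_DENYLIST_PTN above
SAVE_DENYLIST_PTN = {re.compile(k, FLAGS): v for k, v in SAVE_DENYLIST.items()}

url = 'https://example.com/paper.pdf'
denied = {m for pat, methods in SAVE_DENYLIST_PTN.items() if pat.search(url) for m in methods}
print(denied)  # {'media', 'singlefile'}
```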


@@ -41,7 +41,7 @@ class ConfigDict(BaseConfig, total=False):
     MEDIA_TIMEOUT: int
     OUTPUT_PERMISSIONS: str
     RESTRICT_FILE_NAMES: str
-    URL_BLACKLIST: str
+    URL_DENYLIST: str
     SECRET_KEY: Optional[str]
     BIND_ADDR: str


@@ -41,7 +41,7 @@ class AddLinkForm(forms.Form):
     #     label="Exclude patterns",
     #     min_length='1',
     #     required=False,
-    #     initial=URL_BLACKLIST,
+    #     initial=URL_DENYLIST,
     # )
     # timeout = forms.IntegerField(
     #     initial=TIMEOUT,


@@ -6,9 +6,6 @@ import re
 import logging
 import tempfile

-import ldap
-from django_auth_ldap.config import LDAPSearch
-
 from pathlib import Path
 from django.utils.crypto import get_random_string
@@ -97,33 +94,43 @@ AUTHENTICATION_BACKENDS = [
 ]

 if LDAP:
-    global AUTH_LDAP_SERVER_URI
-    AUTH_LDAP_SERVER_URI = LDAP_SERVER_URI
-
-    global AUTH_LDAP_BIND_DN
-    AUTH_LDAP_BIND_DN = LDAP_BIND_DN
-
-    global AUTH_LDAP_BIND_PASSWORD
-    AUTH_LDAP_BIND_PASSWORD = LDAP_BIND_PASSWORD
-
-    global AUTH_LDAP_USER_SEARCH
-    AUTH_LDAP_USER_SEARCH = LDAPSearch(
-        LDAP_USER_BASE,
-        ldap.SCOPE_SUBTREE,
-        '(&(' + LDAP_USERNAME_ATTR + '=%(user)s)' + LDAP_USER_FILTER + ')',
-    )
-
-    global AUTH_LDAP_USER_ATTR_MAP
-    AUTH_LDAP_USER_ATTR_MAP = {
-        'username': LDAP_USERNAME_ATTR,
-        'first_name': LDAP_FIRSTNAME_ATTR,
-        'last_name': LDAP_LASTNAME_ATTR,
-        'email': LDAP_EMAIL_ATTR,
-    }
-
-    AUTHENTICATION_BACKENDS = [
-        'django_auth_ldap.backend.LDAPBackend',
-    ]
+    try:
+        import ldap
+        from django_auth_ldap.config import LDAPSearch
+
+        global AUTH_LDAP_SERVER_URI
+        global AUTH_LDAP_BIND_DN
+        global AUTH_LDAP_BIND_PASSWORD
+        global AUTH_LDAP_USER_SEARCH
+        global AUTH_LDAP_USER_ATTR_MAP
+
+        AUTH_LDAP_SERVER_URI = LDAP_SERVER_URI
+        AUTH_LDAP_BIND_DN = LDAP_BIND_DN
+        AUTH_LDAP_BIND_PASSWORD = LDAP_BIND_PASSWORD
+
+        assert AUTH_LDAP_SERVER_URI and LDAP_USERNAME_ATTR and LDAP_USER_FILTER, 'LDAP_* config options must all be set if LDAP=True'
+
+        AUTH_LDAP_USER_SEARCH = LDAPSearch(
+            LDAP_USER_BASE,
+            ldap.SCOPE_SUBTREE,
+            '(&(' + LDAP_USERNAME_ATTR + '=%(user)s)' + LDAP_USER_FILTER + ')',
+        )
+
+        AUTH_LDAP_USER_ATTR_MAP = {
+            'username': LDAP_USERNAME_ATTR,
+            'first_name': LDAP_FIRSTNAME_ATTR,
+            'last_name': LDAP_LASTNAME_ATTR,
+            'email': LDAP_EMAIL_ATTR,
+        }
+
+        AUTHENTICATION_BACKENDS = [
+            'django_auth_ldap.backend.LDAPBackend',
+        ]
+    except ModuleNotFoundError:
+        sys.stderr.write('[X] Error: Found LDAP=True config but LDAP packages not installed. You may need to run: pip install archivebox[ldap]\n\n')
+        # dont hard exit here. in case the user is just running "archivebox version" or "archivebox help", we still want those to work despite broken ldap
+        # sys.exit(1)

 ################################################################################
 ### Debug Settings
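The pattern above, importing an optional dependency inside a try/except and degrading gracefully, is a common way to keep optional extras like `archivebox[ldap]` from breaking unrelated commands. A minimal standalone sketch of the same idea (hypothetical names, not ArchiveBox's code):

```python
import sys

LDAP = True  # hypothetical feature flag, like ArchiveBox's LDAP config option

if LDAP:
    try:
        import ldap  # only imported when the feature is actually enabled
        HAS_LDAP = True
    except ModuleNotFoundError:
        # warn instead of exiting, so commands that never touch LDAP keep working
        sys.stderr.write('[X] LDAP=True but python-ldap is not installed\n')
        HAS_LDAP = False
```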


@@ -4,12 +4,16 @@ import os
 import sys
 from pathlib import Path

-from typing import Optional, List, Iterable, Union
+from typing import Callable, Optional, List, Iterable, Union
 from datetime import datetime, timezone
 from django.db.models import QuerySet

+from ..config import (
+    SAVE_ALLOWLIST_PTN,
+    SAVE_DENYLIST_PTN,
+)
 from ..core.settings import ERROR_LOG
-from ..index.schema import Link
+from ..index.schema import ArchiveResult, Link
 from ..index.sql import write_link_to_sql_index
 from ..index import (
     load_link_details,
@@ -43,7 +47,11 @@ from .archive_org import should_save_archive_dot_org, save_archive_dot_org
 from .headers import should_save_headers, save_headers

-def get_default_archive_methods():
+ShouldSaveFunction = Callable[[Link, Optional[Path], Optional[bool]], bool]
+SaveFunction = Callable[[Link, Optional[Path], int], ArchiveResult]
+ArchiveMethodEntry = tuple[str, ShouldSaveFunction, SaveFunction]
+
+def get_default_archive_methods() -> List[ArchiveMethodEntry]:
     return [
         ('favicon', should_save_favicon, save_favicon),
         ('headers', should_save_headers, save_headers),
@@ -71,12 +79,30 @@ ARCHIVE_METHODS_INDEXING_PRECEDENCE = [
     ('wget', 6)
 ]

 @enforce_types
-def ignore_methods(to_ignore: List[str]):
+def get_archive_methods_for_link(link: Link) -> Iterable[ArchiveMethodEntry]:
+    DEFAULT_METHODS = get_default_archive_methods()
+    allowed_methods = {
+        m for pat, methods in
+        SAVE_ALLOWLIST_PTN.items()
+        if pat.search(link.url)
+        for m in methods
+    } or { m[0] for m in DEFAULT_METHODS }
+    denied_methods = {
+        m for pat, methods in
+        SAVE_DENYLIST_PTN.items()
+        if pat.search(link.url)
+        for m in methods
+    }
+    allowed_methods -= denied_methods
+
+    return (m for m in DEFAULT_METHODS if m[0] in allowed_methods)
+
+@enforce_types
+def ignore_methods(to_ignore: List[str]) -> Iterable[str]:
     ARCHIVE_METHODS = get_default_archive_methods()
-    methods = filter(lambda x: x[0] not in to_ignore, ARCHIVE_METHODS)
-    methods = map(lambda x: x[0], methods)
-    return list(methods)
+    return [x[0] for x in ARCHIVE_METHODS if x[0] not in to_ignore]

 @enforce_types
 def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> Link:
@@ -89,11 +115,11 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> Link:
     except Snapshot.DoesNotExist:
         snapshot = write_link_to_sql_index(link)

-    ARCHIVE_METHODS = get_default_archive_methods()
+    active_methods = get_archive_methods_for_link(link)

     if methods:
-        ARCHIVE_METHODS = [
-            method for method in ARCHIVE_METHODS
+        active_methods = [
+            method for method in active_methods
             if method[0] in methods
         ]

@@ -110,7 +136,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> Link:
     stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
     start_ts = datetime.now(timezone.utc)

-    for method_name, should_run, method_function in ARCHIVE_METHODS:
+    for method_name, should_run, method_function in active_methods:
         try:
             if method_name not in link.history:
                 link.history[method_name] = []
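To make the allow/deny precedence concrete, here is a small worked example (a sketch with made-up patterns, not ArchiveBox's code) using the same set logic as `get_archive_methods_for_link` above: an allowlist match restricts the method set, no allowlist match leaves all methods enabled, and denylist matches are always subtracted last.

```python
import re

FLAGS = re.IGNORECASE | re.UNICODE | re.MULTILINE
DEFAULT_METHODS = ['favicon', 'headers', 'singlefile', 'media', 'wget']  # abbreviated

# hypothetical config: only run media+wget on youtube links, never run favicon anywhere
SAVE_ALLOWLIST_PTN = {re.compile(r'youtube\.com', FLAGS): ['media', 'wget']}
SAVE_DENYLIST_PTN = {re.compile(r'.*', FLAGS): ['favicon']}

def methods_for(url: str) -> list:
    allowed = {m for pat, ms in SAVE_ALLOWLIST_PTN.items() if pat.search(url) for m in ms} \
              or set(DEFAULT_METHODS)   # no allowlist match -> all methods allowed
    denied = {m for pat, ms in SAVE_DENYLIST_PTN.items() if pat.search(url) for m in ms}
    return [m for m in DEFAULT_METHODS if m in (allowed - denied)]

print(methods_for('https://youtube.com/watch?v=abc'))  # ['media', 'wget']
print(methods_for('https://example.com'))              # ['headers', 'singlefile', 'media', 'wget']
```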


@@ -71,7 +71,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
         result = run(cmd, cwd=out_dir, timeout=timeout)
         try:
             result_json = json.loads(result.stdout)
-            assert result_json and 'content' in result_json
+            assert result_json and 'content' in result_json, 'Readability output is not valid JSON'
         except json.JSONDecodeError:
             raise ArchiveError('Readability was not able to archive the page', result.stdout + result.stderr)
@@ -85,7 +85,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
         # "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
         output_tail = [
             line.strip()
-            for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
+            for line in (result.stdout + result.stderr).decode().rsplit('\n', 5)[-5:]
             if line.strip()
         ]
         hints = (
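The added assert message distinguishes "valid JSON but missing `content`" from "not JSON at all". A standalone sketch of the same two-layer check (hypothetical helper name, not ArchiveBox's API):

```python
import json

def parse_extractor_output(stdout: bytes) -> dict:
    """Sketch: JSON errors and missing-key errors produce distinct, actionable messages."""
    try:
        result = json.loads(stdout)
        # note: a failing assert raises AssertionError, which deliberately isn't caught here
        assert result and 'content' in result, 'output is valid JSON but has no "content" key'
    except json.JSONDecodeError as err:
        raise ValueError(f'extractor did not emit JSON: {err}')
    return result

print(parse_extractor_output(b'{"content": "<p>hi</p>"}')['content'])
```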


@@ -22,8 +22,8 @@ from ..config import (
     JSON_INDEX_FILENAME,
     OUTPUT_DIR,
     TIMEOUT,
-    URL_BLACKLIST_PTN,
-    URL_WHITELIST_PTN,
+    URL_DENYLIST_PTN,
+    URL_ALLOWLIST_PTN,
     stderr,
     OUTPUT_PERMISSIONS
 )
@@ -142,9 +142,9 @@ def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
             continue
         if scheme(link.url) not in ('http', 'https', 'ftp'):
             continue
-        if URL_BLACKLIST_PTN and URL_BLACKLIST_PTN.search(link.url):
+        if URL_DENYLIST_PTN and URL_DENYLIST_PTN.search(link.url):
             continue
-        if URL_WHITELIST_PTN and (not URL_WHITELIST_PTN.search(link.url)):
+        if URL_ALLOWLIST_PTN and (not URL_ALLOWLIST_PTN.search(link.url)):
             continue

         yield link
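As a quick illustration of how those two URL filters compose (a sketch with made-up patterns, not ArchiveBox's code): a URL must *not* match the denylist, and if an allowlist is set it *must* match that too.

```python
import re

URL_DENYLIST_PTN = re.compile(r'\.css$', re.IGNORECASE)
URL_ALLOWLIST_PTN = re.compile(r'example\.com', re.IGNORECASE)

def is_archivable(url: str) -> bool:
    if URL_DENYLIST_PTN and URL_DENYLIST_PTN.search(url):
        return False                      # denylist always wins
    if URL_ALLOWLIST_PTN and not URL_ALLOWLIST_PTN.search(url):
        return False                      # allowlist, when set, must match
    return True

print(is_archivable('https://example.com/page'))       # True
print(is_archivable('https://example.com/style.css'))  # False (denied)
print(is_archivable('https://other.org/page'))         # False (not allowlisted)
```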


@@ -533,11 +533,27 @@ def log_shell_welcome_msg():
 ### Helpers

 @enforce_types
-def pretty_path(path: Union[Path, str]) -> str:
+def pretty_path(path: Union[Path, str], pwd: Union[Path, str]=OUTPUT_DIR) -> str:
     """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
-    pwd = Path('.').resolve()
-    # parent = os.path.abspath(os.path.join(pwd, os.path.pardir))
-    return str(path).replace(str(pwd) + '/', './')
+    pwd = str(Path(pwd))  # .resolve()
+    path = str(path)
+
+    if not path:
+        return path
+
+    # replace long absolute paths with ./ relative ones to save on terminal output width
+    if path.startswith(pwd) and (pwd != '/'):
+        path = path.replace(pwd, '.', 1)
+
+    # quote paths containing spaces
+    if ' ' in path:
+        path = f'"{path}"'
+
+    # if path is just a plain dot, replace it back with the absolute path for clarity
+    if path == '.':
+        path = pwd
+
+    return path

 @enforce_types
@@ -578,6 +594,7 @@ def printable_folder_status(name: str, folder: Dict) -> str:
     else:
         color, symbol, note, num_files = 'lightyellow', '-', 'disabled', '-'

+
     if folder['path']:
         if Path(folder['path']).exists():
             num_files = (
@@ -592,13 +609,7 @@ def printable_folder_status(name: str, folder: Dict) -> str:
             # add symbol @ next to filecount if path is a remote filesystem mount
             num_files = f'{num_files} @' if num_files else '@'

-    path = str(folder['path']).replace(str(OUTPUT_DIR), '.') if folder['path'] else ''
-    if path and ' ' in path:
-        path = f'"{path}"'
-
-    # if path is just a plain dot, replace it back with the full path for clarity
-    if path == '.':
-        path = str(OUTPUT_DIR)
+    path = pretty_path(folder['path'])

     return ' '.join((
         ANSI[color],
@@ -629,9 +640,7 @@ def printable_dependency_version(name: str, dependency: Dict) -> str:
     else:
         color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'

-    path = str(dependency["path"]).replace(str(OUTPUT_DIR), '.') if dependency["path"] else ''
-    if path and ' ' in path:
-        path = f'"{path}"'
+    path = pretty_path(dependency['path'])

     return ' '.join((
         ANSI[color],
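Tracing the reworked `pretty_path()` logic, these are the outputs one would expect (illustrative values assuming `pwd='/data'`, not taken from the diff):

```python
pretty_path('/data/archive/169', pwd='/data')  # -> './archive/169'
pretty_path('/data', pwd='/data')              # -> '/data' ('.' is expanded back for clarity)
pretty_path('/data/my dir/x', pwd='/data')     # -> '"./my dir/x"' (quoted because of the space)
pretty_path('', pwd='/data')                   # -> '' (empty paths pass through unchanged)
```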


@@ -112,6 +112,8 @@ from .config import (
     load_all_config,
     CONFIG,
     USER_CONFIG,
+    ADMIN_USERNAME,
+    ADMIN_PASSWORD,
     get_real_name,
     setup_django,
 )
@@ -216,7 +218,7 @@ def version(quiet: bool=False,
     if not quiet:
         # 0.6.3
         # ArchiveBox v0.6.3 Cpython Linux Linux-4.19.121-linuxkit-x86_64-with-glibc2.28 x86_64 (in Docker) (in TTY)
-        # DEBUG=False IN_DOCKER=True IS_TTY=True TZ=UTC FS_ATOMIC=True FS_REMOTE=False FS_PERMS=644 501:20 SEARCH_BACKEND=ripgrep
+        # DEBUG=False IN_DOCKER=True IS_TTY=True TZ=UTC FS_ATOMIC=True FS_REMOTE=False FS_PERMS=644 FS_USER=501:20 SEARCH_BACKEND=ripgrep

         p = platform.uname()
         print(
@@ -236,7 +238,8 @@ def version(quiet: bool=False,
             #f'DB=django.db.backends.sqlite3 (({CONFIG["SQLITE_JOURNAL_MODE"]})',  # add this if we have more useful info to show eventually
             f'FS_ATOMIC={ENFORCE_ATOMIC_WRITES}',
             f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
-            f'FS_PERMS={OUTPUT_PERMISSIONS} {PUID}:{PGID}',
+            f'FS_USER={PUID}:{PGID}',
+            f'FS_PERMS={OUTPUT_PERMISSIONS}',
             f'SEARCH_BACKEND={SEARCH_BACKEND_ENGINE}',
         )
         print()
@@ -251,19 +254,19 @@ def version(quiet: bool=False,
         print()
         print('{white}[i] Source-code locations:{reset}'.format(**ANSI))
-        for name, folder in CODE_LOCATIONS.items():
-            print(printable_folder_status(name, folder))
+        for name, path in CODE_LOCATIONS.items():
+            print(printable_folder_status(name, path))

         print()
         print('{white}[i] Secrets locations:{reset}'.format(**ANSI))
-        for name, folder in EXTERNAL_LOCATIONS.items():
-            print(printable_folder_status(name, folder))
+        for name, path in EXTERNAL_LOCATIONS.items():
+            print(printable_folder_status(name, path))

         print()
         if DATA_LOCATIONS['OUTPUT_DIR']['is_valid']:
             print('{white}[i] Data locations:{reset}'.format(**ANSI))
-            for name, folder in DATA_LOCATIONS.items():
-                print(printable_folder_status(name, folder))
+            for name, path in DATA_LOCATIONS.items():
+                print(printable_folder_status(name, path))
         else:
             print()
             print('{white}[i] Data locations:{reset}'.format(**ANSI))
@@ -419,14 +422,16 @@ def init(force: bool=False, quick: bool=False, setup: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
         write_main_index(list(pending_links.values()), out_dir=out_dir)

     print('\n{green}----------------------------------------------------------------------{reset}'.format(**ANSI))

+    from django.contrib.auth.models import User
+    if (ADMIN_USERNAME and ADMIN_PASSWORD) and not User.objects.filter(username=ADMIN_USERNAME).exists():
+        print('{green}[+] Found ADMIN_USERNAME and ADMIN_PASSWORD configuration options, creating new admin user.{reset}'.format(**ANSI))
+        User.objects.create_superuser(username=ADMIN_USERNAME, password=ADMIN_PASSWORD)
+
     if existing_index:
         print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI))
     else:
-        # TODO: allow creating new supersuer via env vars on first init
-        # if config.HTTP_USER and config.HTTP_PASS:
-        #     from django.contrib.auth.models import User
-        #     User.objects.create_superuser(HTTP_USER, '', HTTP_PASS)
         print('{green}[√] Done. A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links) + len(pending_links), **ANSI))

     json_index = out_dir / JSON_INDEX_FILENAME
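The new first-run admin bootstrap is idempotent: it only creates the user when both options are set and the username doesn't already exist. A rough standalone sketch (hypothetical helper, not ArchiveBox's API) of why this guard makes re-running `archivebox init` safe:

```python
from typing import Optional

def ensure_superuser(User, username: Optional[str], password: Optional[str]) -> bool:
    """Create a Django superuser once; safe to call on every init."""
    if not (username and password):
        return False   # feature is off unless both options are set
    if User.objects.filter(username=username).exists():
        return False   # already bootstrapped, nothing to do
    User.objects.create_superuser(username=username, password=password)
    return True
```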


@@ -1,62 +1,3 @@
-{% extends "base.html" %}
-{% load static %}
-
-{% block body %}
-<div id="toolbar">
-    <form id="changelist-search" action="{% url 'public-index' %}" method="get">
-        <div>
-            <label for="searchbar"><img src="/static/admin/img/search.svg" alt="Search"></label>
-            <input type="text" size="40" name="q" value="" id="searchbar" autofocus placeholder="Title, URL, tags, timestamp, or content...".>
-            <input type="submit" value="Search" style="height: 36px; padding-top: 6px; margin: 8px"/>
-            <input type="button"
-                   value="♺"
-                   title="Refresh..."
-                   onclick="location.href='{% url 'public-index' %}'"
-                   style="background-color: rgba(121, 174, 200, 0.8); height: 30px; font-size: 0.8em; margin-top: 12px; padding-top: 6px; float:right">
-            </input>
-        </div>
-    </form>
-</div>
-<table id="table-bookmarks">
-    <thead>
-        <tr>
-            <th style="width: 100px;">Bookmarked</th>
-            <th style="width: 26vw;">Snapshot ({{object_list|length}})</th>
-            <th style="width: 140px">Files</th>
-            <th style="width: 16vw;whitespace:nowrap;overflow-x:hidden;">Original URL</th>
-        </tr>
-    </thead>
-    <tbody>
-        {% for link in object_list %}
-            {% include 'main_index_row.html' with link=link %}
-        {% endfor %}
-    </tbody>
-</table>
-<center>
-    <span class="step-links">
-        {% if page_obj.has_previous %}
-            <a href="{% url 'public-index' %}?page=1">&laquo; first</a>
-            <a href="{% url 'public-index' %}?page={{ page_obj.previous_page_number }}">previous</a>
-        {% endif %}
-
-        <span class="current">
-            Page {{ page_obj.number }} of {{ page_obj.paginator.num_pages }}.
-        </span>
-
-        {% if page_obj.has_next %}
-            <a href="{% url 'public-index' %}?page={{ page_obj.next_page_number }}">next </a>
-            <a href="{% url 'public-index' %}?page={{ page_obj.paginator.num_pages }}">last &raquo;</a>
-        {% endif %}
-    </span>
-
-    {% if page_obj.has_next %}
-        <a href="{% url 'public-index' %}?page={{ page_obj.next_page_number }}">next </a>
-        <a href="{% url 'public-index' %}?page={{ page_obj.paginator.num_pages }}">last &raquo;</a>
-    {% endif %}
-    </span>
-    <br>
-</center>
-{% endblock %}
+{% extends "admin/base_site.html" %}
+{% load i18n admin_urls static admin_list %}
+{% load core_tags %}


@@ -33,7 +33,7 @@
             <br/>
             <div class="loader"></div>
             <br/>
-            Check the server log or the <a href="/admin/core/archiveresult/?o=-1">Log</a> page for progress...
+            Check the server log or the <a href="/admin/core/archiveresult/?o=-1">Log</a> page for detailed progress...
         </center>
     </div>
     <form id="add-form" method="POST" class="p-form">{% csrf_token %}
@@ -46,19 +46,22 @@
     </form>
     <br/><br/><br/>
     <center id="delay-warning" style="display: none">
-        <small>(it's safe to leave this page, adding will continue in the background)</small>
+        <small>(you will be redirected to your <a href="/">Snapshot list</a> momentarily, its safe to close this page at any time)</small>
     </center>
     {% if absolute_add_path %}
-        <center id="bookmarklet">
+        <!-- <center id="bookmarklet">
             <p>Bookmark this link to quickly add to your archive:
             <a href="javascript:void(window.open('{{ absolute_add_path }}?url='+encodeURIComponent(document.location.href)));">Add to ArchiveBox</a></p>
-        </center>
+        </center> -->
     {% endif %}
     <script>
         document.getElementById('add-form').addEventListener('submit', function(event) {
             document.getElementById('in-progress').style.display = 'block'
             document.getElementById('add-form').style.display = 'none'
             document.getElementById('delay-warning').style.display = 'block'
+            setTimeout(function() {
+                window.location = '/'
+            }, 2000)
             return true
         })
     </script>

bin/build_dev.sh (new executable file)

@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+# ./bin/build_docker.sh dev 'linux/arm/v7'
+
+### Bash Environment Setup
+# http://redsymbol.net/articles/unofficial-bash-strict-mode/
+# https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html
+# set -o xtrace
+set -o errexit
+set -o errtrace
+set -o nounset
+set -o pipefail
+IFS=$'\n'
+
+REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )"
+cd "$REPO_DIR"
+
+which docker > /dev/null || exit 1
+
+TAG_NAME="${1:-$(git rev-parse --abbrev-ref HEAD)}"
+VERSION="$(jq -r '.version' < "$REPO_DIR/package.json")"
+SHORT_VERSION="$(echo "$VERSION" | perl -pe 's/(\d+)\.(\d+)\.(\d+)/$1.$2/g')"
+REQUIRED_PLATFORMS="${2:-"linux/arm64,linux/amd64,linux/arm/v7"}"
+
+echo "[+] Building Docker image: tag=$TAG_NAME version=$SHORT_VERSION arch=$REQUIRED_PLATFORMS"
+
+echo "[+] Building archivebox:$VERSION docker image..."
+# docker builder prune
+docker build . --no-cache -t archivebox-dev --load
+
+# docker buildx build --platform "$REQUIRED_PLATFORMS" --load . \
+#     -t archivebox \
+#     -t archivebox:$TAG_NAME \
+#     -t archivebox:$VERSION \
+#     -t archivebox:$SHORT_VERSION


@@ -1,4 +1,5 @@
 #!/usr/bin/env bash
+# ./bin/build_docker.sh dev 'linux/arm/v7'

 ### Bash Environment Setup
 # http://redsymbol.net/articles/unofficial-bash-strict-mode/
@@ -15,10 +16,12 @@ cd "$REPO_DIR"

 which docker > /dev/null || exit 1

-TAG_NAME="dev"
+TAG_NAME="${1:-$(git rev-parse --abbrev-ref HEAD)}"
 VERSION="$(jq -r '.version' < "$REPO_DIR/package.json")"
 SHORT_VERSION="$(echo "$VERSION" | perl -pe 's/(\d+)\.(\d+)\.(\d+)/$1.$2/g')"
-REQUIRED_PLATFORMS=('linux/arm64','linux/amd64','linux/arm/v8','linux/arm/v7')
+REQUIRED_PLATFORMS="${2:-"linux/arm64,linux/amd64,linux/arm/v7"}"
+
+echo "[+] Building Docker image: tag=$TAG_NAME version=$SHORT_VERSION arch=$REQUIRED_PLATFORMS"

 function check_platforms() {
     INSTALLED_PLATFORMS="$(docker buildx inspect | grep 'Platforms:' )"
@@ -33,44 +36,44 @@ function check_platforms() {
     return 0
 }

+function remove_builder() {
+    # remove existing xbuilder
+    docker buildx stop xbuilder || true
+    docker buildx rm xbuilder || true
+}
+
 function create_builder() {
+    docker buildx use xbuilder && return 0
     echo "[+] Creating new xbuilder for: $REQUIRED_PLATFORMS"
     echo

     # Switch to buildx builder if already present / previously created
     docker buildx create --name xbuilder --driver docker-container --bootstrap --use --platform "$REQUIRED_PLATFORMS" || true
     docker buildx inspect --bootstrap || true
-    echo
 }

 function recreate_builder() {
     # Install QEMU binaries for cross-platform building if not installed
     docker run --privileged --rm 'tonistiigi/binfmt' --install all

-    # remove existing xbuilder
-    docker buildx stop xbuilder || true
-    docker buildx rm xbuilder || true
-
-    # Create Docker builder for cross-platform building
-    docker buildx use xbuilder && return 0
+    remove_builder
     create_builder
 }

 # Check if docker is ready for cross-plaform builds, if not, recreate builder
-docker buildx use xbuilder || create_builder
+docker buildx use xbuilder 2>&1 >/dev/null || create_builder
 check_platforms || (recreate_builder && check_platforms) || exit 1

 echo "[+] Building archivebox:$VERSION docker image..."
-#docker build . \
-docker buildx build --platform "$REQUIRED_PLATFORMS" --push . \
+# docker builder prune
+# docker build . --no-cache -t archivebox-dev \
+docker buildx build --platform "$REQUIRED_PLATFORMS" --load . \
     -t archivebox \
     -t archivebox:$TAG_NAME \
     -t archivebox:$VERSION \
     -t archivebox:$SHORT_VERSION \
-    -t archivebox:latest \
     -t docker.io/nikisweeting/archivebox:$TAG_NAME \
     -t docker.io/nikisweeting/archivebox:$VERSION \
     -t docker.io/nikisweeting/archivebox:$SHORT_VERSION \


@@ -25,7 +25,10 @@ cd "$REPO_DIR"
 rm -Rf build dist

 echo "[+] Building sdist, bdist_wheel, and egg_info"
-python3 setup.py \
-    sdist --dist-dir=./pip_dist \
-    bdist_wheel --dist-dir=./pip_dist \
-    egg_info --egg-base=./pip_dist
+# python3 setup.py \
+#     sdist --dist-dir=./pip_dist \
+#     bdist_wheel --dist-dir=./pip_dist \
+#     egg_info --egg-base=./pip_dist
+# pip install --upgrade pip setuptools build
+python -m build


@@ -12,22 +12,26 @@ if [[ -n "$PGID" && "$PGID" != 0 ]]; then
     groupmod -g "$PGID" "$ARCHIVEBOX_USER" > /dev/null 2>&1
 fi

+export PUID="$(id -u archivebox)"
+export PGID="$(id -g archivebox)"
+
-# Set the permissions of the data dir to match the archivebox user
+# Check the permissions of the data dir (or create if it doesn't exist)
 if [[ -d "$DATA_DIR/archive" ]]; then
-    # check data directory permissions
-    if [[ ! "$(stat -c %u $DATA_DIR/archive)" = "$(id -u archivebox)" ]]; then
-        echo "Change in ownership detected, please be patient while we chown existing files"
-        echo "This could take some time..."
-        chown $ARCHIVEBOX_USER:$ARCHIVEBOX_USER -R "$DATA_DIR"
+    if touch "$DATA_DIR/archive/.permissions_test_safe_to_delete"; then
+        # It's fine, we are able to write to the data directory
+        rm "$DATA_DIR/archive/.permissions_test_safe_to_delete"
+        # echo "[√] Permissions are correct"
+    else
+        echo "[X] Permissions Error: ArchiveBox is not able to write to your data dir. You need to fix the data dir ownership and retry:" >2
+        echo "    chown -R $PUID:$PGID data" >2
+        echo "    https://docs.linuxserver.io/general/understanding-puid-and-pgid" >2
+        exit 1
     fi
 else
     # create data directory
     mkdir -p "$DATA_DIR/logs"
-    chown -R $ARCHIVEBOX_USER:$ARCHIVEBOX_USER "$DATA_DIR"
 fi
-chown $ARCHIVEBOX_USER:$ARCHIVEBOX_USER "$DATA_DIR"
+chown $ARCHIVEBOX_USER:$ARCHIVEBOX_USER "$DATA_DIR" "$DATA_DIR"/*

 # Drop permissions to run commands as the archivebox user
 if [[ "$1" == /* || "$1" == "echo" || "$1" == "archivebox" ]]; then


@@ -34,6 +34,8 @@ services:
         # - PUBLIC_ADD_VIEW=False      # set to True to allow anonymous users to submit new URLs to archive
         # - PUID=1000                  # set to your host user's UID & GID if you encounter permissions issues
         # - PGID=1000
+        # - ADMIN_USERNAME=admin       # create an admin user on first run with the given user/pass combo
+        # - ADMIN_PASSWORD=SomeSecretPassword
         # - SEARCH_BACKEND_ENGINE=sonic # uncomment these and sonic container below for better full-text search
         # - SEARCH_BACKEND_HOST_NAME=sonic
         # - SEARCH_BACKEND_PASSWORD=SomeSecretPassword

package-lock.json (generated)
File diff suppressed because it is too large.


@@ -1,14 +1,13 @@
 {
     "name": "archivebox",
-    "version": "0.6.3",
+    "version": "0.7.0",
     "description": "ArchiveBox: The self-hosted internet archive",
     "author": "Nick Sweeting <archivebox-npm@sweeting.me>",
     "repository": "github:ArchiveBox/ArchiveBox",
     "license": "MIT",
     "dependencies": {
-        "@postlight/mercury-parser": "git+https://github.com/postlight/mercury-parser.git",
-        "playwright": "^1.37.1",
+        "@postlight/parser": "^2.2.3",
         "readability-extractor": "git+https://github.com/ArchiveBox/readability-extractor.git",
-        "single-file-cli": "^1.0.63"
+        "single-file-cli": "^1.1.12"
     }
 }

pdm.lock (new file)
File diff suppressed because it is too large.

pyproject.toml (new file)

@ -0,0 +1,121 @@
[project]
name = "archivebox"
version = "0.7.0"
description = "Self-hosted internet archiving solution."
authors = [
{name = "Nick Sweeting", email = "setup.py@archivebox.io"},
]
dependencies = [
"setuptools>=68.2.2",
"croniter>=0.3.34",
"dateparser>=1.0.0",
"django-extensions>=3.0.3",
"django>=3.1.3,<3.2",
"ipython>5.0.0",
"mypy-extensions>=0.4.3",
"python-crontab>=2.5.1",
"requests>=2.24.0",
"w3lib>=1.22.0",
# "youtube-dl>=2021.04.17",
"yt-dlp>=2021.4.11",
"playwright>=1.39.0",
]
requires-python = ">=3.9"
readme = "README.md"
license = {text = "MIT"}
classifiers = [
"Development Status :: 4 - Beta",
"Environment :: Console",
"Environment :: Web Environment",
"Framework :: Django",
"Intended Audience :: Developers",
"Intended Audience :: Education",
"Intended Audience :: End Users/Desktop",
"Intended Audience :: Information Technology",
"Intended Audience :: Legal Industry",
"Intended Audience :: System Administrators",
"License :: OSI Approved :: MIT License",
"Natural Language :: English",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Topic :: Internet :: WWW/HTTP",
"Topic :: Internet :: WWW/HTTP :: Indexing/Search",
"Topic :: Internet :: WWW/HTTP :: WSGI :: Application",
"Topic :: Sociology :: History",
"Topic :: Software Development :: Libraries :: Python Modules",
"Topic :: System :: Archiving",
"Topic :: System :: Archiving :: Backup",
"Topic :: System :: Recovery Tools",
"Topic :: Utilities",
"Typing :: Typed",
]
# pdm lock -G:all
# pdm install -G:all
[tool.pdm.dev-dependencies]
build = [
"pdm",
"bottle",
"setuptools",
"stdeb",
"twine",
"wheel",
]
lint = [
"flake8",
"mypy",
"django-stubs",
]
test = [
"pytest",
]
debug = [
"django-debug-toolbar",
"djdt_flamegraph",
"ipdb",
]
doc = [
"recommonmark",
"sphinx",
"sphinx-rtd-theme",
]
[project.optional-dependencies]
sonic = [
# echo "deb [signed-by=/usr/share/keyrings/valeriansaliou_sonic.gpg] https://packagecloud.io/valeriansaliou/sonic/debian/ bookworm main" > /etc/apt/sources.list.d/valeriansaliou_sonic.list
# curl -fsSL https://packagecloud.io/valeriansaliou/sonic/gpgkey | gpg --dearmor -o /usr/share/keyrings/valeriansaliou_sonic.gpg
"sonic-client>=0.0.5",
]
ldap = [
# apt install libldap2-dev libsasl2-dev
"django-auth-ldap>=4.1.0",
]
[project.scripts]
archivebox = "archivebox.cli:main"
[tool.pdm.scripts]
lint = "./bin/lint.sh"
test = "./bin/test.sh"
# all = {composite = ["lint mypackage/", "test -v tests/"]}
[build-system]
requires = ["pdm-backend"]
build-backend = "pdm.backend"
[project.urls]
Homepage = "https://github.com/ArchiveBox/ArchiveBox"
Source = "https://github.com/ArchiveBox/ArchiveBox"
Documentation = "https://github.com/ArchiveBox/ArchiveBox/wiki"
"Bug Tracker" = "https://github.com/ArchiveBox/ArchiveBox/issues"
Changelog = "https://github.com/ArchiveBox/ArchiveBox/releases"
Roadmap = "https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap"
Community = "https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community"
Demo = "https://demo.archivebox.io"
Donate = "https://github.com/ArchiveBox/ArchiveBox/wiki/Donations"
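The [project.scripts] entry above is what generates the archivebox console command on install. As a small sketch, this is how such an entry point resolves at runtime using only the standard library (the group= selection API requires Python 3.10+):

from importlib.metadata import entry_points

(script,) = [ep for ep in entry_points(group='console_scripts') if ep.name == 'archivebox']
main = script.load()    # equivalent to: from archivebox.cli import main
main()                  # runs the same code path as the installed `archivebox` command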

setup.py (266 lines changed)

@@ -1,146 +1,150 @@
-import json
-import setuptools
-from setuptools.command.test import test
-
-from pathlib import Path
-
-PKG_NAME = "archivebox"
-DESCRIPTION = "Self-hosted internet archiving solution."
-LICENSE = "MIT"
-AUTHOR = "Nick Sweeting"
-AUTHOR_EMAIL="git@nicksweeting.com"
-REPO_URL = "https://github.com/ArchiveBox/ArchiveBox"
-PROJECT_URLS = {
-    "Source":        f"{REPO_URL}",
-    "Documentation": f"{REPO_URL}/wiki",
-    "Bug Tracker":   f"{REPO_URL}/issues",
-    "Changelog":     f"{REPO_URL}/releases",
-    "Roadmap":       f"{REPO_URL}/wiki/Roadmap",
-    "Community":     f"{REPO_URL}/wiki/Web-Archiving-Community",
-    "Demo":          f"https://demo.archivebox.io",
-    "Donate":        f"{REPO_URL}/wiki/Donations",
-}
-
-ROOT_DIR = Path(__file__).parent.resolve()
-PACKAGE_DIR = ROOT_DIR / PKG_NAME
-
-README = (PACKAGE_DIR / "README.md").read_text(encoding='utf-8', errors='ignore')
-VERSION = json.loads((PACKAGE_DIR / "package.json").read_text().strip())['version']
-
-PYTHON_REQUIRES = ">=3.7"
-SETUP_REQUIRES = ["wheel"]
-INSTALL_REQUIRES = [
-    # only add things here that have corresponding apt python3-packages available
-    # anything added here also needs to be added to our package dependencies in
-    # stdeb.cfg (apt), archivebox.rb (brew), Dockerfile, etc.
-    # if there is no apt python3-package equivalent, then vendor it instead in
-    # ./archivebox/vendor/
-    "requests>=2.24.0",
-    "mypy-extensions>=0.4.3",
-    "django>=3.1.3,<3.2",
-    "django-extensions>=3.0.3",
-    "dateparser>=1.0.0",
-    "youtube-dl>=2021.04.17",
-    "yt-dlp>=2021.4.11",
-    "python-crontab>=2.5.1",
-    "croniter>=0.3.34",
-    "w3lib>=1.22.0",
-    "ipython>5.0.0",
-    "django-auth-ldap>=4.1.0"
-]
-EXTRAS_REQUIRE = {
-    'sonic': [
-        "sonic-client>=0.0.5",
-    ],
-    'dev': [
-        "setuptools",
-        "twine",
-        "wheel",
-        "flake8",
-        "ipdb",
-        "mypy",
-        "django-stubs",
-        "sphinx",
-        "sphinx-rtd-theme",
-        "recommonmark",
-        "pytest",
-        "bottle",
-        "stdeb",
-        "django-debug-toolbar",
-        "djdt_flamegraph",
-    ],
-}
-
+#####################################################################################
+# THIS FILE IS DEPRECATED AND WILL BE REMOVED EVENTUALLY
+# ALL FUTURE CHANGES SHOULD HAPPEN IN pyproject.toml with pdm
+#####################################################################################
+
+# import json
+# import setuptools
+# from setuptools.command.test import test
+# from pathlib import Path
+
+# PKG_NAME = "archivebox"
+# DESCRIPTION = "Self-hosted internet archiving solution."
+# LICENSE = "MIT"
+# AUTHOR = "Nick Sweeting"
+# AUTHOR_EMAIL="setup.py@archivebox.io"
+# REPO_URL = "https://github.com/ArchiveBox/ArchiveBox"
+# PROJECT_URLS = {
+#     "Source":        f"{REPO_URL}",
+#     "Documentation": f"{REPO_URL}/wiki",
+#     "Bug Tracker":   f"{REPO_URL}/issues",
+#     "Changelog":     f"{REPO_URL}/releases",
+#     "Roadmap":       f"{REPO_URL}/wiki/Roadmap",
+#     "Community":     f"{REPO_URL}/wiki/Web-Archiving-Community",
+#     "Demo":          f"https://demo.archivebox.io",
+#     "Donate":        f"{REPO_URL}/wiki/Donations",
+# }
+
+# ROOT_DIR = Path(__file__).parent.resolve()
+# PACKAGE_DIR = ROOT_DIR / PKG_NAME
+
+# README = (PACKAGE_DIR / "README.md").read_text(encoding='utf-8', errors='ignore')
+# VERSION = json.loads((PACKAGE_DIR / "package.json").read_text().strip())['version']
+
+# class DisabledTestCommand(test):
+#     def run(self):
+#         # setup.py test is deprecated, disable it here by force so stdeb doesnt run it
+#         print('\n[X] Running tests via setup.py test is deprecated.')
+#         print('    Hint: Use the ./bin/test.sh script or pytest instead')
+
 # To see when setup.py gets called (uncomment for debugging):
 # import sys
 # print(PACKAGE_DIR, f" (v{VERSION})")
 # print('>', sys.executable, *sys.argv)

-
-class DisabledTestCommand(test):
-    def run(self):
-        # setup.py test is deprecated, disable it here by force so stdeb doesnt run it
-        print()
-        print('[X] Running tests via setup.py test is deprecated.')
-        print('    Hint: Use the ./bin/test.sh script or pytest instead')
-
-
-setuptools.setup(
-    name=PKG_NAME,
-    version=VERSION,
-    license=LICENSE,
-    author=AUTHOR,
-    author_email=AUTHOR_EMAIL,
-    description=DESCRIPTION,
-    long_description=README,
-    long_description_content_type="text/markdown",
-    url=REPO_URL,
-    project_urls=PROJECT_URLS,
-    python_requires=PYTHON_REQUIRES,
-    setup_requires=SETUP_REQUIRES,
-    install_requires=INSTALL_REQUIRES,
-    extras_require=EXTRAS_REQUIRE,
-    packages=[PKG_NAME],
-    include_package_data=True,   # see MANIFEST.in
-    entry_points={
-        "console_scripts": [
-            f"{PKG_NAME} = {PKG_NAME}.cli:main",
-        ],
-    },
-    classifiers=[
-        "License :: OSI Approved :: MIT License",
-        "Natural Language :: English",
-        "Operating System :: OS Independent",
-        "Development Status :: 4 - Beta",
-        "Topic :: Utilities",
-        "Topic :: System :: Archiving",
-        "Topic :: System :: Archiving :: Backup",
-        "Topic :: System :: Recovery Tools",
-        "Topic :: Sociology :: History",
-        "Topic :: Internet :: WWW/HTTP",
-        "Topic :: Internet :: WWW/HTTP :: Indexing/Search",
-        "Topic :: Internet :: WWW/HTTP :: WSGI :: Application",
-        "Topic :: Software Development :: Libraries :: Python Modules",
-        "Intended Audience :: Developers",
-        "Intended Audience :: Education",
-        "Intended Audience :: End Users/Desktop",
-        "Intended Audience :: Information Technology",
-        "Intended Audience :: Legal Industry",
-        "Intended Audience :: System Administrators",
-        "Environment :: Console",
-        "Environment :: Web Environment",
-        "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.7",
-        "Programming Language :: Python :: 3.8",
-        "Programming Language :: Python :: 3.9",
-        "Framework :: Django",
-        "Typing :: Typed",
-    ],
-    cmdclass={
-        "test": DisabledTestCommand,
-    },
-)
+# PYTHON_REQUIRES = ">=3.9"
+# SETUP_REQUIRES = ["wheel"]
+# INSTALL_REQUIRES = [
+#     # only add things here that have corresponding apt python3-packages available
+#     # anything added here also needs to be added to our package dependencies in
+#     # stdeb.cfg (apt), archivebox.rb (brew), Dockerfile, etc.
+#     # if there is no apt python3-package equivalent, then vendor it instead in
+#     # ./archivebox/vendor/
+#     "requests>=2.24.0",
+#     "mypy-extensions>=0.4.3",
+#     "django>=3.1.3,<3.2",
+#     "django-extensions>=3.0.3",
+#     "dateparser>=1.0.0",
+#     "youtube-dl>=2021.04.17",
+#     "yt-dlp>=2021.4.11",
+#     "python-crontab>=2.5.1",
+#     "croniter>=0.3.34",
+#     "w3lib>=1.22.0",
+#     "ipython>5.0.0",
+# ]
+# EXTRAS_REQUIRE = {
+#     'sonic': [
+#         "sonic-client>=0.0.5",
+#     ],
+#     'ldap': [
+#         "django-auth-ldap>=4.1.0",
+#     ],
+#     'dev': [
+#         "setuptools",
+#         "twine",
+#         "wheel",
+#         "flake8",
+#         "ipdb",
+#         "mypy",
+#         "django-stubs",
+#         "sphinx",
+#         "sphinx-rtd-theme",
+#         "recommonmark",
+#         "pytest",
+#         "bottle",
+#         "stdeb",
+#         "django-debug-toolbar",
+#         "djdt_flamegraph",
+#     ],
+# }
+#
+# setuptools.setup(
+#     name=PKG_NAME,
+#     version=VERSION,
+#     license=LICENSE,
+#     author=AUTHOR,
+#     author_email=AUTHOR_EMAIL,
+#     description=DESCRIPTION,
+#     long_description=README,
+#     long_description_content_type="text/markdown",
+#     url=REPO_URL,
+#     project_urls=PROJECT_URLS,
+#     python_requires=PYTHON_REQUIRES,
+#     setup_requires=SETUP_REQUIRES,
+#     install_requires=INSTALL_REQUIRES,
+#     extras_require=EXTRAS_REQUIRE,
+#     packages=[PKG_NAME],
+#     include_package_data=True,   # see MANIFEST.in
+#     entry_points={
+#         "console_scripts": [
+#             f"{PKG_NAME} = {PKG_NAME}.cli:main",
+#         ],
+#     },
+#     classifiers=[
+#         "License :: OSI Approved :: MIT License",
+#         "Natural Language :: English",
+#         "Operating System :: OS Independent",
+#         "Development Status :: 4 - Beta",
+#         "Topic :: Utilities",
+#         "Topic :: System :: Archiving",
+#         "Topic :: System :: Archiving :: Backup",
+#         "Topic :: System :: Recovery Tools",
+#         "Topic :: Sociology :: History",
+#         "Topic :: Internet :: WWW/HTTP",
+#         "Topic :: Internet :: WWW/HTTP :: Indexing/Search",
+#         "Topic :: Internet :: WWW/HTTP :: WSGI :: Application",
+#         "Topic :: Software Development :: Libraries :: Python Modules",
+#         "Intended Audience :: Developers",
+#         "Intended Audience :: Education",
+#         "Intended Audience :: End Users/Desktop",
+#         "Intended Audience :: Information Technology",
+#         "Intended Audience :: Legal Industry",
+#         "Intended Audience :: System Administrators",
+#         "Environment :: Console",
+#         "Environment :: Web Environment",
+#         "Programming Language :: Python :: 3",
+#         "Programming Language :: Python :: 3.7",
+#         "Programming Language :: Python :: 3.8",
+#         "Programming Language :: Python :: 3.9",
+#         "Framework :: Django",
+#         "Typing :: Typed",
+#     ],
+#     cmdclass={
+#         "test": DisabledTestCommand,
+#     },
+# )
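Note that the old setup.py derived VERSION from package.json at build time, while pyproject.toml now pins version = "0.7.0" statically, so the two files presumably have to be bumped together. The old single-sourcing pattern, as a standalone sketch (layout taken from the old setup.py above):

import json
from pathlib import Path

PACKAGE_DIR = Path(__file__).parent.resolve() / 'archivebox'   # repo layout assumed from old setup.py
VERSION = json.loads((PACKAGE_DIR / 'package.json').read_text().strip())['version']
print(VERSION)   # '0.7.0' at this commit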

stdeb.cfg

@@ -6,6 +6,6 @@ Suite: focal
 Suite3: focal
 Build-Depends: debhelper, dh-python, python3-all, python3-pip, python3-setuptools, python3-wheel, python3-stdeb
 Depends3: nodejs, wget, curl, git, ffmpeg, youtube-dl, yt-dlp, python3-all, python3-pip, python3-setuptools, python3-croniter, python3-crontab, python3-dateparser, python3-django, python3-django-extensions, python3-django-jsonfield, python3-mypy-extensions, python3-requests, python3-w3lib, ripgrep
-X-Python3-Version: >= 3.7
-XS-Python-Version: >= 3.7
+X-Python3-Version: >= 3.9
+XS-Python-Version: >= 3.9
 Setup-Env-Vars: DEB_BUILD_OPTIONS=nocheck

tests/test_extractors.py

@@ -13,12 +13,51 @@ def test_ignore_methods():
     Takes the passed method out of the default methods list and returns that value
     """
     ignored = ignore_methods(['title'])
-    assert should_save_title not in ignored
+    assert "title" not in ignored
+
+
+def test_save_allowdenylist_works(tmp_path, process, disable_extractors_dict):
+    allow_list = {
+        r'/static': ["headers", "singlefile"],
+        r'example\.com\.html$': ["headers"],
+    }
+    deny_list = {
+        "/static": ["singlefile"],
+    }
+    disable_extractors_dict.update({
+        "SAVE_HEADERS": "true",
+        "USE_SINGLEFILE": "true",
+        "SAVE_ALLOWLIST": pyjson.dumps(allow_list),
+        "SAVE_DENYLIST": pyjson.dumps(deny_list),
+    })
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
+                                 capture_output=True, env=disable_extractors_dict)
+    archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
+    singlefile_file = archived_item_path / "singlefile.html"
+    assert not singlefile_file.exists()
+    headers_file = archived_item_path / "headers.json"
+    assert headers_file.exists()
+
+
+def test_save_denylist_works(tmp_path, process, disable_extractors_dict):
+    deny_list = {
+        "/static": ["singlefile"],
+    }
+    disable_extractors_dict.update({
+        "SAVE_HEADERS": "true",
+        "USE_SINGLEFILE": "true",
+        "SAVE_DENYLIST": pyjson.dumps(deny_list),
+    })
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
+                                 capture_output=True, env=disable_extractors_dict)
+    archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
+    singlefile_file = archived_item_path / "singlefile.html"
+    assert not singlefile_file.exists()
+    headers_file = archived_item_path / "headers.json"
+    assert headers_file.exists()
+
+
 def test_singlefile_works(tmp_path, process, disable_extractors_dict):
     disable_extractors_dict.update({"USE_SINGLEFILE": "true"})
     add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
                                  capture_output=True, env=disable_extractors_dict)
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
     output_file = archived_item_path / "singlefile.html"
     assert output_file.exists()
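These tests pin down the expected precedence: with both lists set, a deny rule overrides an allow rule for the same URL (the allowlist permits singlefile for /static, the denylist forbids it, and singlefile.html ends up absent). A hedged sketch of the matching logic they imply, where the function and variable names are hypothetical rather than ArchiveBox's internals:

import re

allow_list = {r'/static': ['headers', 'singlefile'], r'example\.com\.html$': ['headers']}
deny_list = {r'/static': ['singlefile']}

def extractor_enabled(extractor, url, allowlist, denylist):
    # deny rules win: any matching deny pattern disables the extractor outright
    for pattern, names in denylist.items():
        if re.search(pattern, url) and extractor in names:
            return False
    if allowlist:
        # with an allowlist present, an extractor must be explicitly allowed for this URL
        return any(re.search(p, url) and extractor in names
                   for p, names in allowlist.items())
    return True

url = 'http://127.0.0.1:8080/static/example.com.html'
assert extractor_enabled('headers', url, allow_list, deny_list)          # headers.json is saved
assert not extractor_enabled('singlefile', url, allow_list, deny_list)   # singlefile.html is not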