From e41f313fa3522c888b63b353cd5622dfbea7e573 Mon Sep 17 00:00:00 2001 From: Joseph Turian Date: Sun, 11 Sep 2022 12:31:11 +0200 Subject: [PATCH 01/10] Change actions --- .github/workflows/codeql-analysis.yml | 8 +------- .github/workflows/lint.yml | 4 +--- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index 66e331b2..0eb2db0e 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -1,12 +1,6 @@ name: "CodeQL" -on: - push: - branches: [ dev ] - pull_request: - branches: [ dev ] - schedule: - - cron: '43 1 * * 2' +on: [push] jobs: analyze: diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 5a402b25..28375994 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -1,8 +1,6 @@ name: Run linters -on: - workflow_dispatch: - push: +on: [push] env: MAX_LINE_LENGTH: 110 From f5f7aff3b4dec51a001110227946f083897dae72 Mon Sep 17 00:00:00 2001 From: Joseph Turian Date: Mon, 12 Sep 2022 20:34:02 +0000 Subject: [PATCH 02/10] Added yt-dlp everywhere --- .gitignore | 3 +++ Dockerfile | 2 +- README.md | 6 +++--- archivebox/config.py | 3 ++- archivebox/extractors/__init__.py | 1 + archivebox/extractors/media.py | 1 + bin/setup.sh | 8 ++++---- etc/ArchiveBox.conf.default | 2 +- setup.py | 1 + stdeb.cfg | 2 +- 10 files changed, 18 insertions(+), 11 deletions(-) diff --git a/.gitignore b/.gitignore index a80c30ba..f8fefbfb 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,6 @@ data1/ data2/ data3/ output/ + +# vim +*.sw? diff --git a/Dockerfile b/Dockerfile index 7d422628..e147e56e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ # This is the Dockerfile for ArchiveBox, it bundles the following dependencies: -# python3, ArchiveBox, curl, wget, git, chromium, youtube-dl, single-file +# python3, ArchiveBox, curl, wget, git, chromium, youtube-dl, yt-dlp, single-file # Usage: # docker build . 
-t archivebox --no-cache # docker run -v "$PWD/data":/data archivebox init diff --git a/README.md b/README.md index 46427e04..039dd6bb 100644 --- a/README.md +++ b/README.md @@ -87,7 +87,7 @@ ls ./archive/*/index.json # or browse directly via the filesyste - [**Free & open source**](https://github.com/ArchiveBox/ArchiveBox/blob/master/LICENSE), doesn't require signing up online, stores all data locally - [**Powerful, intuitive command line interface**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage) with [modular optional dependencies](#dependencies) - [**Comprehensive documentation**](https://github.com/ArchiveBox/ArchiveBox/wiki), [active development](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap), and [rich community](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community) -- [**Extracts a wide variety of content out-of-the-box**](https://github.com/ArchiveBox/ArchiveBox/issues/51): [media (youtube-dl), articles (readability), code (git), etc.](#output-formats) +- [**Extracts a wide variety of content out-of-the-box**](https://github.com/ArchiveBox/ArchiveBox/issues/51): [media (youtube-dl or yt-dlp), articles (readability), code (git), etc.](#output-formats) - [**Supports scheduled/realtime importing**](https://github.com/ArchiveBox/ArchiveBox/wiki/Scheduled-Archiving) from [many types of sources](#input-formats) - [**Uses standard, durable, long-term formats**](#saves-lots-of-useful-stuff-for-each-imported-link) like HTML, JSON, PDF, PNG, and WARC - [**Usable as a oneshot CLI**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage), [**self-hosted web UI**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#UI-Usage), [Python API](https://docs.archivebox.io/en/latest/modules.html) (BETA), [REST API](https://github.com/ArchiveBox/ArchiveBox/issues/496) (ALPHA), or [desktop app](https://github.com/ArchiveBox/electron-archivebox) (ALPHA) @@ -469,7 +469,7 @@ Inside each Snapshot folder, ArchiveBox save these 
different types of extractor - **DOM Dump:** `output.html` DOM Dump of the HTML after rendering using headless chrome - **Article Text:** `article.html/json` Article text extraction using Readability & Mercury - **Archive.org Permalink:** `archive.org.txt` A link to the saved site on archive.org -- **Audio & Video:** `media/` all audio/video files + playlists, including subtitles & metadata with youtube-dl +- **Audio & Video:** `media/` all audio/video files + playlists, including subtitles & metadata with youtube-dl (or yt-dlp) - **Source Code:** `git/` clone of any repository found on GitHub, Bitbucket, or GitLab links - _More coming soon! See the [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap)..._ @@ -529,7 +529,7 @@ To achieve high fidelity archives in as many situations as possible, ArchiveBox - `node` & `npm` (for readability, mercury, and singlefile) - `wget` (for plain HTML, static files, and WARC saving) - `curl` (for fetching headers, favicon, and posting to Archive.org) -- `youtube-dl` (for audio, video, and subtitles) +- `youtube-dl` or `yt-dlp` (for audio, video, and subtitles) - `git` (for cloning git repos) - and more as we grow... 
diff --git a/archivebox/config.py b/archivebox/config.py index 9744cd16..4d839805 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -203,7 +203,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { 'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')}, 'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')}, 'MERCURY_BINARY': {'type': str, 'default': lambda c: bin_path('mercury-parser')}, - 'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'}, + #'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'}, + 'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'}, 'NODE_BINARY': {'type': str, 'default': 'node'}, 'RIPGREP_BINARY': {'type': str, 'default': 'rg'}, 'CHROME_BINARY': {'type': str, 'default': None}, diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py index ce2ff365..2f5b3b73 100644 --- a/archivebox/extractors/__init__.py +++ b/archivebox/extractors/__init__.py @@ -1,6 +1,7 @@ __package__ = 'archivebox.extractors' import os +import sys from pathlib import Path from typing import Optional, List, Iterable, Union diff --git a/archivebox/extractors/media.py b/archivebox/extractors/media.py index 1b093e8a..17e7a6a6 100644 --- a/archivebox/extractors/media.py +++ b/archivebox/extractors/media.py @@ -72,6 +72,7 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME timer.end() # add video description and subtitles to full-text index + # Let's try a few different index_texts = [ text_file.read_text(encoding='utf-8').strip() for text_file in ( diff --git a/bin/setup.sh b/bin/setup.sh index 5f4b4103..37d7937c 100755 --- a/bin/setup.sh +++ b/bin/setup.sh @@ -91,9 +91,9 @@ echo " This is a helper script which installs the ArchiveBox dependencies on echo " You may be prompted for a sudo password in order to install the following:" echo "" echo " - archivebox" -echo " - python3, pip, nodejs, npm (languages used by ArchiveBox, and 
its extractor modules)" -echo " - curl, wget, git, youtube-dl (used for extracting title, favicon, git, media, and more)" -echo " - chromium (skips this if any Chrome/Chromium version is already installed)" +echo " - python3, pip, nodejs, npm (languages used by ArchiveBox, and its extractor modules)" +echo " - curl, wget, git, youtube-dl, yt-dlp (used for extracting title, favicon, git, media, and more)" +echo " - chromium (skips this if any Chrome/Chromium version is already installed)" echo "" echo " If you'd rather install these manually as-needed, you can find detailed documentation here:" echo " https://github.com/ArchiveBox/ArchiveBox/wiki/Install" @@ -115,7 +115,7 @@ if which apt-get > /dev/null; then fi echo echo "[+] Installing ArchiveBox system dependencies using apt..." - sudo apt-get install -y git python3 python3-pip python3-distutils wget curl youtube-dl ffmpeg git nodejs npm ripgrep + sudo apt-get install -y git python3 python3-pip python3-distutils wget curl youtube-dl yt-dlp ffmpeg git nodejs npm ripgrep sudo apt-get install -y libgtk2.0-0 libgtk-3-0 libnotify-dev libgconf-2-4 libnss3 libxss1 libasound2 libxtst6 xauth xvfb libgbm-dev || sudo apt-get install -y chromium || sudo apt-get install -y chromium-browser || true sudo apt-get install -y archivebox sudo apt-get --only-upgrade install -y archivebox diff --git a/etc/ArchiveBox.conf.default b/etc/ArchiveBox.conf.default index 982a1931..03048a42 100644 --- a/etc/ArchiveBox.conf.default +++ b/etc/ArchiveBox.conf.default @@ -55,7 +55,7 @@ # CURL_BINARY = curl # GIT_BINARY = git # WGET_BINARY = wget -# YOUTUBEDL_BINARY = youtube-dl +# YOUTUBEDL_BINARY = yt-dlp # CHROME_BINARY = chromium # CHROME_USER_DATA_DIR="~/.config/google-chrome/Default" diff --git a/setup.py b/setup.py index a9d8a509..346d3b62 100755 --- a/setup.py +++ b/setup.py @@ -42,6 +42,7 @@ INSTALL_REQUIRES = [ "django-extensions>=3.0.3", "dateparser>=1.0.0", "youtube-dl>=2021.04.17", + "yt-dlp>=2021.4.11", "python-crontab>=2.5.1", 
"croniter>=0.3.34", "w3lib>=1.22.0", diff --git a/stdeb.cfg b/stdeb.cfg index 6664c6c7..571d4245 100644 --- a/stdeb.cfg +++ b/stdeb.cfg @@ -5,7 +5,7 @@ Package3: archivebox Suite: focal Suite3: focal Build-Depends: debhelper, dh-python, python3-all, python3-pip, python3-setuptools, python3-wheel, python3-stdeb -Depends3: nodejs, wget, curl, git, ffmpeg, youtube-dl, python3-all, python3-pip, python3-setuptools, python3-croniter, python3-crontab, python3-dateparser, python3-django, python3-django-extensions, python3-django-jsonfield, python3-mypy-extensions, python3-requests, python3-w3lib, ripgrep +Depends3: nodejs, wget, curl, git, ffmpeg, youtube-dl, yt-dlp, python3-all, python3-pip, python3-setuptools, python3-croniter, python3-crontab, python3-dateparser, python3-django, python3-django-extensions, python3-django-jsonfield, python3-mypy-extensions, python3-requests, python3-w3lib, ripgrep X-Python3-Version: >= 3.7 XS-Python-Version: >= 3.7 Setup-Env-Vars: DEB_BUILD_OPTIONS=nocheck From dba423a56865e14741c83af94d0222a263f3e6aa Mon Sep 17 00:00:00 2001 From: Joseph Turian Date: Mon, 12 Sep 2022 20:36:23 +0000 Subject: [PATCH 03/10] A few more youtube-dl tweaks --- archivebox/config.py | 2 ++ archivebox/extractors/media.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index 4d839805..d5666e87 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -149,6 +149,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { '--yes-playlist', '--continue', '--ignore-errors', + # This flag doesn't exist in youtube-dl + # only in yt-dlp '--no-abort-on-error', '--geo-bypass', '--add-metadata', diff --git a/archivebox/extractors/media.py b/archivebox/extractors/media.py index 17e7a6a6..c6388a1f 100644 --- a/archivebox/extractors/media.py +++ b/archivebox/extractors/media.py @@ -33,7 +33,7 @@ def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optio @enforce_types def save_media(link: 
Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult: - """Download playlists or individual video, audio, and subtitles using youtube-dl""" + """Download playlists or individual video, audio, and subtitles using youtube-dl or yt-dlp""" out_dir = out_dir or Path(link.link_dir) output: ArchiveOutput = 'media' @@ -61,7 +61,7 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME pass else: hints = ( - 'Got youtube-dl response code: {}.'.format(result.returncode), + 'Got youtube-dl (or yt-dlp) response code: {}.'.format(result.returncode), *result.stderr.decode().split('\n'), ) raise ArchiveError('Failed to save media', hints) From b864c38d9e9eb661bde0c0c267784b71cb8bd95e Mon Sep 17 00:00:00 2001 From: Joseph Turian Date: Mon, 12 Sep 2022 20:40:45 +0000 Subject: [PATCH 04/10] Don't be strict on unicode errors --- archivebox/extractors/media.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/archivebox/extractors/media.py b/archivebox/extractors/media.py index c6388a1f..7d73024f 100644 --- a/archivebox/extractors/media.py +++ b/archivebox/extractors/media.py @@ -74,7 +74,16 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME # add video description and subtitles to full-text index # Let's try a few different index_texts = [ - text_file.read_text(encoding='utf-8').strip() + # errors: + # * 'strict' to raise a ValueError exception if there is an + # encoding error. The default value of None has the same effect. + # * 'ignore' ignores errors. Note that ignoring encoding errors + # can lead to data loss. + # * 'xmlcharrefreplace' is only supported when writing to a + # file. Characters not supported by the encoding are replaced with + # the appropriate XML character reference &#nnn;. 
+ # 'replace' (used below) gives U+FFFD; see https://docs.python.org/3/library/functions.html#open + text_file.read_text(encoding='utf-8', errors='replace').strip() for text_file in ( *output_path.glob('*.description'), *output_path.glob('*.srt'), From 983f485cc07e4d241718a063e8a835e583670f4e Mon Sep 17 00:00:00 2001 From: Joseph Turian Date: Mon, 12 Sep 2022 21:29:43 +0000 Subject: [PATCH 05/10] flake8 --- archivebox/extractors/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py index 2f5b3b73..3fe58082 100644 --- a/archivebox/extractors/__init__.py +++ b/archivebox/extractors/__init__.py @@ -128,7 +128,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s else: # print('{black} X {}{reset}'.format(method_name, **ANSI)) stats['skipped'] += 1 - except Exception as e: + except Exception: # Disabled until https://github.com/ArchiveBox/ArchiveBox/issues/984 # and https://github.com/ArchiveBox/ArchiveBox/issues/1014 # are fixed. 
@@ -138,7 +138,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s link.url, )) from e """ - # Instead, use the kludgy workaround from + # Instead, use the kludgy workaround from # https://github.com/ArchiveBox/ArchiveBox/issues/984#issuecomment-1150541627 with open(ERROR_LOG, "a", encoding='utf-8') as f: command = ' '.join(sys.argv) @@ -147,7 +147,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s method_name, link.url, ) + "\n")) - #f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n") + f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n") # print(' ', stats) From daef48e59bb534709aa72627532f46813c9db9df Mon Sep 17 00:00:00 2001 From: Joseph Turian Date: Mon, 12 Sep 2022 21:31:33 +0000 Subject: [PATCH 06/10] flake8 --- archivebox/extractors/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py index 3fe58082..e9d1347b 100644 --- a/archivebox/extractors/__init__.py +++ b/archivebox/extractors/__init__.py @@ -143,11 +143,12 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s with open(ERROR_LOG, "a", encoding='utf-8') as f: command = ' '.join(sys.argv) ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S') - f.write(("\n" + 'Exception in archive_methods.save_{}(Link(url={}))'.format( + f.write(("\n" + 'Exception in archive_methods.save_{}(Link(url={})) command={}'.format( method_name, link.url, + command ) + "\n")) - f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n") + #f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n") # print(' ', stats) From 081a12b0799b1a585c2274e8a033a2c1151d4248 Mon Sep 17 
00:00:00 2001 From: Joseph Turian Date: Mon, 12 Sep 2022 21:32:47 +0000 Subject: [PATCH 07/10] Add ts --- archivebox/extractors/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py index e9d1347b..8623a15b 100644 --- a/archivebox/extractors/__init__.py +++ b/archivebox/extractors/__init__.py @@ -143,10 +143,11 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s with open(ERROR_LOG, "a", encoding='utf-8') as f: command = ' '.join(sys.argv) ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S') - f.write(("\n" + 'Exception in archive_methods.save_{}(Link(url={})) command={}'.format( + f.write(("\n" + 'Exception in archive_methods.save_{}(Link(url={})) command={}; ts={}'.format( method_name, link.url, - command + command, + ts ) + "\n")) #f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n") From 4ce392846cebb7471c8fc6cb919a48f50d2a4e79 Mon Sep 17 00:00:00 2001 From: Joseph Turian Date: Mon, 12 Sep 2022 21:51:15 +0000 Subject: [PATCH 08/10] Fix actions --- .github/workflows/codeql-analysis.yml | 8 +++++++- .github/workflows/lint.yml | 4 +++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index 0eb2db0e..66e331b2 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -1,6 +1,12 @@ name: "CodeQL" -on: [push] +on: + push: + branches: [ dev ] + pull_request: + branches: [ dev ] + schedule: + - cron: '43 1 * * 2' jobs: analyze: diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 28375994..5a402b25 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -1,6 +1,8 @@ name: Run linters -on: [push] +on: + workflow_dispatch: + push: env: MAX_LINE_LENGTH: 110 From 
caa8b782fbceeb7913246d4dbb23272d32a2eee5 Mon Sep 17 00:00:00 2001 From: Joseph Turian Date: Mon, 12 Sep 2022 21:52:01 +0000 Subject: [PATCH 09/10] Remove tab --- bin/setup.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/setup.sh b/bin/setup.sh index 37d7937c..395b43f9 100755 --- a/bin/setup.sh +++ b/bin/setup.sh @@ -91,7 +91,7 @@ echo " This is a helper script which installs the ArchiveBox dependencies on echo " You may be prompted for a sudo password in order to install the following:" echo "" echo " - archivebox" -echo " - python3, pip, nodejs, npm (languages used by ArchiveBox, and its extractor modules)" +echo " - python3, pip, nodejs, npm (languages used by ArchiveBox, and its extractor modules)" echo " - curl, wget, git, youtube-dl, yt-dlp (used for extracting title, favicon, git, media, and more)" echo " - chromium (skips this if any Chrome/Chromium version is already installed)" echo "" From f729bbe122c96fb1fbaf0d0f39cae30aa513b5d0 Mon Sep 17 00:00:00 2001 From: Joseph Turian Date: Wed, 14 Sep 2022 06:27:58 +0200 Subject: [PATCH 10/10] yt-dlp fixes --- archivebox/config.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index 9744cd16..b6f7e7c1 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -144,12 +144,17 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { '--no-call-home', '--write-sub', '--all-subs', - '--write-auto-sub', + # There are too many of these and youtube + # throttles you with HTTP error 429 + #'--write-auto-sub', '--convert-subs=srt', '--yes-playlist', '--continue', - '--ignore-errors', '--no-abort-on-error', + # --ignore-errors must come AFTER + # --no-abort-on-error + # https://github.com/yt-dlp/yt-dlp/issues/4914 + '--ignore-errors', '--geo-bypass', '--add-metadata', '--max-filesize={}'.format(c['MEDIA_MAX_SIZE']),