From 57df65f28fe72677d21e29ce6c9e9d7bcd0ba918 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 21 Apr 2022 07:09:17 -0700 Subject: [PATCH] use yt-dlp for media archiving instead of youtube-dl --- Dockerfile | 5 +++-- archivebox/config.py | 3 ++- archivebox/extractors/media.py | 1 + 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 2ee6cbbb..4a63fd34 100644 --- a/Dockerfile +++ b/Dockerfile @@ -81,7 +81,8 @@ RUN apt-get update -qq \ build-essential python-dev python3-dev \ && echo 'empty placeholder for setup.py to use' > "$CODE_DIR/archivebox/README.md" \ && python3 -c 'from distutils.core import run_setup; result = run_setup("./setup.py", stop_after="init"); print("\n".join(result.install_requires + result.extras_require["sonic"]))' > /tmp/requirements.txt \ - && pip install --quiet -r /tmp/requirements.txt \ + && pip install -r /tmp/requirements.txt \ + && pip install --upgrade youtube-dl yt-dlp \ && apt-get purge -y build-essential python-dev python3-dev \ && apt-get autoremove -y \ && rm -rf /var/lib/apt/lists/* @@ -104,7 +105,7 @@ RUN pip install -e . WORKDIR "$DATA_DIR" ENV IN_DOCKER=True \ CHROME_SANDBOX=False \ - CHROME_BINARY="chromium" \ + CHROME_BINARY="/usr/bin/chromium-browser" \ USE_SINGLEFILE=True \ SINGLEFILE_BINARY="$NODE_DIR/node_modules/.bin/single-file" \ USE_READABILITY=True \ diff --git a/archivebox/config.py b/archivebox/config.py index b4bcb7fc..8abe7f00 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -142,6 +142,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { '--yes-playlist', '--continue', '--ignore-errors', + '--no-abort-on-error', '--geo-bypass', '--add-metadata', '--max-filesize={}'.format(c['MEDIA_MAX_SIZE']), @@ -929,7 +930,7 @@ def get_chrome_info(config: ConfigDict) -> ConfigValue: 'TIMEOUT': config['TIMEOUT'], 'RESOLUTION': config['RESOLUTION'], 'CHECK_SSL_VALIDITY': config['CHECK_SSL_VALIDITY'], - 'CHROME_BINARY': config['CHROME_BINARY'], + 'CHROME_BINARY': bin_path(config['CHROME_BINARY']), 'CHROME_HEADLESS': config['CHROME_HEADLESS'], 'CHROME_SANDBOX': config['CHROME_SANDBOX'], 'CHROME_USER_AGENT': config['CHROME_USER_AGENT'], diff --git a/archivebox/extractors/media.py b/archivebox/extractors/media.py index e41a4002..1b093e8a 100644 --- a/archivebox/extractors/media.py +++ b/archivebox/extractors/media.py @@ -43,6 +43,7 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME YOUTUBEDL_BINARY, *YOUTUBEDL_ARGS, *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']), + # TODO: add --cookies-from-browser={CHROME_USER_DATA_DIR} link.url, ] status = 'succeeded'