1
0
Fork 0

Merge remote-tracking branch 'origin/HEAD' into changes

This commit is contained in:
Alex Kotov 2023-09-17 00:19:05 +04:00
commit 36ab7f112b
Signed by: kotovalexarian
GPG Key ID: 553C0EBBEB5D5F08
11 changed files with 2334 additions and 1152 deletions

View File

@ -87,12 +87,12 @@ ADD "./setup.py" "$CODE_DIR/"
ADD "./package.json" "$CODE_DIR/archivebox/"
RUN apt-get update -qq \
&& apt-get install -qq -y --no-install-recommends \
build-essential python-dev python3-dev \
build-essential python-dev python3-dev libldap2-dev libsasl2-dev \
&& echo 'empty placeholder for setup.py to use' > "$CODE_DIR/archivebox/README.md" \
&& python3 -c 'from distutils.core import run_setup; result = run_setup("./setup.py", stop_after="init"); print("\n".join(result.install_requires + result.extras_require["sonic"]))' > /tmp/requirements.txt \
&& pip install -r /tmp/requirements.txt \
&& pip install --upgrade youtube-dl yt-dlp \
&& apt-get purge -y build-essential python-dev python3-dev \
&& apt-get purge -y build-essential python-dev python3-dev libldap2-dev libsasl2-dev \
&& apt-get autoremove -y \
&& rm -rf /var/lib/apt/lists/*

View File

@ -57,9 +57,17 @@ SYSTEM_USER = getpass.getuser() or os.getlogin()
# Best-effort refinement of SYSTEM_USER: every failure below is swallowed on
# purpose so that the value assigned before this block is kept.
try:
    import pwd
    # Use the account name of the effective UID; fall back to the existing
    # SYSTEM_USER value when that name is empty.
    SYSTEM_USER = pwd.getpwuid(os.geteuid()).pw_name or SYSTEM_USER
except KeyError:
    # Process' UID might not map to a user in cases such as running the Docker image
    # (where `archivebox` is 999) as a different UID.
    pass
except ModuleNotFoundError:
    # pwd is only needed for some linux systems, doesn't exist on windows
    pass
except Exception:
    # Not expected to happen; uncomment the `raise` below to surface the
    # underlying error while debugging.
    # raise
    pass
############################### Config Schema ##################################
@ -109,6 +117,17 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'PREVIEW_MEDIA': {'type': bool, 'default': True},
'PREVIEW_GIT': {'type': bool, 'default': True},
'LOGOUT_REDIRECT_URL': {'type': str, 'default': '/'},
'LDAP': {'type': bool, 'default': False},
'LDAP_SERVER_URI': {'type': str, 'default': None},
'LDAP_BIND_DN': {'type': str, 'default': None},
'LDAP_BIND_PASSWORD': {'type': str, 'default': None},
'LDAP_USER_BASE': {'type': str, 'default': None},
'LDAP_USER_FILTER': {'type': str, 'default': None},
'LDAP_USERNAME_ATTR': {'type': str, 'default': None},
'LDAP_FIRSTNAME_ATTR': {'type': str, 'default': None},
'LDAP_LASTNAME_ATTR': {'type': str, 'default': None},
'LDAP_EMAIL_ATTR': {'type': str, 'default': None},
},
'ARCHIVE_METHOD_TOGGLES': {
@ -152,10 +171,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'--write-thumbnail',
'--no-call-home',
'--write-sub',
'--all-subs',
# There are too many of these and youtube
# throttles you with HTTP error 429
#'--write-auto-subs',
'--write-auto-subs',
'--convert-subs=srt',
'--yes-playlist',
'--continue',
@ -168,7 +184,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'--ignore-errors',
'--geo-bypass',
'--add-metadata',
'--max-filesize={}'.format(c['MEDIA_MAX_SIZE']),
'--format=(bv*+ba/b)[filesize<={}][filesize_approx<=?{}]/(bv*+ba/b)'.format(c['MEDIA_MAX_SIZE'], c['MEDIA_MAX_SIZE']),
]},
@ -229,6 +245,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'POCKET_CONSUMER_KEY': {'type': str, 'default': None},
'POCKET_ACCESS_TOKENS': {'type': dict, 'default': {}},
'READWISE_READER_TOKENS': {'type': dict, 'default': {}},
},
}

View File

@ -6,6 +6,9 @@ import re
import logging
import tempfile
import ldap
from django_auth_ldap.config import LDAPSearch
from pathlib import Path
from django.utils.crypto import get_random_string
@ -20,6 +23,17 @@ from ..config import (
OUTPUT_DIR,
LOGS_DIR,
TIMEZONE,
LDAP,
LDAP_SERVER_URI,
LDAP_BIND_DN,
LDAP_BIND_PASSWORD,
LDAP_USER_BASE,
LDAP_USER_FILTER,
LDAP_USERNAME_ATTR,
LDAP_FIRSTNAME_ATTR,
LDAP_LASTNAME_ATTR,
LDAP_EMAIL_ATTR,
)
IS_MIGRATING = 'makemigrations' in sys.argv[:3] or 'migrate' in sys.argv[:3]
@ -55,6 +69,12 @@ INSTALLED_APPS = [
]
# For usage with https://www.jetadmin.io/integrations/django
# INSTALLED_APPS += ['jet_django']
# JET_PROJECT = 'archivebox'
# JET_TOKEN = 'some-api-token-here'
MIDDLEWARE = [
'core.middleware.TimezoneMiddleware',
'django.middleware.security.SecurityMiddleware',
@ -67,11 +87,48 @@ MIDDLEWARE = [
'core.middleware.CacheControlMiddleware',
]
################################################################################
### Authentication Settings
################################################################################
AUTHENTICATION_BACKENDS = [
'django.contrib.auth.backends.RemoteUserBackend',
'django.contrib.auth.backends.ModelBackend',
]
if LDAP:
    # Map ArchiveBox's LDAP_* config values onto the AUTH_LDAP_* settings.
    # These assignments already run at module scope, so no `global`
    # declarations are needed to make them module-level settings.
    AUTH_LDAP_SERVER_URI = LDAP_SERVER_URI
    AUTH_LDAP_BIND_DN = LDAP_BIND_DN
    AUTH_LDAP_BIND_PASSWORD = LDAP_BIND_PASSWORD

    # Search the whole subtree under LDAP_USER_BASE for an entry whose
    # username attribute equals the login name AND that passes the
    # configured extra filter.
    AUTH_LDAP_USER_SEARCH = LDAPSearch(
        LDAP_USER_BASE,
        ldap.SCOPE_SUBTREE,
        '(&(' + LDAP_USERNAME_ATTR + '=%(user)s)' + LDAP_USER_FILTER + ')',
    )

    # User field name -> LDAP attribute name to read it from.
    AUTH_LDAP_USER_ATTR_MAP = {
        'username': LDAP_USERNAME_ATTR,
        'first_name': LDAP_FIRSTNAME_ATTR,
        'last_name': LDAP_LASTNAME_ATTR,
        'email': LDAP_EMAIL_ATTR,
    }

    # NOTE: this replaces (does not extend) the default backends above, so
    # only LDAP authentication is active when LDAP is enabled.
    AUTHENTICATION_BACKENDS = [
        'django_auth_ldap.backend.LDAPBackend',
    ]
################################################################################
### Debug Settings
################################################################################
# only enable the debug toolbar when in DEBUG mode with --nothreading (it doesn't work in multithreaded mode)
DEBUG_TOOLBAR = DEBUG and ('--nothreading' in sys.argv) and ('--reload' not in sys.argv)
if DEBUG_TOOLBAR:
@ -267,8 +324,8 @@ class NoisyRequestsFilter(logging.Filter):
if LOGS_DIR.exists():
ERROR_LOG = (LOGS_DIR / 'errors.log')
else:
# meh too many edge cases here around creating log dir w/ correct permissions
# cant be bothered, just trash the log and let them figure it out via stdout/stderr
# historically too many edge cases here around creating log dir w/ correct permissions early on
# if there's an issue on startup, we trash the log and let user figure it out via stdout/stderr
ERROR_LOG = tempfile.NamedTemporaryFile().name
LOGGING = {

View File

@ -33,6 +33,9 @@ urlpatterns = [
path('admin/', admin.site.urls),
path('health/', HealthCheckView.as_view(), name='healthcheck'),
path('error/', lambda _: 1/0),
# path('jet_api/', include('jet_django.urls')), Enable to use https://www.jetadmin.io/integrations/django
path('index.html', RedirectView.as_view(url='/')),
path('index.json', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'index.json'}),

View File

@ -441,7 +441,7 @@ def log_archive_method_finished(result: "ArchiveResult"):
hints = (
' {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset'])
for line in hints[:5] if line.strip()
for line in list(hints)[:5] if line.strip()
)

View File

@ -34,6 +34,7 @@ from ..index.schema import Link
from ..logging_util import TimedProgress, log_source_saved
from . import pocket_api
from . import readwise_reader_api
from . import wallabag_atom
from . import pocket_html
from . import pinboard_rss
@ -51,6 +52,7 @@ from . import url_list
PARSERS = {
# Specialized parsers
pocket_api.KEY: (pocket_api.NAME, pocket_api.PARSER),
readwise_reader_api.KEY: (readwise_reader_api.NAME, readwise_reader_api.PARSER),
wallabag_atom.KEY: (wallabag_atom.NAME, wallabag_atom.PARSER),
pocket_html.KEY: (pocket_html.NAME, pocket_html.PARSER),
pinboard_rss.KEY: (pinboard_rss.NAME, pinboard_rss.PARSER),

View File

@ -17,7 +17,10 @@ def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
"""Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""
json_file.seek(0)
links = json.load(json_file)
# sometimes the first line is a comment or filepath, so we get everything after the first {
json_file_json_str = '{' + json_file.read().split('{', 1)[-1]
links = json.loads(json_file_json_str)
json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')
for link in links:

View File

@ -0,0 +1,123 @@
__package__ = "archivebox.parsers"
import re
import requests
from datetime import datetime
from typing import IO, Iterable, Optional
from configparser import ConfigParser
from pathlib import Path
from ..index.schema import Link
from ..util import enforce_types
from ..system import atomic_write
from ..config import (
SOURCES_DIR,
READWISE_READER_TOKENS,
)
# File under SOURCES_DIR where the per-username pagination cursor is stored
# (read and written with ConfigParser by read_cursor / write_cursor below).
API_DB_PATH = Path(SOURCES_DIR) / "readwise_reader_api.db"
class ReadwiseReaderAPI:
    """Minimal client for the Readwise Reader document-list endpoint.

    Attributes:
        api_token: Access token sent in the ``Authorization`` header.
        cursor: Value sent as ``pageCursor``; ``None`` until a cursor is set.
    """

    cursor: Optional[str]

    def __init__(self, api_token, cursor=None) -> None:
        self.api_token = api_token
        self.cursor = cursor

    def _auth_headers(self) -> dict:
        """Return the HTTP headers that authenticate requests with this client's token."""
        return {"Authorization": "Token {}".format(self.api_token)}

    def get_archive(self):
        """Fetch one page of documents from the "archive" location.

        The page requested is the one identified by ``self.cursor``.

        Returns:
            The ``requests`` response object for the page.

        Raises:
            Whatever ``response.raise_for_status()`` raises when the API
            answers with an error status.
        """
        response = requests.get(
            url="https://readwise.io/api/v3/list/",
            # Always authenticate with the token this instance was created
            # with; credentials must never be hard-coded in source.
            headers=self._auth_headers(),
            params={
                "location": "archive",
                "pageCursor": self.cursor,
            },
        )
        response.raise_for_status()
        return response
def get_readwise_reader_articles(api: "ReadwiseReaderAPI"):
    """Yield every article from the archive, following pagination cursors.

    Pages are fetched lazily with ``api.get_archive()``. After a page has been
    fully yielded, ``api.cursor`` is advanced to that page's
    ``nextPageCursor``; iteration stops at the first page whose
    ``nextPageCursor`` is empty, leaving ``api.cursor`` at the last non-empty
    cursor seen.

    Args:
        api: Client object exposing ``get_archive()`` and a writable
            ``cursor`` attribute.

    Yields:
        Each item of every page's ``results`` list, in order.
    """
    # Iterate instead of recursing: one nested generator per page would hit
    # Python's recursion limit on archives with many pages.
    while True:
        body = api.get_archive().json()
        yield from body["results"]

        next_cursor = body["nextPageCursor"]
        if not next_cursor:
            return
        api.cursor = next_cursor
def link_from_article(article: dict, sources: list):
    """Convert one Readwise Reader article record into a ``Link``.

    Args:
        article: Mapping that provides ``source_url``, ``title`` and an
            ISO-format ``updated_at`` value.
        sources: Passed through unchanged as the link's ``sources``.

    Returns:
        A ``Link`` whose timestamp is the stringified epoch time of
        ``updated_at`` and whose title falls back to the URL when the
        article title is empty.
    """
    source_url: str = article['source_url']
    display_title = article["title"] or source_url
    updated_at = datetime.fromisoformat(article['updated_at'])
    epoch_seconds = updated_at.timestamp()

    return Link(
        url=source_url,
        timestamp=str(epoch_seconds),
        title=display_title,
        tags="",
        sources=sources,
    )
def write_cursor(username: str, since: str):
    """Store *since* as the saved cursor for *username* in ``API_DB_PATH``.

    The file is created empty first if it does not exist. The section named
    after *username* is replaced with a single ``since`` option; the other
    sections read from the file are written back alongside it.
    """
    db_is_missing = not API_DB_PATH.exists()
    if db_is_missing:
        atomic_write(API_DB_PATH, "")

    store = ConfigParser()
    store.optionxform = str  # keep option names exactly as written (no lower-casing)
    store.read(API_DB_PATH)
    store[username] = {"since": since}

    with open(API_DB_PATH, "w+") as db_file:
        store.write(db_file)
def read_cursor(username: str) -> Optional[str]:
    """Return the saved cursor for *username*, or ``None`` if none is stored.

    As a side effect, an empty ``API_DB_PATH`` file is created when it does
    not exist yet.
    """
    db_is_missing = not API_DB_PATH.exists()
    if db_is_missing:
        atomic_write(API_DB_PATH, "")

    store = ConfigParser()
    store.optionxform = str  # keep option names exactly as written (no lower-casing)
    store.read(API_DB_PATH)

    saved_cursor = store.get(username, "since", fallback=None)
    return saved_cursor
@enforce_types
def should_parse_as_readwise_reader_api(text: str) -> bool:
    """Tell whether *text* begins with the ``readwise-reader://`` scheme."""
    scheme_prefix = "readwise-reader://"
    return text.startswith(scheme_prefix)
@enforce_types
def parse_readwise_reader_api_export(input_buffer: IO[str], **_kwargs) -> Iterable[Link]:
    """Parse bookmarks from the Readwise Reader API

    Every line of *input_buffer* that starts with ``readwise-reader://`` and
    is followed by a username triggers a fetch of that user's archived
    articles, starting from the cursor saved for that username. Once all
    articles for a line have been yielded, the client's latest cursor is
    saved again (if it is set).

    Args:
        input_buffer: Text stream that is rewound and read line by line.
        **_kwargs: Accepted and ignored.

    Yields:
        One ``Link`` per fetched article, with the triggering line as its
        only source.

    Raises:
        KeyError: if ``READWISE_READER_TOKENS`` has no entry for a username.
    """

    input_buffer.seek(0)
    pattern = re.compile(r"^readwise-reader:\/\/(\w+)")
    for line in input_buffer:
        if not should_parse_as_readwise_reader_api(line):
            continue

        match = pattern.search(line)
        if match is None:
            # The scheme prefix is present but no username follows it; skip
            # the line instead of failing with AttributeError on `.group`.
            continue

        username = match.group(1)
        api = ReadwiseReaderAPI(READWISE_READER_TOKENS[username], cursor=read_cursor(username))

        for article in get_readwise_reader_articles(api):
            yield link_from_article(article, sources=[line])

        if api.cursor:
            write_cursor(username, api.cursor)
# Module-level identifiers for this parser: a key string, a display name,
# and the parse function itself.
KEY = "readwise_reader_api"
NAME = "Readwise Reader API"
PARSER = parse_readwise_reader_api_export

3254
package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@ -7,7 +7,8 @@
"license": "MIT",
"dependencies": {
"@postlight/mercury-parser": "git+https://github.com/postlight/mercury-parser.git",
"playwright": "^1.37.1",
"readability-extractor": "git+https://github.com/ArchiveBox/readability-extractor.git",
"single-file": "git+https://github.com/gildas-lormeau/SingleFile.git"
"single-file-cli": "^1.0.63"
}
}

View File

@ -47,6 +47,7 @@ INSTALL_REQUIRES = [
"croniter>=0.3.34",
"w3lib>=1.22.0",
"ipython>5.0.0",
"django-auth-ldap>=4.1.0"
]
EXTRAS_REQUIRE = {
'sonic': [