Merge remote-tracking branch 'origin/HEAD' into changes
This commit is contained in:
commit
36ab7f112b
|
@ -87,12 +87,12 @@ ADD "./setup.py" "$CODE_DIR/"
|
|||
ADD "./package.json" "$CODE_DIR/archivebox/"
|
||||
RUN apt-get update -qq \
|
||||
&& apt-get install -qq -y --no-install-recommends \
|
||||
build-essential python-dev python3-dev \
|
||||
build-essential python-dev python3-dev libldap2-dev libsasl2-dev \
|
||||
&& echo 'empty placeholder for setup.py to use' > "$CODE_DIR/archivebox/README.md" \
|
||||
&& python3 -c 'from distutils.core import run_setup; result = run_setup("./setup.py", stop_after="init"); print("\n".join(result.install_requires + result.extras_require["sonic"]))' > /tmp/requirements.txt \
|
||||
&& pip install -r /tmp/requirements.txt \
|
||||
&& pip install --upgrade youtube-dl yt-dlp \
|
||||
&& apt-get purge -y build-essential python-dev python3-dev \
|
||||
&& apt-get purge -y build-essential python-dev python3-dev libldap2-dev libsasl2-dev \
|
||||
&& apt-get autoremove -y \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
|
|
|
@ -57,9 +57,17 @@ SYSTEM_USER = getpass.getuser() or os.getlogin()
|
|||
try:
|
||||
import pwd
|
||||
SYSTEM_USER = pwd.getpwuid(os.geteuid()).pw_name or SYSTEM_USER
|
||||
except KeyError:
|
||||
# Process' UID might not map to a user in cases such as running the Docker image
|
||||
# (where `archivebox` is 999) as a different UID.
|
||||
pass
|
||||
except ModuleNotFoundError:
|
||||
# pwd is only needed for some linux systems, doesn't exist on windows
|
||||
pass
|
||||
except Exception:
|
||||
# this should never happen, uncomment to debug
|
||||
# raise
|
||||
pass
|
||||
|
||||
############################### Config Schema ##################################
|
||||
|
||||
|
@ -109,6 +117,17 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
|
|||
'PREVIEW_MEDIA': {'type': bool, 'default': True},
|
||||
'PREVIEW_GIT': {'type': bool, 'default': True},
|
||||
'LOGOUT_REDIRECT_URL': {'type': str, 'default': '/'},
|
||||
|
||||
'LDAP': {'type': bool, 'default': False},
|
||||
'LDAP_SERVER_URI': {'type': str, 'default': None},
|
||||
'LDAP_BIND_DN': {'type': str, 'default': None},
|
||||
'LDAP_BIND_PASSWORD': {'type': str, 'default': None},
|
||||
'LDAP_USER_BASE': {'type': str, 'default': None},
|
||||
'LDAP_USER_FILTER': {'type': str, 'default': None},
|
||||
'LDAP_USERNAME_ATTR': {'type': str, 'default': None},
|
||||
'LDAP_FIRSTNAME_ATTR': {'type': str, 'default': None},
|
||||
'LDAP_LASTNAME_ATTR': {'type': str, 'default': None},
|
||||
'LDAP_EMAIL_ATTR': {'type': str, 'default': None},
|
||||
},
|
||||
|
||||
'ARCHIVE_METHOD_TOGGLES': {
|
||||
|
@ -152,10 +171,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
|
|||
'--write-thumbnail',
|
||||
'--no-call-home',
|
||||
'--write-sub',
|
||||
'--all-subs',
|
||||
# There are too many of these and youtube
|
||||
# throttles you with HTTP error 429
|
||||
#'--write-auto-subs',
|
||||
'--write-auto-subs',
|
||||
'--convert-subs=srt',
|
||||
'--yes-playlist',
|
||||
'--continue',
|
||||
|
@ -168,7 +184,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
|
|||
'--ignore-errors',
|
||||
'--geo-bypass',
|
||||
'--add-metadata',
|
||||
'--max-filesize={}'.format(c['MEDIA_MAX_SIZE']),
|
||||
'--format=(bv*+ba/b)[filesize<={}][filesize_approx<=?{}]/(bv*+ba/b)'.format(c['MEDIA_MAX_SIZE'], c['MEDIA_MAX_SIZE']),
|
||||
]},
|
||||
|
||||
|
||||
|
@ -229,6 +245,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
|
|||
|
||||
'POCKET_CONSUMER_KEY': {'type': str, 'default': None},
|
||||
'POCKET_ACCESS_TOKENS': {'type': dict, 'default': {}},
|
||||
|
||||
'READWISE_READER_TOKENS': {'type': dict, 'default': {}},
|
||||
},
|
||||
}
|
||||
|
||||
|
|
|
@ -6,6 +6,9 @@ import re
|
|||
import logging
|
||||
import tempfile
|
||||
|
||||
import ldap
|
||||
from django_auth_ldap.config import LDAPSearch
|
||||
|
||||
from pathlib import Path
|
||||
from django.utils.crypto import get_random_string
|
||||
|
||||
|
@ -20,6 +23,17 @@ from ..config import (
|
|||
OUTPUT_DIR,
|
||||
LOGS_DIR,
|
||||
TIMEZONE,
|
||||
|
||||
LDAP,
|
||||
LDAP_SERVER_URI,
|
||||
LDAP_BIND_DN,
|
||||
LDAP_BIND_PASSWORD,
|
||||
LDAP_USER_BASE,
|
||||
LDAP_USER_FILTER,
|
||||
LDAP_USERNAME_ATTR,
|
||||
LDAP_FIRSTNAME_ATTR,
|
||||
LDAP_LASTNAME_ATTR,
|
||||
LDAP_EMAIL_ATTR,
|
||||
)
|
||||
|
||||
IS_MIGRATING = 'makemigrations' in sys.argv[:3] or 'migrate' in sys.argv[:3]
|
||||
|
@ -55,6 +69,12 @@ INSTALLED_APPS = [
|
|||
]
|
||||
|
||||
|
||||
# For usage with https://www.jetadmin.io/integrations/django
|
||||
# INSTALLED_APPS += ['jet_django']
|
||||
# JET_PROJECT = 'archivebox'
|
||||
# JET_TOKEN = 'some-api-token-here'
|
||||
|
||||
|
||||
MIDDLEWARE = [
|
||||
'core.middleware.TimezoneMiddleware',
|
||||
'django.middleware.security.SecurityMiddleware',
|
||||
|
@ -67,11 +87,48 @@ MIDDLEWARE = [
|
|||
'core.middleware.CacheControlMiddleware',
|
||||
]
|
||||
|
||||
################################################################################
|
||||
### Authentication Settings
|
||||
################################################################################
|
||||
|
||||
AUTHENTICATION_BACKENDS = [
|
||||
'django.contrib.auth.backends.RemoteUserBackend',
|
||||
'django.contrib.auth.backends.ModelBackend',
|
||||
]
|
||||
|
||||
if LDAP:
    # Copy ArchiveBox's LDAP_* config values onto the AUTH_LDAP_* setting names
    # (presumably the names read by the django_auth_ldap backend enabled below
    # -- confirm against the django-auth-ldap docs).
    #
    # These are plain assignments: at module scope every assigned name is
    # already a module global, so no `global` declaration is needed.
    AUTH_LDAP_SERVER_URI = LDAP_SERVER_URI
    AUTH_LDAP_BIND_DN = LDAP_BIND_DN
    AUTH_LDAP_BIND_PASSWORD = LDAP_BIND_PASSWORD

    # Search the whole subtree under LDAP_USER_BASE for an entry whose username
    # attribute equals the login name AND which matches LDAP_USER_FILTER.
    # NOTE(review): the LDAP_* values default to None in the config schema, so
    # this concatenation raises TypeError if LDAP is enabled without them.
    AUTH_LDAP_USER_SEARCH = LDAPSearch(
        LDAP_USER_BASE,
        ldap.SCOPE_SUBTREE,
        '(&(' + LDAP_USERNAME_ATTR + '=%(user)s)' + LDAP_USER_FILTER + ')',
    )

    # Django user field -> LDAP attribute name
    AUTH_LDAP_USER_ATTR_MAP = {
        'username': LDAP_USERNAME_ATTR,
        'first_name': LDAP_FIRSTNAME_ATTR,
        'last_name': LDAP_LASTNAME_ATTR,
        'email': LDAP_EMAIL_ATTR,
    }

    # When LDAP is enabled it replaces (not extends) the default backends.
    AUTHENTICATION_BACKENDS = [
        'django_auth_ldap.backend.LDAPBackend',
    ]
|
||||
|
||||
################################################################################
|
||||
### Debug Settings
|
||||
################################################################################
|
||||
|
||||
# only enable debug toolbar when in DEBUG mode with --nothreading (it doesn't work in multithreaded mode)
|
||||
DEBUG_TOOLBAR = DEBUG and ('--nothreading' in sys.argv) and ('--reload' not in sys.argv)
|
||||
if DEBUG_TOOLBAR:
|
||||
|
@ -267,8 +324,8 @@ class NoisyRequestsFilter(logging.Filter):
|
|||
if LOGS_DIR.exists():
|
||||
ERROR_LOG = (LOGS_DIR / 'errors.log')
|
||||
else:
|
||||
# meh too many edge cases here around creating log dir w/ correct permissions
|
||||
# cant be bothered, just trash the log and let them figure it out via stdout/stderr
|
||||
# historically too many edge cases here around creating log dir w/ correct permissions early on
|
||||
# if there's an issue on startup, we trash the log and let user figure it out via stdout/stderr
|
||||
ERROR_LOG = tempfile.NamedTemporaryFile().name
|
||||
|
||||
LOGGING = {
|
||||
|
|
|
@ -33,6 +33,9 @@ urlpatterns = [
|
|||
path('admin/', admin.site.urls),
|
||||
|
||||
path('health/', HealthCheckView.as_view(), name='healthcheck'),
|
||||
path('error/', lambda _: 1/0),
|
||||
|
||||
# path('jet_api/', include('jet_django.urls')), Enable to use https://www.jetadmin.io/integrations/django
|
||||
|
||||
path('index.html', RedirectView.as_view(url='/')),
|
||||
path('index.json', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'index.json'}),
|
||||
|
|
|
@ -441,7 +441,7 @@ def log_archive_method_finished(result: "ArchiveResult"):
|
|||
|
||||
hints = (
|
||||
' {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset'])
|
||||
for line in hints[:5] if line.strip()
|
||||
for line in list(hints)[:5] if line.strip()
|
||||
)
|
||||
|
||||
|
||||
|
|
|
@ -34,6 +34,7 @@ from ..index.schema import Link
|
|||
from ..logging_util import TimedProgress, log_source_saved
|
||||
|
||||
from . import pocket_api
|
||||
from . import readwise_reader_api
|
||||
from . import wallabag_atom
|
||||
from . import pocket_html
|
||||
from . import pinboard_rss
|
||||
|
@ -51,6 +52,7 @@ from . import url_list
|
|||
PARSERS = {
|
||||
# Specialized parsers
|
||||
pocket_api.KEY: (pocket_api.NAME, pocket_api.PARSER),
|
||||
readwise_reader_api.KEY: (readwise_reader_api.NAME, readwise_reader_api.PARSER),
|
||||
wallabag_atom.KEY: (wallabag_atom.NAME, wallabag_atom.PARSER),
|
||||
pocket_html.KEY: (pocket_html.NAME, pocket_html.PARSER),
|
||||
pinboard_rss.KEY: (pinboard_rss.NAME, pinboard_rss.PARSER),
|
||||
|
|
|
@ -17,7 +17,10 @@ def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
|
|||
"""Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""
|
||||
|
||||
json_file.seek(0)
|
||||
links = json.load(json_file)
|
||||
|
||||
# sometimes the first line is a comment or filepath, so we get everything after the first {
|
||||
json_file_json_str = '{' + json_file.read().split('{', 1)[-1]
|
||||
links = json.loads(json_file_json_str)
|
||||
json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')
|
||||
|
||||
for link in links:
|
||||
|
|
|
@ -0,0 +1,123 @@
|
|||
__package__ = "archivebox.parsers"
|
||||
|
||||
|
||||
import re
|
||||
import requests
|
||||
from datetime import datetime
|
||||
|
||||
from typing import IO, Iterable, Optional
|
||||
from configparser import ConfigParser
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from ..index.schema import Link
|
||||
from ..util import enforce_types
|
||||
from ..system import atomic_write
|
||||
from ..config import (
|
||||
SOURCES_DIR,
|
||||
READWISE_READER_TOKENS,
|
||||
)
|
||||
|
||||
|
||||
API_DB_PATH = Path(SOURCES_DIR) / "readwise_reader_api.db"
|
||||
|
||||
|
||||
class ReadwiseReaderAPI:
    """Minimal client for the Readwise Reader document-list endpoint.

    Holds the API token used to authenticate and the pagination cursor that
    is sent as ``pageCursor`` with the next request.
    """

    cursor: Optional[str]

    def __init__(self, api_token, cursor=None) -> None:
        self.api_token = api_token
        self.cursor = cursor

    def _auth_headers(self) -> dict:
        """Build the Authorization header from this instance's token."""
        return {"Authorization": "Token {}".format(self.api_token)}

    def get_archive(self):
        """Fetch one page of documents from the "archive" location.

        Returns the ``requests`` response object. ``raise_for_status()`` is
        called first, so an error status raises instead of being returned.
        """
        response = requests.get(
            url="https://readwise.io/api/v3/list/",
            # Authenticate with the token passed to the constructor; a
            # credential must never be hard-coded in the source.
            headers=self._auth_headers(),
            params={
                "location": "archive",
                "pageCursor": self.cursor,
            },
        )
        response.raise_for_status()
        return response
|
||||
|
||||
def get_readwise_reader_articles(api: "ReadwiseReaderAPI"):
    """Yield every article from the archive, following pagination.

    Calls ``api.get_archive()`` repeatedly, yielding each entry of the
    response body's ``results`` list. While the body carries a truthy
    ``nextPageCursor`` it is stored on ``api.cursor`` and the next page is
    fetched; ``api.cursor`` is left untouched once the last page is reached.

    Pages are walked with a loop rather than recursion so that an archive
    with many pages cannot exhaust the interpreter's recursion limit.
    """
    while True:
        body = api.get_archive().json()
        yield from body["results"]

        # A missing or empty cursor means there are no further pages.
        next_cursor = body.get("nextPageCursor")
        if not next_cursor:
            return
        api.cursor = next_cursor
|
||||
|
||||
|
||||
def link_from_article(article: dict, sources: list):
    """Convert one Readwise Reader article record into a ``Link``.

    Reads ``source_url``, ``title`` and ``updated_at`` from ``article``. The
    URL stands in for the title when the title is empty, and the ISO-format
    ``updated_at`` value becomes the link's timestamp (epoch seconds, as a
    string). Tags are left empty.
    """
    source_url: str = article['source_url']
    display_title = article["title"] or source_url
    updated_at = datetime.fromisoformat(article['updated_at'])
    epoch_seconds = updated_at.timestamp()

    return Link(
        url=source_url,
        timestamp=str(epoch_seconds),
        title=display_title,
        tags="",
        sources=sources,
    )
|
||||
|
||||
|
||||
def write_cursor(username: str, since: str):
    """Persist the pagination cursor ``since`` for ``username``.

    The cursor file at ``API_DB_PATH`` is INI-formatted with one section per
    username; the file is created empty first if it does not exist yet.
    """
    if not API_DB_PATH.exists():
        atomic_write(API_DB_PATH, "")

    cursor_store = ConfigParser()
    # keep option names case-sensitive instead of lower-casing them
    cursor_store.optionxform = str
    cursor_store.read(API_DB_PATH)

    # replace this user's whole section with the new cursor
    cursor_store[username] = {"since": since}

    with API_DB_PATH.open("w+") as db_file:
        cursor_store.write(db_file)
|
||||
|
||||
|
||||
def read_cursor(username: str) -> Optional[str]:
    """Return the stored pagination cursor for ``username``, or ``None``.

    Creates an empty cursor file at ``API_DB_PATH`` if none exists, then looks
    up the ``since`` option in the section named after the user.
    """
    if not API_DB_PATH.exists():
        atomic_write(API_DB_PATH, "")

    cursor_store = ConfigParser()
    # keep option names case-sensitive instead of lower-casing them
    cursor_store.optionxform = str
    cursor_store.read(API_DB_PATH)

    stored_cursor = cursor_store.get(username, "since", fallback=None)
    return stored_cursor
|
||||
|
||||
|
||||
|
||||
|
||||
@enforce_types
def should_parse_as_readwise_reader_api(text: str) -> bool:
    """Return True when ``text`` begins with the ``readwise-reader://`` scheme."""
    scheme_prefix = "readwise-reader://"
    return text[:len(scheme_prefix)] == scheme_prefix
|
||||
|
||||
|
||||
@enforce_types
def parse_readwise_reader_api_export(input_buffer: IO[str], **_kwargs) -> Iterable[Link]:
    """Parse bookmarks from the Readwise Reader API

    Every line of ``input_buffer`` that looks like
    ``readwise-reader://<username>`` triggers a fetch of that user's archived
    articles, using the token stored under ``<username>`` in
    ``READWISE_READER_TOKENS`` and resuming from the cursor saved for that
    user. One ``Link`` is yielded per article, and the final cursor is written
    back once the user's articles are exhausted.

    Lines without the scheme, or with the scheme but no username, are skipped.
    Raises ``KeyError`` if no token is configured for a username.
    """

    input_buffer.seek(0)
    pattern = re.compile(r"^readwise-reader:\/\/(\w+)")
    for line in input_buffer:
        if not should_parse_as_readwise_reader_api(line):
            continue

        match = pattern.search(line)
        if match is None:
            # Scheme present but nothing usable after it (e.g. a bare
            # "readwise-reader://"): skip instead of crashing on None.
            continue

        username = match.group(1)
        api = ReadwiseReaderAPI(READWISE_READER_TOKENS[username], cursor=read_cursor(username))

        for article in get_readwise_reader_articles(api):
            yield link_from_article(article, sources=[line])

        # Only reached when the caller consumed every article for this user.
        if api.cursor:
            write_cursor(username, api.cursor)
|
||||
|
||||
|
||||
KEY = "readwise_reader_api"
|
||||
NAME = "Readwise Reader API"
|
||||
PARSER = parse_readwise_reader_api_export
|
File diff suppressed because it is too large
Load Diff
|
@ -7,7 +7,8 @@
|
|||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@postlight/mercury-parser": "git+https://github.com/postlight/mercury-parser.git",
|
||||
"playwright": "^1.37.1",
|
||||
"readability-extractor": "git+https://github.com/ArchiveBox/readability-extractor.git",
|
||||
"single-file": "git+https://github.com/gildas-lormeau/SingleFile.git"
|
||||
"single-file-cli": "^1.0.63"
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue