#!/usr/bin/env python3
import codecs
import fnmatch
import hashlib
import os
from dataclasses import dataclass
from datetime import date
from pathlib import Path
from time import perf_counter

import httpx
import lxml
import markdown
from jinja2 import Environment as Env
from jinja2 import FileSystemLoader
from minicli import cli, run, wrap
from readability.readability import Document

HERE = Path(".")
YEAR = "2024"
CACHE_PATH = HERE / "cache" / YEAR
LOCAL_DOMAIN = "http://larlet.test:3579"

environment = Env(loader=FileSystemLoader(str(HERE / "templates")))


def parse_markdown(file_path):
    """Extract title, (HTML) content and metadata from a markdown file.

    Returns a `(title, content, metadata)` tuple where `content` is the
    converted HTML and `metadata` is whatever the `meta` extension
    exposed on the parser (`None` if the attribute is absent). `title`
    is the first `title` metadata value, or an empty string when the
    file declares none.
    """
    parser = markdown.Markdown(extensions=["meta"])
    # Explicit UTF-8 so that decoding does not depend on the platform
    # locale.
    with open(file_path, encoding="utf-8") as source:
        content = parser.convert(source.read())
    metadata = getattr(parser, "Meta", None)
    # A file without a `title` key (or without any metadata at all) must
    # yield an empty title rather than a KeyError.
    titles = metadata.get("title") if metadata else None
    title = titles[0] if titles else ""
    return title, content, metadata


def each_markdown_from(source_dir, file_name="index.md"):
    """Walk across the `source_dir` and yield the md file paths.

    `file_name` is an `fnmatch` pattern matched against each file name
    found while walking; every match is yielded joined to its directory.
    """
    for root, _dirnames, filenames in os.walk(source_dir):
        for filename in fnmatch.filter(filenames, file_name):
            yield os.path.join(root, filename)


@dataclass
class Cache:
    """A cached page, built from a markdown file and its metadata."""

    title: str
    content: str
    url: str
    hash_url: str
    archive_date: str

    @staticmethod
    def all(source_dir=CACHE_PATH):
        """Yield a `Cache` for each markdown file found under `source_dir`.

        Each file must declare `url`, `hash_url` and `archive_date`
        metadata; the first value of each key is used.
        """
        for file_path in each_markdown_from(source_dir):
            title, content, metadata = parse_markdown(file_path)
            url = metadata["url"][0]
            hash_url = metadata["hash_url"][0]
            archive_date = metadata["archive_date"][0]
            yield Cache(title, content, url, hash_url, archive_date)

    @staticmethod
    def one(hash_url):
        """Return the first `Cache` found under `CACHE_PATH / hash_url`.

        Raises `StopIteration` when that directory holds no matching
        markdown file.
        """
        return next(Cache.all(source_dir=CACHE_PATH / hash_url))


def extract_page(url):
    """From an URL, extract title and content using Readability.

    The title is shortened through the `short_title` native method.
    The content doesn't contain `<body>` tags to be directly
    embeddable in the template and rendered as is.
    """
    # Retrieves the resource and turns it into a Readability doc.
    response = httpx.get(url)
    document = Document(response.text)

    # The short title is more concise and readable.
    title = document.short_title()
    content = document.summary(html_partial=True)

    # Removing the added