#!/usr/bin/env python3
import codecs
import fnmatch
import hashlib
import os
from dataclasses import dataclass
from datetime import date
from pathlib import Path
from time import perf_counter

import httpx
import lassie
import lxml.etree  # imported explicitly so `lxml.etree.XMLSyntaxError` resolves in `new()` below
import markdown
from jinja2 import Environment as Env
from jinja2 import FileSystemLoader
from minicli import cli, run, wrap
from readability.readability import Document

HERE = Path(".")
YEAR = "2024"
CACHE_PATH = HERE / "cache" / YEAR
LOCAL_DOMAIN = "http://larlet.test:3579"
environment = Env(loader=FileSystemLoader(str(HERE / "templates")))


def parse_markdown(file_path):
    """Extract title, (HTML) content and metadata from a markdown file."""
    parser = markdown.Markdown(extensions=["meta"])
    with codecs.open(file_path, "r") as source:
        content = parser.convert(source.read())
        # The `meta` extension exposes each header as a list of strings.
        metadata = parser.Meta if hasattr(parser, "Meta") else None
        title = metadata.get("title", [""])[0] if metadata else ""
    return title, content, metadata


def each_markdown_from(source_dir, file_name="index.md"):
    """Walk across `source_dir` and yield the markdown file paths."""
    for root, dirnames, filenames in os.walk(source_dir):
        for filename in fnmatch.filter(filenames, file_name):
            yield os.path.join(root, filename)


@dataclass
class Cache:
    title: str
    content: str
    url: str
    hash_url: str
    archive_date: str
    og_image: str
    description: str

    @staticmethod
    def all(source_dir=CACHE_PATH):
        for file_path in each_markdown_from(source_dir):
            title, content, metadata = parse_markdown(file_path)
            url = metadata["url"][0]
            hash_url = metadata["hash_url"][0]
            archive_date = metadata["archive_date"][0]
            og_image = metadata.get("og_image", [""])[0]
            description = metadata.get("description", [""])[0]
            yield Cache(
                title, content, url, hash_url, archive_date, og_image, description
            )

    @staticmethod
    def one(hash_url):
        return next(Cache.all(source_dir=CACHE_PATH / hash_url))
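# For reference, a sketch of the `index.md` layout that `parse_markdown`
# and `Cache.all` expect: `meta` extension headers (one `key: value` per
# line, terminated by a blank line), followed by the archived HTML. All
# values below are hypothetical:
#
#     title: Example article title
#     url: https://example.com/article
#     hash_url: 5eb63bbbe01eeed093cb22bb8f5acdc3
#     archive_date: 2024-01-15
#     og_image: https://example.com/og.png
#     description: An example description.
#
#     <p>Archived content…</p>
#
# The extension stores every value as a list, hence the `[0]` accesses.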
def extract_page(url):
    """From a URL, extract title and content using Readability.

    The title is shortened through the `short_title` native method.
    The content doesn't contain the surrounding `<html>`/`<body>` tags,
    to be directly embeddable in the template and rendered as is.
    """
    # Retrieves the resource and turns it into a Readability doc.
    response = httpx.get(url)
    document = Document(response.text)

    # The short title is more concise and readable.
    title = document.short_title()
    content = document.summary(html_partial=True)

    # Removing the added `<div>`/`</div>` wrapper (5 and 6 characters) and spaces.
    content = content[5:-6].strip()
    return title, content


def create(hash_url):
    """Turn a new MD file into an HTML file."""
    template = environment.get_template("cache_article.html")
    cache = Cache.one(hash_url)
    page = template.render(cache=cache)
    cache_target = CACHE_PATH / hash_url
    if not os.path.exists(cache_target):
        os.makedirs(cache_target)
    open(cache_target / "index.html", "w").write(page)
    print(f"Done: {LOCAL_DOMAIN}/david/cache/{YEAR}/{hash_url}/")


def save(
    title, content, url, hash_url, archive_date, cache_path, og_image, description
):
    template = environment.get_template("cache_article.md")
    page = template.render(
        title=title,
        content=content,
        url=url,
        hash_url=hash_url,
        archive_date=archive_date,
        og_image=og_image,
        description=description,
    )
    result_path = os.path.join(cache_path, "index.md")
    open(result_path, "w").write(page)
    return result_path


@cli
def generate():
    """Generate HTML files from the cached MD files."""
    cache_list = []
    template = environment.get_template("cache_article.html")
    for cache in Cache.all():
        page = template.render(cache=cache)
        open(CACHE_PATH / cache.hash_url / "index.html", "w").write(page)
        cache_list.append(cache)
    template = environment.get_template("cache_archives.html")
    page = template.render(cache_list=cache_list)
    open(CACHE_PATH / "index.html", "w").write(page)
    print(f"Done: {LOCAL_DOMAIN}/david/cache/{YEAR}/")


def fetch_metadata(title, url, description):
    """Fetch additional metadata (og:image, description) with lassie."""
    data = lassie.fetch(url)
    og_image = ""
    # `images` may be absent from the lassie payload, hence the default.
    for image in data.get("images", []):
        if image.get("type") == "og:image":
            og_image = image["src"]
            break
    if data.get("title") != title:
        print(data.get("title"), "vs.", title, url)
    description = description or data.get("description", "")
    return og_image, description


@cli
def metadata():
    """Fetch additional metadata for existing archives."""
    for cache in Cache.all():
        if cache.description or cache.url.startswith(
            (
                "https://www.la-grange.net",
                "https://tw5.immateriel.fr",
                "https://gilest.org",
                "https://vasilis.nl",
                "https://www.danmcquillan.org",
                "https://public-inbox.org",
            )
        ):
            print("Skipping", cache.url)
            continue
        print("Fetching metadata for", cache.url, cache.title)
        og_image, description = fetch_metadata(
            cache.title, cache.url, cache.description
        )
        save(
            cache.title,
            cache.content,
            cache.url,
            cache.hash_url,
            cache.archive_date,
            os.path.join(CACHE_PATH, cache.hash_url),
            og_image,
            description,
        )


@cli
def new(url):
    """Turn the given URL into MD and HTML files.

    :url: The URL of the page to put into cache.
    """
    hash_url = hashlib.md5(url.encode("utf-8")).hexdigest()
    try:
        title, content = extract_page(url)
    except (
        lxml.etree.XMLSyntaxError,
        httpx.HTTPError,
        httpx.ReadTimeout,
    ) as e:
        print(f"WARNING: {e}")
        title, content = "", ""
    cache_path = os.path.join(CACHE_PATH, hash_url)
    if not os.path.exists(cache_path):
        os.makedirs(cache_path)
    archive_date = date.today()
    # Caching a markdown file.
    og_image, description = fetch_metadata(title, url, "")
    result_path = save(
        title, content, url, hash_url, archive_date, cache_path, og_image, description
    )
    # Generating the HTML file.
    create(hash_url)
    md_line = f"> *[{title}]({url})*"
    print(md_line)
    # Open the generated markdown file in Sublime Text for a final check.
    os.popen(f'subl "{result_path}"')


@wrap
def perf_wrapper():
    """Time every command run through minicli."""
    start = perf_counter()
    yield
    elapsed = perf_counter() - start
    print(f"Done in {elapsed:.5f} seconds.")


if __name__ == "__main__":
    run()
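# Usage sketch, assuming this script is saved as `cache.py` (the file name
# is hypothetical): minicli exposes each function decorated with `@cli` as
# a subcommand named after the function, and runs the `@wrap` generator
# around every invocation.
#
#     python3 cache.py new "https://example.com/article"
#     python3 cache.py generate
#     python3 cache.py metadata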