#!/usr/bin/env python3
import codecs
import fnmatch
import hashlib
import os
from dataclasses import dataclass
from datetime import date
from pathlib import Path
from time import perf_counter
from urllib.parse import urlparse

import httpx
import lassie
import lxml.etree
import markdown
from jinja2 import Environment as Env
from jinja2 import FileSystemLoader
from minicli import cli, run, wrap
from readability.readability import Document

HERE = Path(".")
YEAR = "2024"
CACHE_PATH = HERE / "cache" / YEAR
LOCAL_DOMAIN = "http://larlet.test:3579"
environment = Env(loader=FileSystemLoader(str(HERE / "templates")))


def parse_markdown(file_path):
    """Extract title, (HTML) content and metadata from a markdown file."""
    parser = markdown.Markdown(extensions=["meta"])
    with codecs.open(file_path, "r") as source:
        content = parser.convert(source.read())
        metadata = parser.Meta if hasattr(parser, "Meta") else None
        # `Meta` values are lists; take the first item, defaulting to "".
        title = metadata.get("title", [""])[0] if metadata else ""
    return title, content, metadata


def each_markdown_from(source_dir, file_name="index.md"):
    """Walk across the `source_dir` and yield the md file paths."""
    for root, dirnames, filenames in os.walk(source_dir):
        for filename in fnmatch.filter(filenames, file_name):
            yield os.path.join(root, filename)


@dataclass
class Cache:
    title: str
    content: str
    url: str
    hash_url: str
    archive_date: str
    og_image: str
    description: str
    favicon: str
    language: str

    @staticmethod
    def all(source_dir=CACHE_PATH):
        for file_path in each_markdown_from(source_dir):
            title, content, metadata = parse_markdown(file_path)
            url = metadata["url"][0]
            hash_url = metadata["hash_url"][0]
            archive_date = metadata["archive_date"][0]
            og_image = metadata.get("og_image", [""])[0]
            description = metadata.get("description", [""])[0]
            favicon = metadata.get("favicon", [""])[0]
            language = metadata.get("language", [""])[0]
            yield Cache(
                title,
                content,
                url,
                hash_url,
                archive_date,
                og_image,
                description,
                favicon,
                language,
            )

    @staticmethod
    def one(hash_url):
        return next(Cache.all(source_dir=CACHE_PATH / hash_url))


def extract_page(url):
    """From a URL, extract title and content using Readability.

    The title is shortened through the `short_title` native method.
    The content doesn't contain `<body>` tags to be directly
    embeddable in the template and rendered as is.
    """
    # Retrieves the resource and turns it into a Readability doc.
    response = httpx.get(url)
    document = Document(response.text)

    # The short title is more concise and readable.
    title = document.short_title()
    content = document.summary(html_partial=True)

    # Removing the added `<div>` wrapper and surrounding spaces.
    content = content[5:-6].strip()

    return title, content
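
# Why the `[5:-6]` slicing in `extract_page` works: with
# `html_partial=True`, readability-lxml returns the extracted article
# wrapped in a single attribute-free `<div>` element, e.g.
# (illustrative output, not a documented API guarantee):
#
#     Document("<html><body><p>Hi</p></body></html>").summary(html_partial=True)
#     # -> '<div><p>Hi</p></div>'
#
# Dropping 5 characters from the start (`<div>`) and 6 from the end
# (`</div>`) keeps only the inner HTML.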


def create(hash_url):
    """Turn new MD file into HTML file."""
    template = environment.get_template("cache_article.html")
    cache = Cache.one(hash_url)
    page = template.render(cache=cache)
    cache_target = CACHE_PATH / hash_url
    if not os.path.exists(cache_target):
        os.makedirs(cache_target)
    open(cache_target / "index.html", "w").write(page)
    print(f"Done: {LOCAL_DOMAIN}/david/cache/{YEAR}/{hash_url}/")


def save(
    title,
    content,
    url,
    hash_url,
    archive_date,
    cache_path,
    og_image,
    description,
    favicon,
    language,
):
    """Render and write the markdown cache file, returning its path."""
    template = environment.get_template("cache_article.md")
    page = template.render(
        title=title,
        content=content,
        url=url,
        hash_url=hash_url,
        archive_date=archive_date,
        og_image=og_image,
        description=description,
        favicon=favicon,
        language=language,
    )
    result_path = os.path.join(cache_path, "index.md")
    open(result_path, "w").write(page)
    return result_path


@cli
def generate():
    """Generate cached MD files into HTML files."""
    cache_list = []
    template = environment.get_template("cache_article.html")
    for cache in Cache.all():
        page = template.render(cache=cache)
        open(CACHE_PATH / cache.hash_url / "index.html", "w").write(page)
        cache_list.append(cache)
    template = environment.get_template("cache_archives.html")
    page = template.render(cache_list=cache_list)
    open(CACHE_PATH / "index.html", "w").write(page)
    print(f"Done: {LOCAL_DOMAIN}/david/cache/{YEAR}/")


def fetch_metadata(title, url, description, language):
    """Fetch additional metadata."""
    parsed_url = urlparse(url)
    root_url = f"{parsed_url.scheme}://{parsed_url.netloc}/"
    data = lassie.fetch(url, all_images=True)
    og_image = ""
    favicon = ""
    for image in data.get("images", []):
        image_type = image.get("type")
        image_src = image.get("src")
        if image_src == root_url:
            # A src equal to the site root is not a usable image;
            # reset and skip.
            og_image = ""
            continue
        elif image_type == "og:image" and not og_image:
            og_image = image_src
            continue
        elif image_type == "twitter:image" and not og_image:
            og_image = image_src
            continue
        elif image_type == "favicon":
            # Prefer non-.ico favicons when available.
            if not favicon:
                favicon = image_src
            elif ".ico" in favicon and ".ico" not in image_src:
                favicon = image_src

    # Fallback on the first suitable body image.
    if not og_image:
        for image in data.get("images", []):
            image_type = image.get("type")
            if image_type == "body_image":
                image_src = image.get("src")
                if image_src and "favicon" not in image_src:
                    og_image = image_src
                    break

    # Fallback on server's default.
    if not favicon:
        favico_url = f"{root_url}favicon.ico"
        response = httpx.get(favico_url)
        if response.status_code == 200:
            favicon = favico_url

    if data.get("title") != title:
        print(data.get("title"), "vs.", title, url)

    description = description or data.get("description", "")
    language = language or data.get("locale", "")
    return og_image, description, favicon, language
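
# For reference, the lassie payload consumed above has roughly this
# shape (a sketch limited to the keys this script reads; the values
# are made-up examples, see lassie's docs for the full schema):
#
#     {
#         "title": "Page title",
#         "description": "Meta description",
#         "locale": "en_US",
#         "images": [
#             {"type": "og:image", "src": "https://example.com/cover.png"},
#             {"type": "favicon", "src": "https://example.com/favicon.ico"},
#             {"type": "body_image", "src": "https://example.com/photo.jpg"},
#         ],
#     }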


@cli
def metadata():
    """Fetch additional metadata for existing archives."""
    for cache in Cache.all():
        # That one is taking way too long.
        if cache.url.startswith("https://tw5.immateriel.fr"):
            print("Skipping (too long)", cache.url)
            continue
        if cache.og_image and cache.description and cache.favicon and cache.language:
            print("Skipping (all good)", cache.url)
            continue
        if cache.url.startswith(
            (
                "https://www.la-grange.net",
                "https://gilest.org",
                "https://vasilis.nl",
                "https://www.danmcquillan.org",
                "https://public-inbox.org",
            )
        ) and (cache.og_image or cache.description or cache.favicon or cache.language):
            print("Skipping (known missing info)", cache.url)
            continue
        print("Fetching metadata for", cache.url, cache.title)
        og_image, description, favicon, language = fetch_metadata(
            cache.title, cache.url, cache.description, cache.language
        )
        save(
            cache.title,
            cache.content,
            cache.url,
            cache.hash_url,
            cache.archive_date,
            os.path.join(CACHE_PATH, cache.hash_url),
            og_image,
            description,
            favicon,
            language,
        )


@cli
def new(url):
    """Turn the given URL into MD and HTML files.

    :url: The URL of the page to put into cache.
    """
    hash_url = hashlib.md5(url.encode("utf-8")).hexdigest()
    try:
        title, content = extract_page(url)
    except (
        lxml.etree.XMLSyntaxError,
        httpx.HTTPError,
        httpx.ReadTimeout,
    ) as e:
        print(f"WARNING: {e}")
        title, content = "", ""
    cache_path = os.path.join(CACHE_PATH, hash_url)
    if not os.path.exists(cache_path):
        os.makedirs(cache_path)
    archive_date = date.today()
    # Caching a markdown file.
    og_image, description, favicon, language = fetch_metadata(title, url, "", "")
    result_path = save(
        title,
        content,
        url,
        hash_url,
        archive_date,
        cache_path,
        og_image,
        description,
        favicon,
        language,
    )
    # Generating the HTML file.
    create(hash_url)
    md_line = f"> *[{title}]({url})*"
    print(md_line)
    os.popen(f'subl "{result_path}"')


@wrap
def perf_wrapper():
    start = perf_counter()
    yield
    elapsed = perf_counter() - start
    print(f"Done in {elapsed:.5f} seconds.")


if __name__ == "__main__":
    run()
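
# Typical invocations, given minicli's sub-command dispatch (assuming
# the script is saved as `cache.py`; the file name is illustrative):
#
#     python3 cache.py new https://example.com/some/article/
#     python3 cache.py generate
#     python3 cache.py metadata
#
# Each `@cli`-decorated function above becomes a sub-command, and the
# `@wrap` generator times every run.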