#!/usr/bin/env python3
"""Archive web pages as markdown files and render them to HTML.

Each cached page lives in ``CACHE_PATH/<md5 of url>/`` as an ``index.md``
(metadata + extracted content) alongside the ``index.html`` rendered from it.
"""
import codecs
import fnmatch
import hashlib
import os
from dataclasses import dataclass
from datetime import date
from pathlib import Path
from time import perf_counter

import httpx
import lxml.etree
import markdown
from jinja2 import Environment as Env
from jinja2 import FileSystemLoader
from minicli import cli, run, wrap
from readability.readability import Document

HERE = Path(".")
YEAR = "2024"
CACHE_PATH = HERE / "cache" / YEAR
LOCAL_DOMAIN = "http://larlet.test:3579"

environment = Env(loader=FileSystemLoader(str(HERE / "templates")))


def parse_markdown(file_path):
    """Extract title, (HTML) content and metadata from a markdown file.

    Returns a ``(title, content, metadata)`` tuple. ``title`` is the first
    value of the ``title`` metadata key, or ``""`` when there is no metadata
    or no ``title`` key. ``metadata`` is whatever the parser exposes as
    ``Meta`` (``None`` if that attribute is absent).
    """
    parser = markdown.Markdown(extensions=["meta"])
    # Explicit UTF-8 so parsing does not depend on the platform locale.
    with open(file_path, encoding="utf-8") as source:
        content = parser.convert(source.read())
    metadata = getattr(parser, "Meta", None)
    # `.get` keeps a file that has metadata but no title from raising KeyError.
    title = metadata.get("title", [""])[0] if metadata else ""
    return title, content, metadata


def each_markdown_from(source_dir, file_name="index.md"):
    """Walk across the `source_dir` and return the md file paths.

    `file_name` is an `fnmatch` pattern; matching paths are yielded lazily
    in the order `os.walk` visits them.
    """
    for root, _dirnames, filenames in os.walk(source_dir):
        for filename in fnmatch.filter(filenames, file_name):
            yield os.path.join(root, filename)


@dataclass
class Cache:
    """One archived page, as loaded from its markdown file."""

    title: str
    content: str
    url: str
    hash_url: str
    archive_date: str

    @staticmethod
    def all(source_dir=CACHE_PATH):
        """Yield a `Cache` for every markdown file found under `source_dir`.

        Raises `KeyError` if a file lacks the `url`, `hash_url` or
        `archive_date` metadata keys.
        """
        for file_path in each_markdown_from(source_dir):
            title, content, metadata = parse_markdown(file_path)
            url = metadata["url"][0]
            hash_url = metadata["hash_url"][0]
            archive_date = metadata["archive_date"][0]
            yield Cache(title, content, url, hash_url, archive_date)

    @staticmethod
    def one(hash_url):
        """Return the first `Cache` found under `CACHE_PATH / hash_url`.

        Raises `StopIteration` when that directory holds no markdown file.
        """
        return next(Cache.all(source_dir=CACHE_PATH / hash_url))


def extract_page(url):
    """From an URL, extract title and content using Readability.

    The title is shortened through the `short_title` native method.
    The content is requested as a partial HTML fragment and its outermost
    wrapper is stripped, so it can be embedded in the template and rendered
    as is.
    """
    # Retrieves the resource and turns it into a Readability doc.
    response = httpx.get(url)
    document = Document(response.text)

    # The short title is more concise and readable.
    title = document.short_title()
    content = document.summary(html_partial=True)

    # Drop the first 5 and last 6 characters (the width of an enclosing
    # `<div>`...`</div>` pair), then the surrounding whitespace.
    content = content[5:-6].strip()
    return title, content


def create(hash_url):
    """Turn new MD file into HTML file."""
    template = environment.get_template("cache_article.html")
    cache = Cache.one(hash_url)
    page = template.render(cache=cache)
    cache_target = CACHE_PATH / hash_url
    os.makedirs(cache_target, exist_ok=True)
    # write_text opens, writes and closes the file in one call.
    (cache_target / "index.html").write_text(page, encoding="utf-8")
    print(f"Done: {LOCAL_DOMAIN}/david/cache/{YEAR}/{hash_url}/")


@cli
def generate():
    """Generate caches MD files into HTML files."""
    cache_list = []
    template = environment.get_template("cache_article.html")
    for cache in Cache.all():
        page = template.render(cache=cache)
        article_path = CACHE_PATH / cache.hash_url / "index.html"
        article_path.write_text(page, encoding="utf-8")
        cache_list.append(cache)

    # The archive index lists every cache rendered above.
    template = environment.get_template("cache_archives.html")
    page = template.render(cache_list=cache_list)
    (CACHE_PATH / "index.html").write_text(page, encoding="utf-8")
    print(f"Done: {LOCAL_DOMAIN}/david/cache/{YEAR}/")


@cli
def new(url):
    """Turn the given URL into a MD and a HTML files.

    :url: The URL of the page to put into cache.
    """
    hash_url = hashlib.md5(url.encode("utf-8")).hexdigest()

    try:
        title, content = extract_page(url)
    except (
        lxml.etree.XMLSyntaxError,
        httpx.HTTPError,
        httpx.ReadTimeout,
    ) as e:
        # Best effort: still create the cache entry, with empty fields.
        print(f"WARNING: {e}")
        title, content = "", ""

    cache_path = os.path.join(CACHE_PATH, hash_url)
    os.makedirs(cache_path, exist_ok=True)

    archive_date = date.today()

    # Caching a markdown file.
    template = environment.get_template("cache_article.md")
    page = template.render(
        title=title,
        content=content,
        url=url,
        hash_url=hash_url,
        archive_date=archive_date,
    )
    result_path = os.path.join(cache_path, "index.md")
    with open(result_path, "w", encoding="utf-8") as result_file:
        result_file.write(page)

    # Generating the HTML file.
    create(hash_url)
    md_line = f"> *[{title}]({url})*"
    print(md_line)
    # Runs the `subl` command on the new markdown file through a shell.
    os.popen(f'subl "{result_path}"')


@wrap
def perf_wrapper():
    """Print the elapsed time between the code before and after `yield`."""
    start = perf_counter()
    yield
    elapsed = perf_counter() - start
    print(f"Done in {elapsed:.5f} seconds.")


if __name__ == "__main__":
    run()