#!/usr/bin/env python3
# davidbgk / larlet-fr-david-cache

import codecs
import fnmatch
import hashlib
import os
from dataclasses import dataclass
from datetime import date
from pathlib import Path
from time import perf_counter

import httpx
import lxml
import markdown
from jinja2 import Environment as Env
from jinja2 import FileSystemLoader
from minicli import cli, run, wrap
from readability.readability import Document

# Directory all relative paths are resolved against.
HERE = Path(".")
# Year of the archive being handled; also the name of its cache folder.
YEAR = "2024"
# Folder holding one sub-folder per cached page for the current year.
CACHE_PATH = HERE.joinpath("cache", YEAR)
# Base URL printed after each run to reach the generated pages.
LOCAL_DOMAIN = "http://larlet.test:3579"


# Shared Jinja2 environment: templates are looked up in `templates/`.
environment = Env(
    loader=FileSystemLoader(str(HERE.joinpath("templates"))),
)


def parse_markdown(file_path):
    """Extract title, (HTML) content and metadata from a markdown file.

    :file_path: Path of the markdown file to read (decoded as UTF-8).

    Return a `(title, content, metadata)` tuple: `content` is the HTML
    produced by the converter, `metadata` is whatever the `meta`
    extension stored on the parser (`None` if it stored nothing) and
    `title` is the first `title` metadata value, or an empty string when
    the file declares none.
    """
    parser = markdown.Markdown(extensions=["meta"])
    # Explicit encoding so the result does not depend on the locale.
    with open(file_path, encoding="utf-8") as source:
        content = parser.convert(source.read())
    metadata = getattr(parser, "Meta", None)
    # A file without a `title` entry gets an empty title instead of
    # raising a KeyError.
    titles = metadata.get("title") if metadata else None
    title = titles[0] if titles else ""
    return title, content, metadata


def each_markdown_from(source_dir, file_name="index.md"):
    """Yield the path of each file matching `file_name` under `source_dir`.

    The whole tree below `source_dir` is visited; `file_name` is an
    `fnmatch` pattern, so shell-style wildcards are accepted.
    """
    for current_dir, _subdirs, entries in os.walk(source_dir):
        matching = fnmatch.filter(entries, file_name)
        yield from (os.path.join(current_dir, entry) for entry in matching)


@dataclass
class Cache:
    """A cached page, built from the metadata and body of a markdown file."""

    # First `title` metadata value (empty string when there is none).
    title: str
    # HTML obtained by converting the markdown body.
    content: str
    # First value of the `url` metadata entry.
    url: str
    # First value of the `hash_url` metadata entry; `one()` also uses it
    # as the name of the sub-folder of `CACHE_PATH` to look into.
    hash_url: str
    # First value of the `archive_date` metadata entry.
    archive_date: str

    @staticmethod
    def all(source_dir=CACHE_PATH):
        """Yield a `Cache` for each markdown file found under `source_dir`."""
        for md_path in each_markdown_from(source_dir):
            title, content, meta = parse_markdown(md_path)
            yield Cache(
                title=title,
                content=content,
                url=meta["url"][0],
                hash_url=meta["hash_url"][0],
                archive_date=meta["archive_date"][0],
            )

    @staticmethod
    def one(hash_url):
        """Return the first `Cache` found in the `hash_url` sub-folder."""
        entries = Cache.all(source_dir=CACHE_PATH / hash_url)
        return next(entries)


def extract_page(url):
    """From an URL, extract title and content using Readability.

    The title is shortened through the `short_title` native method.
    The content doesn't contain `<body>` tags to be directly
    embeddable in the template and rendered as is.

    Return a `(title, content)` tuple of strings. Errors raised while
    fetching or parsing the page are not caught here.
    """
    # Retrieves the resource and turns it into a Readability doc.
    # httpx does not follow redirects unless asked to: without it a moved
    # page would be extracted from the (empty) redirect response.
    response = httpx.get(url, follow_redirects=True)
    document = Document(response.text)

    # The short title is more concise and readable.
    title = document.short_title()
    content = document.summary(html_partial=True).strip()

    # Removing the added <div> and spaces, but only when that exact
    # wrapper is present so that real content is never cut off.
    prefix, suffix = "<div>", "</div>"
    if content.startswith(prefix) and content.endswith(suffix):
        content = content[len(prefix):-len(suffix)].strip()
    return title, content


def create(hash_url):
    """Turn new MD file into HTML file.

    The cache entry stored under `hash_url` is rendered through the
    `cache_article.html` template and written as `index.html` in the
    same folder. The local URL of the page is printed once done.
    """
    template = environment.get_template("cache_article.html")
    cache = Cache.one(hash_url)
    page = template.render(cache=cache)
    cache_target = CACHE_PATH / hash_url
    # No exists-then-create race: an existing folder is simply accepted.
    cache_target.mkdir(parents=True, exist_ok=True)
    # `write_text` closes the file and the encoding no longer depends on
    # the locale.
    (cache_target / "index.html").write_text(page, encoding="utf-8")
    print(f"Done: {LOCAL_DOMAIN}/david/cache/{YEAR}/{hash_url}/")


@cli
def generate():
    """Generate caches MD files into HTML files.

    Each cache entry is rendered to the `index.html` of its own folder,
    then the archives page listing all entries is written at the root
    of the cache folder.
    """
    cache_list = []
    template = environment.get_template("cache_article.html")
    for cache in Cache.all():
        page = template.render(cache=cache)
        # `write_text` closes the file and pins the encoding.
        article_path = CACHE_PATH / cache.hash_url / "index.html"
        article_path.write_text(page, encoding="utf-8")
        cache_list.append(cache)

    template = environment.get_template("cache_archives.html")
    page = template.render(cache_list=cache_list)
    (CACHE_PATH / "index.html").write_text(page, encoding="utf-8")
    print(f"Done: {LOCAL_DOMAIN}/david/cache/{YEAR}/")


@cli
def new(url):
    """Turn the given URL into a MD and a HTML files.

    :url: The URL of the page to put into cache.
    """
    import subprocess

    # The hash only serves as a stable folder name for the URL.
    hash_url = hashlib.md5(url.encode("utf-8")).hexdigest()
    try:
        title, content = extract_page(url)
    except (
        lxml.etree.XMLSyntaxError,
        httpx.HTTPError,
        httpx.ReadTimeout,
    ) as e:
        # Best effort: keep going with an empty entry to fill in by hand.
        print(f"WARNING: {e}")
        title, content = "", ""
    cache_path = CACHE_PATH / hash_url
    # No exists-then-create race: an existing folder is simply accepted.
    cache_path.mkdir(parents=True, exist_ok=True)
    archive_date = date.today()
    # Caching a markdown file.
    template = environment.get_template("cache_article.md")
    page = template.render(
        title=title,
        content=content,
        url=url,
        hash_url=hash_url,
        archive_date=archive_date,
    )
    result_path = cache_path / "index.md"
    # `write_text` closes the file and pins the encoding.
    result_path.write_text(page, encoding="utf-8")
    # Generating the HTML file.
    create(hash_url)
    # Open the markdown file in the editor without going through a
    # shell; a missing editor is reported instead of breaking the run.
    try:
        subprocess.run(["subl", str(result_path)], check=False)
    except OSError as e:
        print(f"WARNING: could not open the editor: {e}")


@wrap
def perf_wrapper():
    """Print how long the code running around the `yield` took."""
    started_at = perf_counter()
    # Everything executed while this generator is suspended is timed.
    yield
    elapsed = perf_counter() - started_at
    print(f"Done in {elapsed:.5f} seconds.")


if __name__ == "__main__":
    # Only when executed as a script: hand over to minicli's `run`
    # (presumably dispatching to the `@cli` commands defined above).
    run()