# davidbgk / larlet-fr-david-cache
							#!/usr/bin/env python3

import codecs
import fnmatch
import hashlib
import os
from dataclasses import dataclass
from datetime import date
from pathlib import Path
from time import perf_counter

import httpx
import lassie
import lxml
import markdown
from jinja2 import Environment as Env
from jinja2 import FileSystemLoader
from minicli import cli, run, wrap
from readability.readability import Document

# Project root; relative, so every path below resolves against the current
# working directory.
HERE = Path(".")
# Year of the archives handled by this script (a sub-directory of `cache/`).
YEAR = "2024"
# Directory holding one sub-directory per archived page for `YEAR`.
CACHE_PATH = HERE / "cache" / YEAR
# Base URL used in this file only to print links to the generated pages.
LOCAL_DOMAIN = "http://larlet.test:3579"


# Jinja2 environment loading its templates from the `templates` directory.
environment = Env(loader=FileSystemLoader(str(HERE / "templates")))


def parse_markdown(file_path):
    """Extract title, (HTML) content and metadata from a markdown file.

    The file is decoded as UTF-8 and converted with the `meta` extension
    enabled.

    :file_path: Path of the markdown file to read.
    :return: A `(title, content, metadata)` tuple. `content` is the converted
        document, `metadata` is the parser's `Meta` attribute (`None` if the
        parser does not expose one) and `title` is the first value of the
        `title` metadata entry, or an empty string when there is none.
    """
    parser = markdown.Markdown(extensions=["meta"])
    # Explicit encoding: the default would depend on the machine's locale.
    with open(file_path, encoding="utf-8") as source:
        content = parser.convert(source.read())
    metadata = getattr(parser, "Meta", None)
    # A file without a `title` entry yields an empty title rather than a
    # KeyError.
    titles = metadata.get("title") if metadata else None
    title = titles[0] if titles else ""
    return title, content, metadata


def each_markdown_from(source_dir, file_name="index.md"):
    """Yield the path of every file under `source_dir` matching `file_name`.

    The tree is walked recursively with `os.walk`; `file_name` is matched
    against each file's base name as an `fnmatch` pattern.
    """
    for folder, _subfolders, entries in os.walk(source_dir):
        yield from (
            os.path.join(folder, entry)
            for entry in entries
            if fnmatch.fnmatch(entry, file_name)
        )


@dataclass
class Cache:
    """One archived page, built from the markdown file that describes it."""

    # Title returned by `parse_markdown` for the file.
    title: str
    # Converted content returned by `parse_markdown` for the file.
    content: str
    # First value of the `url` metadata entry.
    url: str
    # First value of the `hash_url` metadata entry.
    hash_url: str
    # First value of the `archive_date` metadata entry.
    archive_date: str
    # First value of the optional `og_image` metadata entry ("" if absent).
    og_image: str
    # First value of the optional `description` metadata entry ("" if absent).
    description: str

    @staticmethod
    def all(source_dir=CACHE_PATH):
        """Yield a `Cache` for each markdown file found under `source_dir`."""
        for file_path in each_markdown_from(source_dir):
            title, content, meta = parse_markdown(file_path)
            yield Cache(
                title=title,
                content=content,
                # Mandatory entries: a missing key raises KeyError.
                url=meta["url"][0],
                hash_url=meta["hash_url"][0],
                archive_date=meta["archive_date"][0],
                # Optional entries default to an empty string.
                og_image=meta.get("og_image", [""])[0],
                description=meta.get("description", [""])[0],
            )

    @staticmethod
    def one(hash_url):
        """Return the first `Cache` found under `CACHE_PATH / hash_url`."""
        entries = Cache.all(source_dir=CACHE_PATH / hash_url)
        return next(entries)


def extract_page(url):
    """Download `url` and return its `(title, content)` via Readability.

    The title comes from the document's `short_title` method.
    The content is the partial-HTML summary (no `<body>` tags), so it
    can be embedded in a template and rendered as is.
    """
    # Fetch the resource and wrap its text into a Readability document.
    document = Document(httpx.get(url).text)

    # The short title is the more concise, readable variant.
    title = document.short_title()
    partial_html = document.summary(html_partial=True)

    # Drop the first 5 and last 6 characters (the added <div> wrapper),
    # then the surrounding whitespace.
    return title, partial_html[5:-6].strip()


def create(hash_url):
    """Turn new MD file into HTML file.

    Loads the archive stored under `CACHE_PATH / hash_url`, renders it with
    the `cache_article.html` template and writes the result, UTF-8 encoded,
    to `index.html` in that same directory.

    :hash_url: Name of the archive directory inside `CACHE_PATH`.
    """
    template = environment.get_template("cache_article.html")
    cache = Cache.one(hash_url)
    page = template.render(cache=cache)
    cache_target = CACHE_PATH / hash_url
    # `exist_ok` replaces the former exists-then-create check, which could
    # race with a concurrent creation of the directory.
    cache_target.mkdir(parents=True, exist_ok=True)
    # `write_text` closes the file itself; the encoding is explicit so the
    # output does not depend on the machine's locale.
    (cache_target / "index.html").write_text(page, encoding="utf-8")
    print(f"Done: {LOCAL_DOMAIN}/david/cache/{YEAR}/{hash_url}/")


def save(
    title, content, url, hash_url, archive_date, cache_path, og_image, description
):
    """Render the archive's markdown file and write it to `cache_path`.

    All arguments but `cache_path` are handed, under their own name, to the
    `cache_article.md` template. The rendered page is written, UTF-8 encoded,
    to `index.md` inside `cache_path`, replacing any previous file.

    :cache_path: Directory receiving `index.md`; it must already exist.
    :return: The path of the written `index.md` file.
    """
    template = environment.get_template("cache_article.md")
    page = template.render(
        title=title,
        content=content,
        url=url,
        hash_url=hash_url,
        archive_date=archive_date,
        og_image=og_image,
        description=description,
    )
    result_path = os.path.join(cache_path, "index.md")
    # The context manager closes the file deterministically; the encoding is
    # explicit so the output does not depend on the machine's locale.
    with open(result_path, "w", encoding="utf-8") as target:
        target.write(page)
    return result_path


@cli
def generate():
    """Generate caches MD files into HTML files."""
    # Each archive found by `Cache.all()` is rendered with the
    # `cache_article.html` template into its own `index.html`; the archives
    # are then listed in `CACHE_PATH / "index.html"` through the
    # `cache_archives.html` template.
    article_template = environment.get_template("cache_article.html")
    cache_list = []
    for cache in Cache.all():
        page = article_template.render(cache=cache)
        # `write_text` closes the file itself, so no handle is left open per
        # archive; the encoding is explicit rather than locale-dependent.
        target = CACHE_PATH / cache.hash_url / "index.html"
        target.write_text(page, encoding="utf-8")
        cache_list.append(cache)

    archives_template = environment.get_template("cache_archives.html")
    page = archives_template.render(cache_list=cache_list)
    (CACHE_PATH / "index.html").write_text(page, encoding="utf-8")
    print(f"Done: {LOCAL_DOMAIN}/david/cache/{YEAR}/")


def fetch_metadata(title, url, description):
    """Fetch the `og:image` and description of `url` through lassie.

    A mismatch between the fetched title and `title` is only reported on
    stdout. A non-empty `description` is returned untouched; otherwise the
    fetched one (or an empty string) is used.

    Returns an `(og_image, description)` tuple, `og_image` being an empty
    string when no image of type `og:image` is found.
    """
    data = lassie.fetch(url)
    # First image flagged as `og:image`, if any.
    og_image = next(
        (
            image["src"]
            for image in data.get("images")
            if image.get("type") == "og:image"
        ),
        "",
    )
    fetched_title = data.get("title")
    if fetched_title != title:
        print(fetched_title, "vs.", title, url)
    if not description:
        description = data.get("description", "")
    return og_image, description


@cli
def metadata():
    """Fetch additional metadata for existing archives."""
    # Archives whose URL starts with one of these prefixes are never fetched.
    skipped_prefixes = (
        "https://www.la-grange.net",
        "https://tw5.immateriel.fr",
        "https://gilest.org",
        "https://vasilis.nl",
        "https://www.danmcquillan.org",
        "https://public-inbox.org",
    )
    for cache in Cache.all():
        # An archive that already has a description is left untouched too.
        if cache.description or cache.url.startswith(skipped_prefixes):
            print("Skipping", cache.url)
            continue

        print("Fetching metadata for", cache.url, cache.title)
        og_image, description = fetch_metadata(
            cache.title, cache.url, cache.description
        )
        # Rewrite the archive's markdown file with the fetched values.
        save(
            title=cache.title,
            content=cache.content,
            url=cache.url,
            hash_url=cache.hash_url,
            archive_date=cache.archive_date,
            cache_path=os.path.join(CACHE_PATH, cache.hash_url),
            og_image=og_image,
            description=description,
        )


@cli
def new(url):
    """Turn the given URL into a MD and a HTML files.

    :url: The URL of the page to put into cache.
    """
    # The hex MD5 digest of the URL names the archive's directory.
    hash_url = hashlib.md5(url.encode("utf-8")).hexdigest()
    try:
        title, content = extract_page(url)
    except (
        lxml.etree.XMLSyntaxError,
        httpx.HTTPError,
        httpx.ReadTimeout,
    ) as error:
        # A failed extraction is not fatal: the archive is still created,
        # with an empty title and an empty content.
        print(f"WARNING: {error}")
        title = content = ""

    cache_path = os.path.join(CACHE_PATH, hash_url)
    if not os.path.exists(cache_path):
        os.makedirs(cache_path)
    archive_date = date.today()

    # Caching a markdown file.
    og_image, description = fetch_metadata(title, url, "")
    result_path = save(
        title=title,
        content=content,
        url=url,
        hash_url=hash_url,
        archive_date=archive_date,
        cache_path=cache_path,
        og_image=og_image,
        description=description,
    )

    # Generating the HTML file.
    create(hash_url)
    # Ready-to-paste markdown citation of the archived page.
    print(f"> <cite>*[{title}]({url})*</cite>")
    # Hand the markdown file to the `subl` command (presumably the Sublime
    # Text launcher -- confirm) through a shell.
    open_command = f'subl "{result_path}"'
    os.popen(open_command)


@wrap
def perf_wrapper():
    """Print the time elapsed between the two halves of the wrapper.

    The clock starts before the `yield` and is read again once control
    comes back after it.
    """
    started_at = perf_counter()
    yield
    print(f"Done in {perf_counter() - started_at:.5f} seconds.")


# Only start the minicli runner when the file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    run()