#!/usr/bin/env python3

import codecs
import fnmatch
import hashlib
import os
from dataclasses import dataclass
from datetime import date
from pathlib import Path
from time import perf_counter
from urllib.parse import urlparse

import httpx
import lassie
import lxml.etree  # imported explicitly for the XMLSyntaxError handled below
import markdown
from jinja2 import Environment as Env
from jinja2 import FileSystemLoader
from minicli import cli, run, wrap
from readability.readability import Document

HERE = Path(".")
YEAR = "2024"
CACHE_PATH = HERE / "cache" / YEAR
LOCAL_DOMAIN = "http://larlet.test:3579"


environment = Env(loader=FileSystemLoader(str(HERE / "templates")))


def parse_markdown(file_path):
    """Extract title, (HTML) content and metadata from a markdown file."""
    parser = markdown.Markdown(extensions=["meta"])
    with codecs.open(file_path, "r") as source:
        content = parser.convert(source.read())
        metadata = parser.Meta if hasattr(parser, "Meta") else None
        title = metadata["title"][0] if metadata is not None else ""
    return title, content, metadata
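
# Illustrative note: the "meta" extension reads a leading block of
# `key: value` lines in each cached `index.md` and exposes every value as a
# list through `parser.Meta`, hence the `metadata["title"][0]` above.
# A hypothetical front matter for one cache entry could look like:
#
#   title: An example article
#   url: https://example.com/article/
#   hash_url: 0123456789abcdef0123456789abcdef
#   archive_date: 2024-01-01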


def each_markdown_from(source_dir, file_name="index.md"):
    """Walk across the `source_dir` and return the md file paths."""
    for root, dirnames, filenames in os.walk(source_dir):
        for filename in fnmatch.filter(filenames, file_name):
            yield os.path.join(root, filename)


@dataclass
class Cache:
    title: str
    content: str
    url: str
    hash_url: str
    archive_date: str
    og_image: str
    description: str
    favicon: str
    language: str

    @staticmethod
    def all(source_dir=CACHE_PATH):
        for file_path in each_markdown_from(source_dir):
            title, content, metadata = parse_markdown(file_path)
            url = metadata["url"][0]
            hash_url = metadata["hash_url"][0]
            archive_date = metadata["archive_date"][0]
            og_image = metadata.get("og_image", [""])[0]
            description = metadata.get("description", [""])[0]
            favicon = metadata.get("favicon", [""])[0]
            language = metadata.get("language", [""])[0]
            yield Cache(
                title,
                content,
                url,
                hash_url,
                archive_date,
                og_image,
                description,
                favicon,
                language,
            )

    @staticmethod
    def one(hash_url):
        return next(Cache.all(source_dir=CACHE_PATH / hash_url))


def extract_page(url):
    """From a URL, extract the title and content using Readability.

    The title is shortened through the `short_title` native method.
    The content doesn't contain `<body>` tags so it can be directly
    embedded in the template and rendered as is.
    """
    # Retrieve the resource and turn it into a Readability doc.
    response = httpx.get(url)
    document = Document(response.text)

    # The short title is more concise and readable.
    title = document.short_title()
    content = document.summary(html_partial=True)

    # Remove the added <div> and surrounding spaces.
    content = content[5:-6].strip()
    return title, content
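
# Sketch of the slicing above, assuming `summary(html_partial=True)`
# returned "<div><p>Hello</p></div>" for some page:
#
#   content[5:-6]  ->  "<p>Hello</p>"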


def create(hash_url):
    """Turn new MD file into HTML file."""
    template = environment.get_template("cache_article.html")
    cache = Cache.one(hash_url)
    page = template.render(cache=cache)
    cache_target = CACHE_PATH / hash_url
    if not os.path.exists(cache_target):
        os.makedirs(cache_target)
    open(cache_target / "index.html", "w").write(page)
    print(f"Done: {LOCAL_DOMAIN}/david/cache/{YEAR}/{hash_url}/")


def save(
    title,
    content,
    url,
    hash_url,
    archive_date,
    cache_path,
    og_image,
    description,
    favicon,
    language,
):
    template = environment.get_template("cache_article.md")
    page = template.render(
        title=title,
        content=content,
        url=url,
        hash_url=hash_url,
        archive_date=archive_date,
        og_image=og_image,
        description=description,
        favicon=favicon,
        language=language,
    )
    result_path = os.path.join(cache_path, "index.md")
    open(result_path, "w").write(page)
    return result_path


@cli
def generate():
    """Generate HTML files from the cached MD files."""
    cache_list = []
    template = environment.get_template("cache_article.html")
    for cache in Cache.all():
        page = template.render(cache=cache)
        open(CACHE_PATH / cache.hash_url / "index.html", "w").write(page)
        cache_list.append(cache)

    template = environment.get_template("cache_archives.html")
    page = template.render(cache_list=cache_list)
    open(CACHE_PATH / "index.html", "w").write(page)
    print(f"Done: {LOCAL_DOMAIN}/david/cache/{YEAR}/")


def fetch_metadata(title, url, description, language):
    """Fetch additional metadata."""
    parsed_url = urlparse(url)
    root_url = f"{parsed_url.scheme}://{parsed_url.netloc}/"
    data = lassie.fetch(url, all_images=True)
    og_image = ""
    favicon = ""
    for image in data.get("images"):
        image_type = image.get("type")
        image_src = image.get("src")
        if image_src == root_url:
            og_image = ""
            continue
        elif image_type == "og:image" and not og_image:
            og_image = image_src
            continue
        elif image_type == "twitter:image" and not og_image:
            og_image = image_src
            continue
        elif image_type == "favicon":
            if not favicon:
                favicon = image_src
            elif ".ico" in favicon and ".ico" not in image_src:
                favicon = image_src

    if not og_image:
        for image in data.get("images"):
            # Check the current image's type; the previous loop left
            # `image_type` bound to its last item.
            if image.get("type") == "body_image":
                image_src = image.get("src")
                if image_src and "favicon" not in image_src:
                    og_image = image_src
                    break

    # Fallback on server's default.
    if not favicon:
        favico_url = f"{root_url}favicon.ico"
        response = httpx.get(favico_url)
        if response.status_code == 200:
            favicon = favico_url

    if data.get("title") != title:
        print(data.get("title"), "vs.", title, url)
    description = description or data.get("description", "")
    language = language or data.get("locale", "")
    return og_image, description, favicon, language
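
# Shape of the lassie result as used above (values are hypothetical):
#
#   {
#       "title": "An example article",
#       "description": "Example description.",
#       "locale": "en_US",
#       "images": [
#           {"type": "og:image", "src": "https://example.com/cover.jpg"},
#           {"type": "favicon", "src": "https://example.com/favicon.ico"},
#       ],
#   }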


@cli
def metadata():
    """Fetch additional metadata for existing archives."""
    for cache in Cache.all():
        # That one is taking way too long.
        if cache.url.startswith("https://tw5.immateriel.fr"):
            print("Skipping (too long)", cache.url)
            continue
        if cache.og_image and cache.description and cache.favicon and cache.language:
            print("Skipping (all good)", cache.url)
            continue
        if cache.url.startswith(
            (
                "https://www.la-grange.net",
                "https://gilest.org",
                "https://vasilis.nl",
                "https://www.danmcquillan.org",
                "https://public-inbox.org",
            )
        ) and (cache.og_image or cache.description or cache.favicon or cache.language):
            print("Skipping (known missing infos)", cache.url)
            continue
        print("Fetching metadata for", cache.url, cache.title)
        og_image, description, favicon, language = fetch_metadata(
            cache.title, cache.url, cache.description, cache.language
        )
        save(
            cache.title,
            cache.content,
            cache.url,
            cache.hash_url,
            cache.archive_date,
            os.path.join(CACHE_PATH, cache.hash_url),
            og_image,
            description,
            favicon,
            language,
        )


@cli
def new(url):
    """Turn the given URL into MD and HTML files.

    :url: The URL of the page to put into cache.
    """
    hash_url = hashlib.md5(url.encode("utf-8")).hexdigest()
    try:
        title, content = extract_page(url)
    except (
        lxml.etree.XMLSyntaxError,
        httpx.HTTPError,
        httpx.ReadTimeout,
    ) as e:
        print(f"WARNING: {e}")
        title, content = "", ""
    cache_path = os.path.join(CACHE_PATH, hash_url)
    if not os.path.exists(cache_path):
        os.makedirs(cache_path)
    archive_date = date.today()
    # Caching a markdown file.
    og_image, description, favicon, language = fetch_metadata(title, url, "", "")
    result_path = save(
        title,
        content,
        url,
        hash_url,
        archive_date,
        cache_path,
        og_image,
        description,
        favicon,
        language,
    )
    # Generating the HTML file.
    create(hash_url)
    md_line = f"> <cite>*[{title}]({url})*</cite>"
    print(md_line)
    os.popen(f'subl "{result_path}"')


@wrap
def perf_wrapper():
    start = perf_counter()
    yield
    elapsed = perf_counter() - start
    print(f"Done in {elapsed:.5f} seconds.")


if __name__ == "__main__":
    run()
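
# Usage sketch (the command names come from the @cli-decorated functions
# above; the script name and URL are placeholders):
#
#   python3 cache.py new "https://example.com/article/"
#   python3 cache.py metadata
#   python3 cache.py generate
#
# The @wrap generator wraps every invocation and prints the elapsed time.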