#!/usr/bin/env python3
import codecs
import fnmatch
import hashlib
import os
from dataclasses import dataclass
from datetime import date
from pathlib import Path
from time import perf_counter
import httpx
import lassie
import lxml.etree  # the etree submodule must be imported explicitly
import markdown
from jinja2 import Environment as Env
from jinja2 import FileSystemLoader
from minicli import cli, run, wrap
from readability.readability import Document

HERE = Path(".")
YEAR = "2024"
CACHE_PATH = HERE / "cache" / YEAR
LOCAL_DOMAIN = "http://larlet.test:3579"
environment = Env(loader=FileSystemLoader(str(HERE / "templates")))


def parse_markdown(file_path):
"""Extract title, (HTML) content and metadata from a markdown file."""
parser = markdown.Markdown(extensions=["meta"])
    with codecs.open(file_path, "r", encoding="utf-8") as source:
        content = parser.convert(source.read())
    metadata = parser.Meta if hasattr(parser, "Meta") else None
    title = metadata["title"][0] if metadata and "title" in metadata else ""
return title, content, metadata
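
# A sketch of the front matter `parse_markdown` expects from the `meta`
# extension (keys taken from `Cache.all` below, values illustrative):
#
#     title: Example article title
#     url: https://example.com/article
#     hash_url: <md5 hex digest of the url, see `new` below>
#     archive_date: 2024-01-01
#     og_image: https://example.com/cover.png
#     description: Optional summary.
#
#     The archived content follows as HTML, separated by a blank line.
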
def each_markdown_from(source_dir, file_name="index.md"):
"""Walk across the `source_dir` and return the md file paths."""
for root, dirnames, filenames in os.walk(source_dir):
for filename in fnmatch.filter(filenames, file_name):
yield os.path.join(root, filename)
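
# Equivalent pathlib sketch: `Path(source_dir).rglob(file_name)` yields the
# same paths; `os.walk` plus `fnmatch` is kept to match the module's style.
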
@dataclass
class Cache:
title: str
content: str
url: str
hash_url: str
archive_date: str
og_image: str
description: str
@staticmethod
def all(source_dir=CACHE_PATH):
for file_path in each_markdown_from(source_dir):
title, content, metadata = parse_markdown(file_path)
url = metadata["url"][0]
hash_url = metadata["hash_url"][0]
archive_date = metadata["archive_date"][0]
og_image = metadata.get("og_image", [""])[0]
description = metadata.get("description", [""])[0]
yield Cache(
title, content, url, hash_url, archive_date, og_image, description
)
@staticmethod
def one(hash_url):
return next(Cache.all(source_dir=CACHE_PATH / hash_url))
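
# Usage sketch (the digest below is a made-up example, not a real entry):
#
#     cache = Cache.one("0123456789abcdef0123456789abcdef")
#     print(cache.title, cache.url)
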
def extract_page(url):
    """From a URL, extract title and content using Readability.

    The title is shortened through the `short_title` native method.
    The content doesn't contain `<body>` tags so it can be directly
    embedded in the template and rendered as is.
    """
# Retrieves the resource and turns it into a Readability doc.
response = httpx.get(url)
document = Document(response.text)
# The short title is more concise and readable.
title = document.short_title()
content = document.summary(html_partial=True)
    # Removing the added <div> wrapper and surrounding spaces.
content = content[5:-6].strip()
return title, content
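
# For instance, a partial summary of "<div><p>Hello</p></div>" becomes
# "<p>Hello</p>": the slicing drops the 5 chars of `<div>` and the 6 chars
# of `</div>` that Readability wraps around the content.
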
def create(hash_url):
"""Turn new MD file into HTML file."""
template = environment.get_template("cache_article.html")
cache = Cache.one(hash_url)
page = template.render(cache=cache)
    cache_target = CACHE_PATH / hash_url
    cache_target.mkdir(parents=True, exist_ok=True)
    (cache_target / "index.html").write_text(page, encoding="utf-8")
print(f"Done: {LOCAL_DOMAIN}/david/cache/{YEAR}/{hash_url}/")
def save(
title, content, url, hash_url, archive_date, cache_path, og_image, description
):
template = environment.get_template("cache_article.md")
page = template.render(
title=title,
content=content,
url=url,
hash_url=hash_url,
archive_date=archive_date,
og_image=og_image,
description=description,
)
    result_path = os.path.join(cache_path, "index.md")
    with open(result_path, "w", encoding="utf-8") as target:
        target.write(page)
    return result_path


@cli
def generate():
"""Generate caches MD files into HTML files."""
cache_list = []
template = environment.get_template("cache_article.html")
for cache in Cache.all():
page = template.render(cache=cache)
        (CACHE_PATH / cache.hash_url / "index.html").write_text(page, encoding="utf-8")
cache_list.append(cache)
template = environment.get_template("cache_archives.html")
page = template.render(cache_list=cache_list)
    (CACHE_PATH / "index.html").write_text(page, encoding="utf-8")
    print(f"Done: {LOCAL_DOMAIN}/david/cache/{YEAR}/")


def fetch_metadata(title, url, description):
"""Fetch additional metadata."""
data = lassie.fetch(url)
og_image = ""
    for image in data.get("images", []):
if image.get("type") == "og:image":
og_image = image["src"]
break
if data.get("title") != title:
        print("Title mismatch:", data.get("title"), "vs.", title, url)
description = description or data.get("description", "")
return og_image, description
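
# Shape of the lassie payload relied upon above (inferred from the accesses;
# values are illustrative):
#
#     {
#         "title": "Example article title",
#         "description": "Optional summary.",
#         "images": [{"type": "og:image", "src": "https://example.com/cover.png"}],
#     }
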
@cli
def metadata():
"""Fetch additional metadata for existing archives."""
for cache in Cache.all():
if cache.description or cache.url.startswith(
(
"https://www.la-grange.net",
"https://tw5.immateriel.fr",
"https://gilest.org",
"https://vasilis.nl",
"https://www.danmcquillan.org",
"https://public-inbox.org",
)
):
print("Skipping", cache.url)
continue
print("Fetching metadata for", cache.url, cache.title)
og_image, description = fetch_metadata(
cache.title, cache.url, cache.description
)
save(
cache.title,
cache.content,
cache.url,
cache.hash_url,
cache.archive_date,
os.path.join(CACHE_PATH, cache.hash_url),
og_image,
description,
        )


@cli
def new(url):
"""Turn the given URL into a MD and a HTML files.
:url: The URL of the page to put into cache.
"""
hash_url = hashlib.md5(url.encode("utf-8")).hexdigest()
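    # A sketch of what this produces: hashlib.md5(b"https://example.com/").hexdigest()
    # returns a 32-character hex string, used as the cache directory name below.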
try:
title, content = extract_page(url)
    except (
        lxml.etree.XMLSyntaxError,
        httpx.HTTPError,  # includes httpx.ReadTimeout and other transport errors
    ) as e:
print(f"WARNING: {e}")
title, content = "", ""
    cache_path = os.path.join(CACHE_PATH, hash_url)
    os.makedirs(cache_path, exist_ok=True)
archive_date = date.today()
# Caching a markdown file.
og_image, description = fetch_metadata(title, url, "")
result_path = save(
title, content, url, hash_url, archive_date, cache_path, og_image, description
)
# Generating the HTML file.
create(hash_url)
md_line = f"> *[{title}]({url})*"
print(md_line)
os.popen(f'subl "{result_path}"')
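
# Usage sketch, assuming this module is the entry point (the script name
# is illustrative):
#
#     $ python site.py new https://example.com/article
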
@wrap
def perf_wrapper():
start = perf_counter()
yield
elapsed = perf_counter() - start
print(f"Done in {elapsed:.5f} seconds.")
if __name__ == "__main__":
run()