123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228 |
- #!/usr/bin/env python3
-
- import codecs
- import fnmatch
- import hashlib
- import os
- from dataclasses import dataclass
- from datetime import date
- from pathlib import Path
- from time import perf_counter
-
- import httpx
- import lassie
- import lxml
- import markdown
- from jinja2 import Environment as Env
- from jinja2 import FileSystemLoader
- from minicli import cli, run, wrap
- from readability.readability import Document
-
# All paths are expressed relative to the current working directory.
HERE = Path(".")
# Year used both as the cache sub-directory name and in printed URLs.
YEAR = "2024"
# Directory holding one sub-directory per cached article for YEAR.
CACHE_PATH = HERE.joinpath("cache", YEAR)
# Base URL used when printing where a generated page can be previewed.
LOCAL_DOMAIN = "http://larlet.test:3579"
-
-
# Shared Jinja2 environment: templates are loaded from the `templates`
# directory located under HERE.
environment = Env(loader=FileSystemLoader(str(HERE / "templates")))
-
-
def parse_markdown(file_path):
    """Extract title, (HTML) content and metadata from a markdown file.

    :file_path: Path of the markdown file to convert.

    Returns a `(title, content, metadata)` tuple. `content` is the HTML
    produced by the converter, `metadata` is the `Meta` attribute set on
    the parser by the `meta` extension (`None` if the parser exposes no
    such attribute) and `title` is the first `title` metadata value, or
    an empty string when the file does not declare one.
    """
    parser = markdown.Markdown(extensions=["meta"])
    # No explicit encoding was ever requested, so the builtin `open`
    # behaves exactly like the legacy `codecs.open` call it replaces.
    with open(file_path) as source:
        content = parser.convert(source.read())
    metadata = getattr(parser, "Meta", None)
    # A file without a `title` entry must yield "" instead of a KeyError.
    title = metadata.get("title", [""])[0] if metadata else ""
    return title, content, metadata
-
-
def each_markdown_from(source_dir, file_name="index.md"):
    """Yield the path of every file matching `file_name` under `source_dir`.

    :source_dir: Root directory of the recursive walk.
    :file_name: Name or `fnmatch` pattern of the files to report.
    """
    for current_dir, _subdirs, entries in os.walk(source_dir):
        matching = fnmatch.filter(entries, file_name)
        yield from (os.path.join(current_dir, entry) for entry in matching)
-
-
@dataclass
class Cache:
    """One cached article, as described by its markdown file."""

    title: str
    content: str
    url: str
    hash_url: str
    archive_date: str
    og_image: str
    description: str

    @staticmethod
    def all(source_dir=CACHE_PATH):
        """Yield a `Cache` for each markdown file found under `source_dir`.

        `url`, `hash_url` and `archive_date` must be present in the file
        metadata; `og_image` and `description` default to "".
        """
        for md_path in each_markdown_from(source_dir):
            title, content, meta = parse_markdown(md_path)
            yield Cache(
                title=title,
                content=content,
                url=meta["url"][0],
                hash_url=meta["hash_url"][0],
                archive_date=meta["archive_date"][0],
                og_image=meta.get("og_image", [""])[0],
                description=meta.get("description", [""])[0],
            )

    @staticmethod
    def one(hash_url):
        """Return the first `Cache` found in the `hash_url` sub-directory."""
        entries = Cache.all(source_dir=CACHE_PATH / hash_url)
        return next(entries)
-
-
def extract_page(url):
    """Download `url` and return its `(title, content)` via Readability.

    The title comes from the document's `short_title` method. The
    content is the partial HTML summary with its wrapping `<div>`
    removed, so that it can be embedded in a template as is.
    """
    # Fetch the page and hand its text over to Readability.
    html = httpx.get(url).text
    readable = Document(html)

    # The short title is the more concise, more readable variant.
    short_title = readable.short_title()
    partial_html = readable.summary(html_partial=True)

    # Drop the enclosing <div> ... </div> pair and surrounding spaces.
    opening, closing = len("<div>"), len("</div>")
    return short_title, partial_html[opening:-closing].strip()
-
-
def create(hash_url):
    """Turn new MD file into HTML file.

    :hash_url: Name of the sub-directory of `CACHE_PATH` holding the
        markdown file to render; `index.html` is written next to it.
    """
    template = environment.get_template("cache_article.html")
    cache = Cache.one(hash_url)
    page = template.render(cache=cache)
    cache_target = CACHE_PATH / hash_url
    # `exist_ok` avoids the race between checking and creating.
    os.makedirs(cache_target, exist_ok=True)
    # The context manager guarantees the page is flushed and closed.
    with open(cache_target / "index.html", "w") as html_file:
        html_file.write(page)
    print(f"Done: {LOCAL_DOMAIN}/david/cache/{YEAR}/{hash_url}/")
-
-
def save(
    title, content, url, hash_url, archive_date, cache_path, og_image, description
):
    """Render the markdown template and write it as `index.md`.

    All arguments but `cache_path` are passed to the
    `cache_article.md` template under the same name.

    :cache_path: Existing directory in which `index.md` is written.

    Returns the path of the written file.
    """
    template = environment.get_template("cache_article.md")
    page = template.render(
        title=title,
        content=content,
        url=url,
        hash_url=hash_url,
        archive_date=archive_date,
        og_image=og_image,
        description=description,
    )
    result_path = os.path.join(cache_path, "index.md")
    # The context manager guarantees the file is flushed and closed
    # before the path is handed back to the caller.
    with open(result_path, "w") as md_file:
        md_file.write(page)
    return result_path
-
-
@cli
def generate():
    """Generate caches MD files into HTML files."""
    # One HTML page is rendered per cached article, then a single
    # archive index listing all of them.
    article_template = environment.get_template("cache_article.html")
    cache_list = []
    for cache in Cache.all():
        page = article_template.render(cache=cache)
        # Close each file right away instead of leaking one handle per
        # article until garbage collection.
        with open(CACHE_PATH / cache.hash_url / "index.html", "w") as html_file:
            html_file.write(page)
        cache_list.append(cache)

    archives_template = environment.get_template("cache_archives.html")
    page = archives_template.render(cache_list=cache_list)
    with open(CACHE_PATH / "index.html", "w") as html_file:
        html_file.write(page)
    print(f"Done: {LOCAL_DOMAIN}/david/cache/{YEAR}/")
-
-
def fetch_metadata(title, url, description):
    """Fetch additional metadata.

    :title: Title already known for the page; a mismatch with the
        fetched title is only reported on stdout.
    :url: The URL of the page to inspect.
    :description: Existing description; when empty, the fetched one is
        used instead.

    Returns an `(og_image, description)` tuple, `og_image` being "" when
    no `og:image` entry is found.
    """
    data = lassie.fetch(url)
    # `images` may be missing or empty: never iterate over `None`.
    images = data.get("images") or []
    og_image = next(
        (image["src"] for image in images if image.get("type") == "og:image"),
        "",
    )
    if data.get("title") != title:
        print(data.get("title"), "vs.", title, url)
    description = description or data.get("description", "")
    return og_image, description
-
-
@cli
def metadata():
    """Fetch additional metadata for existing archives."""
    # Archives whose URL starts with one of these prefixes are never
    # fetched again.
    skipped_prefixes = (
        "https://www.la-grange.net",
        "https://tw5.immateriel.fr",
        "https://gilest.org",
        "https://vasilis.nl",
        "https://www.danmcquillan.org",
        "https://public-inbox.org",
    )
    for cache in Cache.all():
        # Archives that already have a description are left untouched.
        if cache.description or cache.url.startswith(skipped_prefixes):
            print("Skipping", cache.url)
            continue

        print("Fetching metadata for", cache.url, cache.title)
        og_image, description = fetch_metadata(
            cache.title, cache.url, cache.description
        )
        # Rewrite the markdown file with the freshly fetched metadata.
        save(
            title=cache.title,
            content=cache.content,
            url=cache.url,
            hash_url=cache.hash_url,
            archive_date=cache.archive_date,
            cache_path=os.path.join(CACHE_PATH, cache.hash_url),
            og_image=og_image,
            description=description,
        )
-
-
@cli
def new(url):
    """Turn the given URL into a MD and a HTML files.

    :url: The URL of the page to put into cache.
    """
    # Imported locally so that this command brings its own dependency.
    import subprocess

    # The cache directory is named after the MD5 hex digest of the URL.
    hash_url = hashlib.md5(url.encode("utf-8")).hexdigest()
    try:
        title, content = extract_page(url)
    except (
        lxml.etree.XMLSyntaxError,
        httpx.HTTPError,
        httpx.ReadTimeout,
    ) as e:
        # Best effort: an archive with empty title/content is still
        # created so that it can be filled in by hand.
        print(f"WARNING: {e}")
        title, content = "", ""
    cache_path = os.path.join(CACHE_PATH, hash_url)
    # `exist_ok` avoids the race between checking and creating.
    os.makedirs(cache_path, exist_ok=True)
    archive_date = date.today()
    # Caching a markdown file.
    og_image, description = fetch_metadata(title, url, "")
    result_path = save(
        title, content, url, hash_url, archive_date, cache_path, og_image, description
    )
    # Generating the HTML file.
    create(hash_url)
    md_line = f"> <cite>*[{title}]({url})*</cite>"
    print(md_line)
    # Open the markdown file in the editor. An argument list avoids
    # building a shell command string and leaves no dangling pipe;
    # a missing editor only produces a warning.
    try:
        subprocess.run(["subl", result_path], check=False)
    except OSError as error:
        print(f"WARNING: {error}")
-
-
@wrap
def perf_wrapper():
    # Measure the time spent between the two halves of the generator,
    # i.e. whatever runs while it is suspended at `yield`.
    started_at = perf_counter()
    yield
    print(f"Done in {perf_counter() - started_at:.5f} seconds.")
-
-
# Call minicli's `run` only when the file is executed as a script.
if __name__ == "__main__":
    run()
|