|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302 |
- #!/usr/bin/env python3
-
import codecs
import fnmatch
import hashlib
import os
import subprocess
from dataclasses import dataclass
from datetime import date
from pathlib import Path
from time import perf_counter
from urllib.parse import urlparse

import httpx
import lassie
import lxml
import markdown
from jinja2 import Environment as Env
from jinja2 import FileSystemLoader
from minicli import cli, run, wrap
from readability.readability import Document
-
# Base directory; relative, so it resolves against the current working directory.
HERE = Path(".")
# Year used as the cache sub-folder and in the URLs printed by the commands.
YEAR = "2024"
# Folder holding one sub-folder per archived page (named after its `hash_url`).
CACHE_PATH = HERE / "cache" / YEAR
# Origin used to build the URLs printed once a command is done.
LOCAL_DOMAIN = "http://larlet.test:3579"


# Jinja2 environment loading its templates from the `templates` folder.
environment = Env(loader=FileSystemLoader(str(HERE / "templates")))
-
-
def parse_markdown(file_path):
    """Extract title, (HTML) content and metadata from a markdown file.

    :file_path: Path of the markdown file to convert.

    Returns a `(title, content, metadata)` tuple: `content` is the HTML
    produced by the conversion and `metadata` is the `Meta` attribute set
    by the `meta` extension (or `None` if the parser exposes none).
    `title` is the first `title` metadata value, or an empty string when
    the file does not declare one.
    """
    parser = markdown.Markdown(extensions=["meta"])
    # Decode explicitly as UTF-8 rather than relying on the platform's
    # default encoding.
    with open(file_path, encoding="utf-8") as source:
        content = parser.convert(source.read())
    metadata = getattr(parser, "Meta", None)
    # A file without any `title` entry must not raise: fall back to "".
    title = (metadata or {}).get("title", [""])[0]
    return title, content, metadata
-
-
def each_markdown_from(source_dir, file_name="index.md"):
    """Yield the path of every file matching `file_name` below `source_dir`.

    :source_dir: Root folder of the recursive walk.
    :file_name: `fnmatch` pattern the file names are filtered with.

    Each yielded path is the walked folder joined with the matching name.
    """
    for folder, _subfolders, entries in os.walk(source_dir):
        yield from (
            os.path.join(folder, entry)
            for entry in fnmatch.filter(entries, file_name)
        )
-
-
@dataclass
class Cache:
    """One archived page, as read back from a cache markdown file."""

    title: str
    content: str
    url: str
    hash_url: str
    archive_date: str
    og_image: str
    description: str
    favicon: str
    language: str

    @staticmethod
    def all(source_dir=CACHE_PATH):
        """Yield a `Cache` for every markdown file found below `source_dir`.

        `url`, `hash_url` and `archive_date` are read unconditionally from
        the metadata (a missing one raises `KeyError`); `og_image`,
        `description`, `favicon` and `language` default to "".
        """
        required_keys = ("url", "hash_url", "archive_date")
        optional_keys = ("og_image", "description", "favicon", "language")
        for md_path in each_markdown_from(source_dir):
            title, html, meta = parse_markdown(md_path)
            # Every metadata value is a list: only its first item is kept.
            url, hash_url, archive_date = (meta[key][0] for key in required_keys)
            optional = {key: meta.get(key, [""])[0] for key in optional_keys}
            yield Cache(
                title=title,
                content=html,
                url=url,
                hash_url=hash_url,
                archive_date=archive_date,
                **optional,
            )

    @staticmethod
    def one(hash_url):
        """Return the first `Cache` found in the `hash_url` folder.

        Raises `StopIteration` when that folder holds no markdown file.
        """
        caches = Cache.all(source_dir=CACHE_PATH / hash_url)
        return next(caches)
-
-
def extract_page(url):
    """From an URL, extract title and content using Readability.

    The title is shortened through the `short_title` native method.
    The content doesn't contain `<body>` tags to be directly
    embeddable in the template and rendered as is.

    :url: The URL of the page to fetch.

    Returns a `(title, content)` tuple. Errors raised by `httpx` or by
    Readability are not caught here.
    """
    # Retrieves the resource and turns it into a Readability doc.
    # httpx does not follow redirects unless asked to: without it, a moved
    # page would be archived as its (mostly empty) redirect response.
    response = httpx.get(url, follow_redirects=True)
    document = Document(response.text)

    # The short title is more concise and readable.
    title = document.short_title()
    content = document.summary(html_partial=True).strip()

    # Removing the added <div>, only when it is actually there so that
    # real content is never truncated.
    if content.startswith("<div>") and content.endswith("</div>"):
        content = content[5:-6]
    return title, content.strip()
-
-
def create(hash_url):
    """Turn new MD file into HTML file.

    :hash_url: Name of the cache folder (below `CACHE_PATH`) to render.

    Renders the `cache_article.html` template with the matching `Cache`,
    writes the result as `index.html` in that folder and prints its URL.
    """
    template = environment.get_template("cache_article.html")
    cache = Cache.one(hash_url)
    page = template.render(cache=cache)
    cache_target = CACHE_PATH / hash_url
    # No exists-then-create race, and no error if the folder is there.
    cache_target.mkdir(parents=True, exist_ok=True)
    # `write_text` closes the file for us; the encoding is made explicit
    # instead of depending on the platform's default.
    (cache_target / "index.html").write_text(page, encoding="utf-8")
    print(f"Done: {LOCAL_DOMAIN}/david/cache/{YEAR}/{hash_url}/")
-
-
def save(
    title,
    content,
    url,
    hash_url,
    archive_date,
    cache_path,
    og_image,
    description,
    favicon,
    language,
):
    """Render the `cache_article.md` template and write it as `index.md`.

    :cache_path: Existing folder the `index.md` file is written into.

    Every other parameter is passed as is to the template.
    Returns the path of the written file.
    """
    template = environment.get_template("cache_article.md")
    page = template.render(
        title=title,
        content=content,
        url=url,
        hash_url=hash_url,
        archive_date=archive_date,
        og_image=og_image,
        description=description,
        favicon=favicon,
        language=language,
    )
    result_path = os.path.join(cache_path, "index.md")
    # The context manager guarantees the file is flushed and closed; the
    # encoding is explicit instead of depending on the platform's default.
    with open(result_path, "w", encoding="utf-8") as target:
        target.write(page)
    return result_path
-
-
@cli
def generate():
    """Generate caches MD files into HTML files."""
    # One `index.html` per archive, rendered from its markdown source.
    article_template = environment.get_template("cache_article.html")
    cache_list = []
    for cache in Cache.all():
        page = article_template.render(cache=cache)
        # `write_text` closes each file right away instead of leaving one
        # open handle per archive to the garbage collector.
        (CACHE_PATH / cache.hash_url / "index.html").write_text(
            page, encoding="utf-8"
        )
        cache_list.append(cache)

    # Then the page listing all the archives.
    archives_template = environment.get_template("cache_archives.html")
    page = archives_template.render(cache_list=cache_list)
    (CACHE_PATH / "index.html").write_text(page, encoding="utf-8")
    print(f"Done: {LOCAL_DOMAIN}/david/cache/{YEAR}/")
-
-
def _select_images(images, root_url):
    """Pick the `(og_image, favicon)` sources out of a list of image dicts."""
    og_image = ""
    favicon = ""
    for image in images:
        image_type = image.get("type")
        image_src = image.get("src")
        if not image_src:
            # Nothing usable (and `in` checks below would fail on None).
            continue
        if image_src == root_url:
            # An image whose source is the site root resets the selection.
            og_image = ""
        elif image_type in ("og:image", "twitter:image"):
            # First match wins.
            if not og_image:
                og_image = image_src
        elif image_type == "favicon":
            # Keep the first favicon, unless it is an `.ico` and a
            # non-`.ico` alternative shows up.
            if not favicon or (".ico" in favicon and ".ico" not in image_src):
                favicon = image_src

    if not og_image:
        # Fallback on the first image of the body which is not a favicon.
        for image in images:
            image_src = image.get("src")
            # The type of *this* image must be checked, not the one left
            # over from the previous loop.
            if (
                image.get("type") == "body_image"
                and image_src
                and "favicon" not in image_src
            ):
                og_image = image_src
                break
    return og_image, favicon


def fetch_metadata(title, url, description, language):
    """Fetch additional metadata.

    :title: Already known title, only compared with the fetched one.
    :url: The URL of the page to inspect.
    :description: Already known description, kept if not empty.
    :language: Already known language, kept if not empty.

    Returns an `(og_image, description, favicon, language)` tuple.
    Errors raised by `lassie.fetch` are not caught here.
    """
    parsed_url = urlparse(url)
    root_url = f"{parsed_url.scheme}://{parsed_url.netloc}/"
    data = lassie.fetch(url, all_images=True)
    og_image, favicon = _select_images(data.get("images") or [], root_url)

    # Fallback on server's default.
    if not favicon:
        favico_url = f"{root_url}favicon.ico"
        try:
            response = httpx.get(favico_url)
        except httpx.HTTPError as e:
            # Best-effort probe: a network failure only means no favicon.
            print(f"WARNING: {e}")
        else:
            if response.status_code == 200:
                favicon = favico_url

    if data.get("title") != title:
        print(data.get("title"), "vs.", title, url)
    description = description or data.get("description", "")
    language = language or data.get("locale", "")
    return og_image, description, favicon, language
-
-
@cli
def metadata():
    """Fetch additional metadata for existing archives."""
    # Sites for which part of the metadata is known to be unavailable.
    incomplete_sites = (
        "https://www.la-grange.net",
        "https://gilest.org",
        "https://vasilis.nl",
        "https://www.danmcquillan.org",
        "https://public-inbox.org",
    )
    for cache in Cache.all():
        extras = (cache.og_image, cache.description, cache.favicon, cache.language)
        if cache.url.startswith("https://tw5.immateriel.fr"):
            # That one is taking way too long.
            print("Skipping (too long)", cache.url)
        elif all(extras):
            print("Skipping (all good)", cache.url)
        elif cache.url.startswith(incomplete_sites) and any(extras):
            print("Skipping (known missing infos)", cache.url)
        else:
            print("Fetching metadata for", cache.url, cache.title)
            og_image, description, favicon, language = fetch_metadata(
                cache.title, cache.url, cache.description, cache.language
            )
            # Rewrite the markdown file with the completed metadata.
            save(
                title=cache.title,
                content=cache.content,
                url=cache.url,
                hash_url=cache.hash_url,
                archive_date=cache.archive_date,
                cache_path=os.path.join(CACHE_PATH, cache.hash_url),
                og_image=og_image,
                description=description,
                favicon=favicon,
                language=language,
            )
-
-
@cli
def new(url):
    """Turn the given URL into a MD and a HTML files.

    :url: The URL of the page to put into cache.
    """
    # The md5 digest of the URL names the archive folder.
    hash_url = hashlib.md5(url.encode("utf-8")).hexdigest()
    try:
        title, content = extract_page(url)
    except (
        lxml.etree.XMLSyntaxError,
        httpx.HTTPError,
        httpx.ReadTimeout,
    ) as e:
        # The archive is still created, with an empty title and content.
        print(f"WARNING: {e}")
        title, content = "", ""
    cache_path = os.path.join(CACHE_PATH, hash_url)
    # No exists-then-create race, and no error if the folder is there.
    os.makedirs(cache_path, exist_ok=True)
    archive_date = date.today()
    # Caching a markdown file.
    og_image, description, favicon, language = fetch_metadata(title, url, "", "")
    result_path = save(
        title,
        content,
        url,
        hash_url,
        archive_date,
        cache_path,
        og_image,
        description,
        favicon,
        language,
    )
    # Generating the HTML file.
    create(hash_url)
    md_line = f"> <cite>*[{title}]({url})*</cite>"
    print(md_line)
    # Open the markdown file in the editor. An argument list avoids going
    # through a shell, and a missing `subl` must not fail the command.
    try:
        subprocess.run(["subl", result_path], check=False)
    except OSError as e:
        print(f"WARNING: {e}")
-
-
@wrap
def perf_wrapper():
    """Print the time elapsed between entering and resuming at the `yield`."""
    # NOTE(review): presumably minicli runs the command at the `yield` -
    # confirm against the `wrap` documentation.
    began = perf_counter()
    yield
    print(f"Done in {perf_counter() - began:.5f} seconds.")
-
-
# Only hand control to minicli's `run` when executed as a script, not on import.
if __name__ == "__main__":
    run()
|