A place to cache linked articles (think custom and personal wayback machine)
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302
  1. #!/usr/bin/env python3
import codecs
import fnmatch
import hashlib
import os
import subprocess
from dataclasses import dataclass
from datetime import date
from pathlib import Path
from time import perf_counter
from urllib.parse import urlparse

import httpx
import lassie
import lxml
import markdown
from jinja2 import Environment as Env
from jinja2 import FileSystemLoader
from minicli import cli, run, wrap
from readability.readability import Document
  19. HERE = Path(".")
  20. YEAR = "2024"
  21. CACHE_PATH = HERE / "cache" / YEAR
  22. LOCAL_DOMAIN = "http://larlet.test:3579"
  23. environment = Env(loader=FileSystemLoader(str(HERE / "templates")))
  24. def parse_markdown(file_path):
  25. """Extract title, (HTML) content and metadata from a markdown file."""
  26. parser = markdown.Markdown(extensions=["meta"])
  27. with codecs.open(file_path, "r") as source:
  28. content = parser.convert(source.read())
  29. metadata = parser.Meta if hasattr(parser, "Meta") else None
  30. title = metadata["title"][0] if metadata is not None else ""
  31. return title, content, metadata
  32. def each_markdown_from(source_dir, file_name="index.md"):
  33. """Walk across the `source_dir` and return the md file paths."""
  34. for root, dirnames, filenames in os.walk(source_dir):
  35. for filename in fnmatch.filter(filenames, file_name):
  36. yield os.path.join(root, filename)
  37. @dataclass
  38. class Cache:
  39. title: str
  40. content: str
  41. url: str
  42. hash_url: str
  43. archive_date: str
  44. og_image: str
  45. description: str
  46. favicon: str
  47. language: str
  48. @staticmethod
  49. def all(source_dir=CACHE_PATH):
  50. for file_path in each_markdown_from(source_dir):
  51. title, content, metadata = parse_markdown(file_path)
  52. url = metadata["url"][0]
  53. hash_url = metadata["hash_url"][0]
  54. archive_date = metadata["archive_date"][0]
  55. og_image = metadata.get("og_image", [""])[0]
  56. description = metadata.get("description", [""])[0]
  57. favicon = metadata.get("favicon", [""])[0]
  58. language = metadata.get("language", [""])[0]
  59. yield Cache(
  60. title,
  61. content,
  62. url,
  63. hash_url,
  64. archive_date,
  65. og_image,
  66. description,
  67. favicon,
  68. language,
  69. )
  70. @staticmethod
  71. def one(hash_url):
  72. return next(Cache.all(source_dir=CACHE_PATH / hash_url))
  73. def extract_page(url):
  74. """From an URL, extract title and content using Readability.
  75. The title is shortened through the `short_title` native method.
  76. The content doesn't contain `<body>` tags to be directly
  77. embeddable in the template and rendered as is.
  78. """
  79. # Retrieves the resource and turns it into a Readability doc.
  80. response = httpx.get(url)
  81. document = Document(response.text)
  82. # The short title is more concise and readable.
  83. title = document.short_title()
  84. content = document.summary(html_partial=True)
  85. # Removing the added <div> and spaces.
  86. content = content[5:-6].strip()
  87. return title, content
  88. def create(hash_url):
  89. """Turn new MD file into HTML file."""
  90. template = environment.get_template("cache_article.html")
  91. cache = Cache.one(hash_url)
  92. page = template.render(cache=cache)
  93. cache_target = CACHE_PATH / hash_url
  94. if not os.path.exists(cache_target):
  95. os.makedirs(cache_target)
  96. open(cache_target / "index.html", "w").write(page)
  97. print(f"Done: {LOCAL_DOMAIN}/david/cache/{YEAR}/{hash_url}/")
  98. def save(
  99. title,
  100. content,
  101. url,
  102. hash_url,
  103. archive_date,
  104. cache_path,
  105. og_image,
  106. description,
  107. favicon,
  108. language,
  109. ):
  110. template = environment.get_template("cache_article.md")
  111. page = template.render(
  112. title=title,
  113. content=content,
  114. url=url,
  115. hash_url=hash_url,
  116. archive_date=archive_date,
  117. og_image=og_image,
  118. description=description,
  119. favicon=favicon,
  120. language=language,
  121. )
  122. result_path = os.path.join(cache_path, "index.md")
  123. open(result_path, "w").write(page)
  124. return result_path
  125. @cli
  126. def generate():
  127. """Generate caches MD files into HTML files."""
  128. cache_list = []
  129. template = environment.get_template("cache_article.html")
  130. for cache in Cache.all():
  131. page = template.render(cache=cache)
  132. open(CACHE_PATH / cache.hash_url / "index.html", "w").write(page)
  133. cache_list.append(cache)
  134. template = environment.get_template("cache_archives.html")
  135. page = template.render(cache_list=cache_list)
  136. open(CACHE_PATH / "index.html", "w").write(page)
  137. print(f"Done: {LOCAL_DOMAIN}/david/cache/{YEAR}/")
  138. def fetch_metadata(title, url, description, language):
  139. """Fetch additional metadata."""
  140. parsed_url = urlparse(url)
  141. root_url = f"{parsed_url.scheme}://{parsed_url.netloc}/"
  142. data = lassie.fetch(url, all_images=True)
  143. og_image = ""
  144. favicon = ""
  145. for image in data.get("images"):
  146. image_type = image.get("type")
  147. image_src = image.get("src")
  148. if image_src == root_url:
  149. og_image = ""
  150. continue
  151. elif image_type == "og:image" and not og_image:
  152. og_image = image_src
  153. continue
  154. elif image_type == "twitter:image" and not og_image:
  155. og_image = image_src
  156. continue
  157. elif image_type == "favicon":
  158. if not favicon:
  159. favicon = image_src
  160. elif ".ico" in favicon and ".ico" not in image_src:
  161. favicon = image_src
  162. if not og_image:
  163. for image in data.get("images"):
  164. if image_type == "body_image":
  165. image_src = image.get("src")
  166. if image_src and "favicon" not in image_src:
  167. og_image = image_src
  168. break
  169. # Fallback on server's default.
  170. if not favicon:
  171. favico_url = f"{root_url}favicon.ico"
  172. response = httpx.get(favico_url)
  173. if response.status_code == 200:
  174. favicon = favico_url
  175. if data.get("title") != title:
  176. print(data.get("title"), "vs.", title, url)
  177. description = description or data.get("description", "")
  178. language = language or data.get("locale", "")
  179. return og_image, description, favicon, language
  180. @cli
  181. def metadata():
  182. """Fetch additional metadata for existing archives."""
  183. for cache in Cache.all():
  184. # That one is taking way too long.
  185. if cache.url.startswith("https://tw5.immateriel.fr"):
  186. print("Skipping (too long)", cache.url)
  187. continue
  188. if cache.og_image and cache.description and cache.favicon and cache.language:
  189. print("Skipping (all good)", cache.url)
  190. continue
  191. if cache.url.startswith(
  192. (
  193. "https://www.la-grange.net",
  194. "https://gilest.org",
  195. "https://vasilis.nl",
  196. "https://www.danmcquillan.org",
  197. "https://public-inbox.org",
  198. )
  199. ) and (cache.og_image or cache.description or cache.favicon or cache.language):
  200. print("Skipping (known missing infos)", cache.url)
  201. continue
  202. print("Fetching metadata for", cache.url, cache.title)
  203. og_image, description, favicon, language = fetch_metadata(
  204. cache.title, cache.url, cache.description, cache.language
  205. )
  206. save(
  207. cache.title,
  208. cache.content,
  209. cache.url,
  210. cache.hash_url,
  211. cache.archive_date,
  212. os.path.join(CACHE_PATH, cache.hash_url),
  213. og_image,
  214. description,
  215. favicon,
  216. language,
  217. )
  218. @cli
  219. def new(url):
  220. """Turn the given URL into a MD and a HTML files.
  221. :url: The URL of the page to put into cache.
  222. """
  223. hash_url = hashlib.md5(url.encode("utf-8")).hexdigest()
  224. try:
  225. title, content = extract_page(url)
  226. except (
  227. lxml.etree.XMLSyntaxError,
  228. httpx.HTTPError,
  229. httpx.ReadTimeout,
  230. ) as e:
  231. print(f"WARNING: {e}")
  232. title, content = "", ""
  233. cache_path = os.path.join(CACHE_PATH, hash_url)
  234. if not os.path.exists(cache_path):
  235. os.makedirs(cache_path)
  236. archive_date = date.today()
  237. # Caching a markdown file.
  238. og_image, description, favicon, language = fetch_metadata(title, url, "", "")
  239. result_path = save(
  240. title,
  241. content,
  242. url,
  243. hash_url,
  244. archive_date,
  245. cache_path,
  246. og_image,
  247. description,
  248. favicon,
  249. language,
  250. )
  251. # Generating the HTML file.
  252. create(hash_url)
  253. md_line = f"> <cite>*[{title}]({url})*</cite>"
  254. print(md_line)
  255. os.popen(f'subl "{result_path}"')
  256. @wrap
  257. def perf_wrapper():
  258. start = perf_counter()
  259. yield
  260. elapsed = perf_counter() - start
  261. print(f"Done in {elapsed:.5f} seconds.")
# Script entry point: hand control over to minicli's `run`.
if __name__ == "__main__":
    run()