A place to cache linked articles (think custom and personal wayback machine)
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

cache.py 6.7KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228
  1. #!/usr/bin/env python3
  2. import codecs
  3. import fnmatch
  4. import hashlib
  5. import os
  6. from dataclasses import dataclass
  7. from datetime import date
  8. from pathlib import Path
  9. from time import perf_counter
  10. import httpx
  11. import lassie
  12. import lxml
  13. import markdown
  14. from jinja2 import Environment as Env
  15. from jinja2 import FileSystemLoader
  16. from minicli import cli, run, wrap
  17. from readability.readability import Document
  18. HERE = Path(".")
  19. YEAR = "2024"
  20. CACHE_PATH = HERE / "cache" / YEAR
  21. LOCAL_DOMAIN = "http://larlet.test:3579"
  22. environment = Env(loader=FileSystemLoader(str(HERE / "templates")))
  23. def parse_markdown(file_path):
  24. """Extract title, (HTML) content and metadata from a markdown file."""
  25. parser = markdown.Markdown(extensions=["meta"])
  26. with codecs.open(file_path, "r") as source:
  27. content = parser.convert(source.read())
  28. metadata = parser.Meta if hasattr(parser, "Meta") else None
  29. title = metadata["title"][0] if metadata is not None else ""
  30. return title, content, metadata
  31. def each_markdown_from(source_dir, file_name="index.md"):
  32. """Walk across the `source_dir` and return the md file paths."""
  33. for root, dirnames, filenames in os.walk(source_dir):
  34. for filename in fnmatch.filter(filenames, file_name):
  35. yield os.path.join(root, filename)
  36. @dataclass
  37. class Cache:
  38. title: str
  39. content: str
  40. url: str
  41. hash_url: str
  42. archive_date: str
  43. og_image: str
  44. description: str
  45. @staticmethod
  46. def all(source_dir=CACHE_PATH):
  47. for file_path in each_markdown_from(source_dir):
  48. title, content, metadata = parse_markdown(file_path)
  49. url = metadata["url"][0]
  50. hash_url = metadata["hash_url"][0]
  51. archive_date = metadata["archive_date"][0]
  52. og_image = metadata.get("og_image", [""])[0]
  53. description = metadata.get("description", [""])[0]
  54. yield Cache(
  55. title, content, url, hash_url, archive_date, og_image, description
  56. )
  57. @staticmethod
  58. def one(hash_url):
  59. return next(Cache.all(source_dir=CACHE_PATH / hash_url))
  60. def extract_page(url):
  61. """From an URL, extract title and content using Readability.
  62. The title is shortened through the `short_title` native method.
  63. The content doesn't contain `<body>` tags to be directly
  64. embeddable in the template and rendered as is.
  65. """
  66. # Retrieves the resource and turns it into a Readability doc.
  67. response = httpx.get(url)
  68. document = Document(response.text)
  69. # The short title is more concise and readable.
  70. title = document.short_title()
  71. content = document.summary(html_partial=True)
  72. # Removing the added <div> and spaces.
  73. content = content[5:-6].strip()
  74. return title, content
  75. def create(hash_url):
  76. """Turn new MD file into HTML file."""
  77. template = environment.get_template("cache_article.html")
  78. cache = Cache.one(hash_url)
  79. page = template.render(cache=cache)
  80. cache_target = CACHE_PATH / hash_url
  81. if not os.path.exists(cache_target):
  82. os.makedirs(cache_target)
  83. open(cache_target / "index.html", "w").write(page)
  84. print(f"Done: {LOCAL_DOMAIN}/david/cache/{YEAR}/{hash_url}/")
  85. def save(
  86. title, content, url, hash_url, archive_date, cache_path, og_image, description
  87. ):
  88. template = environment.get_template("cache_article.md")
  89. page = template.render(
  90. title=title,
  91. content=content,
  92. url=url,
  93. hash_url=hash_url,
  94. archive_date=archive_date,
  95. og_image=og_image,
  96. description=description,
  97. )
  98. result_path = os.path.join(cache_path, "index.md")
  99. open(result_path, "w").write(page)
  100. return result_path
  101. @cli
  102. def generate():
  103. """Generate caches MD files into HTML files."""
  104. cache_list = []
  105. template = environment.get_template("cache_article.html")
  106. for cache in Cache.all():
  107. page = template.render(cache=cache)
  108. open(CACHE_PATH / cache.hash_url / "index.html", "w").write(page)
  109. cache_list.append(cache)
  110. template = environment.get_template("cache_archives.html")
  111. page = template.render(cache_list=cache_list)
  112. open(CACHE_PATH / "index.html", "w").write(page)
  113. print(f"Done: {LOCAL_DOMAIN}/david/cache/{YEAR}/")
  114. def fetch_metadata(title, url, description):
  115. """Fetch additional metadata."""
  116. data = lassie.fetch(url)
  117. og_image = ""
  118. for image in data.get("images"):
  119. if image.get("type") == "og:image":
  120. og_image = image["src"]
  121. break
  122. if data.get("title") != title:
  123. print(data.get("title"), "vs.", title, url)
  124. description = description or data.get("description", "")
  125. return og_image, description
  126. @cli
  127. def metadata():
  128. """Fetch additional metadata for existing archives."""
  129. for cache in Cache.all():
  130. if cache.description or cache.url.startswith(
  131. (
  132. "https://www.la-grange.net",
  133. "https://tw5.immateriel.fr",
  134. "https://gilest.org",
  135. "https://vasilis.nl",
  136. "https://www.danmcquillan.org",
  137. "https://public-inbox.org",
  138. )
  139. ):
  140. print("Skipping", cache.url)
  141. continue
  142. print("Fetching metadata for", cache.url, cache.title)
  143. og_image, description = fetch_metadata(
  144. cache.title, cache.url, cache.description
  145. )
  146. save(
  147. cache.title,
  148. cache.content,
  149. cache.url,
  150. cache.hash_url,
  151. cache.archive_date,
  152. os.path.join(CACHE_PATH, cache.hash_url),
  153. og_image,
  154. description,
  155. )
  156. @cli
  157. def new(url):
  158. """Turn the given URL into a MD and a HTML files.
  159. :url: The URL of the page to put into cache.
  160. """
  161. hash_url = hashlib.md5(url.encode("utf-8")).hexdigest()
  162. try:
  163. title, content = extract_page(url)
  164. except (
  165. lxml.etree.XMLSyntaxError,
  166. httpx.HTTPError,
  167. httpx.ReadTimeout,
  168. ) as e:
  169. print(f"WARNING: {e}")
  170. title, content = "", ""
  171. cache_path = os.path.join(CACHE_PATH, hash_url)
  172. if not os.path.exists(cache_path):
  173. os.makedirs(cache_path)
  174. archive_date = date.today()
  175. # Caching a markdown file.
  176. og_image, description = fetch_metadata(title, url, "")
  177. result_path = save(
  178. title, content, url, hash_url, archive_date, cache_path, og_image, description
  179. )
  180. # Generating the HTML file.
  181. create(hash_url)
  182. md_line = f"> <cite>*[{title}]({url})*</cite>"
  183. print(md_line)
  184. os.popen(f'subl "{result_path}"')
  185. @wrap
  186. def perf_wrapper():
  187. start = perf_counter()
  188. yield
  189. elapsed = perf_counter() - start
  190. print(f"Done in {elapsed:.5f} seconds.")
  191. if __name__ == "__main__":
  192. run()