A place to cache linked articles (think custom and personal wayback machine)
您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156
  1. #!/usr/bin/env python3
  2. import codecs
  3. import fnmatch
  4. import hashlib
  5. import os
  6. from dataclasses import dataclass
  7. from pathlib import Path
  8. from time import perf_counter
  9. import httpx
  10. import lxml
  11. import markdown
  12. from jinja2 import Environment as Env
  13. from jinja2 import FileSystemLoader
  14. from minicli import cli, run, wrap
  15. from readability.readability import Document
  16. HERE = Path(".")
  17. CACHE_PATH = HERE / "cache" / "2021"
  18. LOCAL_DOMAIN = "http://larlet.test:3579"
  19. environment = Env(loader=FileSystemLoader(str(HERE / "templates")))
  20. def parse_markdown(file_path):
  21. """Extract title, (HTML) content and metadata from a markdown file."""
  22. parser = markdown.Markdown(extensions=["meta"])
  23. with codecs.open(file_path, "r") as source:
  24. content = parser.convert(source.read())
  25. metadata = parser.Meta if hasattr(parser, "Meta") else None
  26. title = metadata["title"][0] if metadata is not None else ""
  27. return title, content, metadata
  28. def each_markdown_from(source_dir, file_name="index.md"):
  29. """Walk across the `source_dir` and return the md file paths."""
  30. for root, dirnames, filenames in os.walk(source_dir):
  31. for filename in fnmatch.filter(filenames, file_name):
  32. yield os.path.join(root, filename)
  33. @dataclass
  34. class Cache:
  35. title: str
  36. content: str
  37. url: str
  38. hash_url: str
  39. @staticmethod
  40. def all(source_dir=CACHE_PATH):
  41. for file_path in each_markdown_from(source_dir):
  42. title, content, metadata = parse_markdown(file_path)
  43. url = metadata["url"][0]
  44. hash_url = metadata["hash_url"][0]
  45. yield Cache(title, content, url, hash_url)
  46. @staticmethod
  47. def one(hash_url):
  48. return next(Cache.all(source_dir=CACHE_PATH / hash_url))
  49. def extract_page(url):
  50. """From an URL, extract title and content using Readability.
  51. The title is shortened through the `short_title` native method.
  52. The content doesn't contain `<body>` tags to be directly
  53. embeddable in the template and rendered as is.
  54. """
  55. # Retrieves the resource and turns it into a Readability doc.
  56. response = httpx.get(url)
  57. document = Document(response.text)
  58. # The short title is more concise and readable.
  59. title = document.short_title()
  60. content = document.summary(html_partial=True)
  61. # Removing the added <div> and spaces.
  62. content = content[5:-6].strip()
  63. return title, content
  64. def create(hash_url):
  65. """Turn new MD file into HTML file."""
  66. template = environment.get_template("cache_article.html")
  67. cache = Cache.one(hash_url)
  68. page = template.render(cache=cache)
  69. cache_target = CACHE_PATH / hash_url
  70. if not os.path.exists(cache_target):
  71. os.makedirs(cache_target)
  72. open(cache_target / "index.html", "w").write(page)
  73. print(f"Done: {LOCAL_DOMAIN}/david/cache/2021/{hash_url}/")
  74. @cli
  75. def generate():
  76. """Generate caches MD files into HTML files."""
  77. cache_list = []
  78. template = environment.get_template("cache_article.html")
  79. for cache in Cache.all():
  80. page = template.render(cache=cache)
  81. open(CACHE_PATH / cache.hash_url / "index.html", "w").write(page)
  82. cache_list.append(cache)
  83. template = environment.get_template("cache_archives.html")
  84. page = template.render(cache_list=cache_list)
  85. open(CACHE_PATH / "index.html", "w").write(page)
  86. print(f"Done: {LOCAL_DOMAIN}/david/cache/2021/")
  87. @cli
  88. def new(url):
  89. """Turn the given URL into a MD and a HTML files.
  90. :url: The URL of the page to put into cache.
  91. """
  92. hash_url = hashlib.md5(url.encode("utf-8")).hexdigest()
  93. url_cache = f"/david/cache/2021/{hash_url}/"
  94. link_line = f"]({url}) ([cache]({url_cache}))"
  95. print(link_line)
  96. try:
  97. title, content = extract_page(url)
  98. except (
  99. lxml.etree.XMLSyntaxError,
  100. httpx.HTTPError,
  101. httpx.ReadTimeout,
  102. ) as e:
  103. print(f"WARNING: {e}")
  104. title, content = "", ""
  105. cache_path = os.path.join(CACHE_PATH, hash_url)
  106. if not os.path.exists(cache_path):
  107. os.makedirs(cache_path)
  108. # Caching a markdown file.
  109. template = environment.get_template("cache_article.md")
  110. page = template.render(title=title, content=content, url=url, hash_url=hash_url)
  111. result_path = os.path.join(cache_path, "index.md")
  112. open(result_path, "w").write(page)
  113. # Generating the HTML file.
  114. create(hash_url)
  115. md_line = f"> <cite>*[{title}]({url})* ([cache]({url_cache}))</cite>"
  116. print(md_line)
  117. os.popen(f'subl "{result_path}"')
  118. return md_line
  119. @wrap
  120. def perf_wrapper():
  121. start = perf_counter()
  122. yield
  123. elapsed = perf_counter() - start
  124. print(f"Done in {elapsed:.5f} seconds.")
# Script entry point: call minicli's `run` only when the file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    run()