4 years ago · c4670345c2
--- a/cache.py
+++ b/cache.py
 #!/usr/bin/env python3
 import codecs
 import fnmatch
 import hashlib
 import os
 from dataclasses import dataclass
 from pathlib import Path
 from time import perf_counter
 import httpx
 import lxml
 import markdown
 from jinja2 import Environment as Env
 from jinja2 import FileSystemLoader
 from minicli import cli, run, wrap
 from readability.readability import Document
 # All paths are relative to the current working directory.
 # NOTE(review): this presumably means the script must be run from the
 # repository root — confirm.
 HERE = Path(".")
 DAVID = HERE / "david"
 # Root of the cache tree: each cached article lives in a sub-directory
 # named after its `hash_url`, next to the archives `index.html`.
 CACHE_PATH = DAVID / "cache"
 # NOTE(review): DOMAIN is not referenced in the visible part of this file.
 DOMAIN = "https://larlet.fr"
 # Jinja2 environment loading templates from `david/templates`; no
 # `autoescape` argument is passed here.
 environment = Env(loader=FileSystemLoader(str(DAVID / "templates")))
 def parse_markdown(file_path):
    """Extract title, (HTML) content and metadata from a markdown file.

    :file_path: Path of the markdown file to read; it is opened in text
        mode with the platform's default encoding.

    Returns a `(title, content, metadata)` tuple. `content` is the HTML
    produced by the converter and `metadata` is the parser's `Meta`
    attribute (or `None` if the parser exposes none). `title` is the first
    value of the `title` metadata key, or an empty string when that key is
    absent or has no value.
    """
    parser = markdown.Markdown(extensions=["meta"])
    # The built-in `open` is what `codecs.open` returns when no encoding is
    # given, so the file is read exactly as before.
    with open(file_path, "r") as source:
        content = parser.convert(source.read())
    metadata = getattr(parser, "Meta", None)
    # A file without a `title:` header must yield "" instead of a KeyError.
    title_values = metadata.get("title") if metadata else None
    title = title_values[0] if title_values else ""
    return title, content, metadata
 def each_markdown_from(source_dir, file_name="index.md"):
    """Yield the path of every file matching `file_name` below `source_dir`.

    The tree is explored with `os.walk`; `file_name` is an `fnmatch`
    pattern applied to the file names of each visited directory.
    """
    for directory, _subdirs, entries in os.walk(source_dir):
        yield from (
            os.path.join(directory, entry)
            for entry in fnmatch.filter(entries, file_name)
        )
@dataclass
class Cache:
    """A cached article, as read back from its markdown file."""

    # First value of the `title` metadata key ("" when unavailable).
    title: str
    # HTML produced by converting the markdown body.
    content: str
    # First value of the `url` metadata key.
    url: str
    # First value of the `hash_url` metadata key.
    hash_url: str

    @staticmethod
    def all(source_dir=CACHE_PATH):
        """Yield a `Cache` for each markdown file found under `source_dir`."""
        for md_path in each_markdown_from(source_dir):
            page_title, html, meta = parse_markdown(md_path)
            yield Cache(
                title=page_title,
                content=html,
                url=meta["url"][0],
                hash_url=meta["hash_url"][0],
            )

    @staticmethod
    def one(hash_url):
        """Return the first `Cache` found under `CACHE_PATH / hash_url`.

        Raises `StopIteration` when that directory yields no markdown file.
        """
        matches = Cache.all(source_dir=CACHE_PATH / hash_url)
        return next(matches)
 def extract_page(url):
    """Fetch `url` and return its readable `(title, content)` pair.

    The title is Readability's `short_title()`. The content is the
    partial-HTML summary (`html_partial=True`) with its first five and
    last six characters dropped, then stripped of surrounding whitespace.
    Those characters are the wrapping `<div>` and `</div>` added to the
    summary; without them the content can be embedded in a template as is.
    """
    # Download the resource and hand its text to Readability.
    readable = Document(httpx.get(url).text)
    short = readable.short_title()
    wrapped = readable.summary(html_partial=True)
    # Slice off the leading "<div>" (5 chars) and trailing "</div>" (6 chars).
    return short, wrapped[5:-6].strip()
 def create(hash_url):
    """Turn new MD file into HTML file.

    :hash_url: Name of the sub-directory of `CACHE_PATH` whose markdown
        file is rendered with the `cache_article.html` template; the
        result is written to `index.html` in that same sub-directory.

    Raises `StopIteration` (propagated from `Cache.one`) when no markdown
    file is found for `hash_url`.
    """
    template = environment.get_template("cache_article.html")
    cache = Cache.one(hash_url)
    page = template.render(cache=cache)
    cache_target = CACHE_PATH / hash_url
    # `exist_ok` replaces the separate exists() check and its
    # check-then-create race.
    cache_target.mkdir(parents=True, exist_ok=True)
    # `write_text` opens, writes and closes the file itself, instead of
    # leaving an anonymous file object to be closed by garbage collection.
    (cache_target / "index.html").write_text(page)
    print(f"Done: http://larlet.test:8001/david/cache/{hash_url}/")
@cli
def generate():
    """Generate caches MD files into HTML files."""
    # NOTE(review): the docstring is kept verbatim because minicli
    # presumably surfaces it as the command's help text — confirm.
    #
    # Every markdown file yielded by `Cache.all()` is rendered to an
    # `index.html` inside its `hash_url` directory, then the archives index
    # listing all of them is written at the root of `CACHE_PATH`.
    article_template = environment.get_template("cache_article.html")
    caches = []
    for cache in Cache.all():
        page = article_template.render(cache=cache)
        # `write_text` closes each file as soon as it is written, so no
        # handle stays open while the remaining caches are processed.
        (CACHE_PATH / cache.hash_url / "index.html").write_text(page)
        caches.append(cache)
    archives_template = environment.get_template("cache_archives.html")
    archives_page = archives_template.render(caches=caches)
    (CACHE_PATH / "index.html").write_text(archives_page)
    print("Done: http://larlet.test:8001/david/cache/")
@cli
def new(url):
    """Turn the given URL into a MD and a HTML files.
    :url: The URL of the page to put into cache.
    """
    # NOTE(review): the docstring is kept verbatim because minicli
    # presumably surfaces it as command and parameter help — confirm.
    #
    # The cache directory is named after the MD5 hex digest of the URL.
    hash_url = hashlib.md5(url.encode("utf-8")).hexdigest()
    url_cache = f"/david/cache/{hash_url}/"
    # Printed up front so the link is available even if extraction fails.
    link_line = f"]({url}) ([cache]({url_cache}))"
    print(link_line)
    try:
        title, content = extract_page(url)
    except (lxml.etree.XMLSyntaxError, httpx.exceptions.HTTPError) as e:
        # Best effort: still create an (empty) cache entry.
        print(f"WARNING: {e}")
        title, content = "", ""
    cache_path = os.path.join(CACHE_PATH, hash_url)
    # `exist_ok` replaces the separate exists() check and its
    # check-then-create race.
    os.makedirs(cache_path, exist_ok=True)
    # Caching a markdown file.
    template = environment.get_template("cache_article.md")
    page = template.render(title=title, content=content, url=url, hash_url=hash_url)
    result_path = os.path.join(cache_path, "index.md")
    # The file must be flushed and closed before `create()` reads it back;
    # the `with` block guarantees that on every Python implementation.
    with open(result_path, "w") as md_file:
        md_file.write(page)
    # Generating the HTML file.
    create(hash_url)
    md_line = f"> <cite>*[{title}]({url})* ([cache]({url_cache}))</cite>"
    print(md_line)
    # Opens the markdown file with the `subl` command (presumably Sublime
    # Text). The path is built from CACHE_PATH and a hex digest only, so it
    # carries no shell metacharacters.
    os.popen(f'subl "{result_path}"')
    return md_line
@wrap
def perf_wrapper():
    """Print the time elapsed between both sides of the `yield`."""
    began = perf_counter()
    yield
    # Measured with `perf_counter`, reported with five decimals.
    print(f"Done in {perf_counter() - began:.5f} seconds.")
 # Script entry point: only runs when the file is executed directly.
 # `run` comes from minicli; it presumably dispatches to the `@cli`
 # commands defined in this file — confirm against the minicli docs.
 if __name__ == "__main__":
    run()
--- a/david/templates/cache_archives.html
+++ b/david/templates/cache_archives.html
 {% extends "base_2019.html" %}
 {# Index of every cached article, rendered by `generate()` with `caches`.
    Titles and URLs come from third-party pages and the Jinja environment
    is created without autoescaping, so they are escaped explicitly. #}
 {% block title %}Articles archivés{% endblock %}
 {% block extra_head %}
  <meta name="robots" content="noindex, nofollow">
  <!-- Canonical URL for SEO purposes -->
  <link rel="canonical" href="https://larlet.fr/david/cache/">
 {% endblock %}
 {% block content %}
 <h1>
  <span><a id="jumper" href="#jumpto" title="Un peu perdu ?">?</a></span>
  Articles archivés
  <time>Pour la pérennité des contenus liés. Non-indexé, retrait sur simple email.</time>
 </h1>
 <section>
  <article>
    <ul>
      {% for cache in caches %}
        <li><a href="/david/cache/{{ cache.hash_url }}/" title="Accès à l'article caché">{{ cache.title|e }}</a> (<a href="{{ cache.url|e }}" title="Accès à l'article original">original</a>)</li>
      {% endfor %}
    </ul>
  </article>
 </section>
 {% endblock content %}
 {% block nav %}
 <nav id="jumpto">
  <p>
    <a href="/david/blog/">Accueil du blog</a> |
    <a href="/david/stream/2019/">Accueil du flux</a>
  </p>
 </nav>
 {% endblock nav %}
--- a/david/templates/cache_article.html
+++ b/david/templates/cache_article.html
 {% extends "base_2019.html" %}
 {# Page for a single cached article, rendered with a `cache` object.
    The title and URL come from a third-party page and the Jinja
    environment is created without autoescaping, so they are escaped
    explicitly. `cache.content` is already HTML and is emitted as is. #}
 {% block title %}{{ cache.title|e }} (archive){% endblock %}
 {% block extra_head %}
  <meta name="robots" content="noindex, nofollow">
  <meta content="origin-when-cross-origin" name="referrer">
  <!-- Canonical URL for SEO purposes -->
  <link rel="canonical" href="{{ cache.url|e }}">
 {% endblock %}
 {% block content %}
 <h1>
  <span><a id="jumper" href="#jumpto" title="Un peu perdu ?">?</a></span>
  {{ cache.title|e }} (archive)
  <time>Pour la pérennité des contenus liés. Non-indexé, retrait sur simple email.</time>
 </h1>
 <section>
  <article>
    <h3><a href="{{ cache.url|e }}">Source originale du contenu</a></h3>
    {{ cache.content }}
  </article>
 </section>
 {% endblock content %}
 {% block nav %}
 <nav id="jumpto">
  <p>
    <a href="/david/blog/">Accueil du blog</a> |
    <a href="{{ cache.url|e }}">Source originale</a> |
    <a href="/david/stream/2019/">Accueil du flux</a>
  </p>
 </nav>
 {% endblock nav %}
--- a/david/templates/cache_article.md
+++ b/david/templates/cache_article.md
 title: {{ title }}
 url: {{ url }}
 hash_url: {{ hash_url }}
 {{ content }}
 {#- Rendered by `new()`; the three header lines are read back as metadata by `parse_markdown()`. The dashes strip the whitespace around this comment so the rendered file is unchanged. -#}
--- a/requirements.txt
+++ b/requirements.txt
 Jinja2==2.10.3
 Markdown==3.1.1
 httpx==0.7.8
 minicli==0.4.4
 readability-lxml==0.7.1