Browse Source

Externalize cache to a dedicated repository

master
David Larlet 4 years ago
parent
commit
c4670345c2
No known key found for this signature in database

+ 0
- 153
cache.py View File

#!/usr/bin/env python3

import codecs
import fnmatch
import hashlib
import os
from dataclasses import dataclass
from pathlib import Path
from time import perf_counter

import httpx
import lxml
import markdown
from jinja2 import Environment as Env
from jinja2 import FileSystemLoader
from minicli import cli, run, wrap
from readability.readability import Document

# All paths are relative to the current working directory.
HERE = Path(".")
DAVID = HERE / "david"
# Folder holding one sub-folder per cached article, named after its hash.
CACHE_PATH = DAVID / "cache"
DOMAIN = "https://larlet.fr"


# Jinja environment loading templates from the `david/templates` folder.
environment = Env(loader=FileSystemLoader(str(DAVID / "templates")))


def parse_markdown(file_path):
    """Extract title, (HTML) content and metadata from a markdown file.

    :file_path: Path of the markdown file to read.

    Returns a `(title, content, metadata)` tuple: `content` is the HTML
    produced by the conversion, `metadata` is the parser's `Meta` attribute
    (`None` if the parser does not expose one) and `title` is the first
    `title` metadata value, or an empty string when there is none.
    """
    parser = markdown.Markdown(extensions=["meta"])
    with codecs.open(file_path, "r") as source:
        content = parser.convert(source.read())
    metadata = getattr(parser, "Meta", None)
    # The metadata can be present without any `title` key, so never index
    # it blindly: fall back to an empty title instead of raising KeyError.
    titles = metadata.get("title") if metadata else None
    title = titles[0] if titles else ""
    return title, content, metadata


def each_markdown_from(source_dir, file_name="index.md"):
    """Yield the path of every file matching `file_name` under `source_dir`.

    The tree is explored recursively and `file_name` may be a shell-style
    pattern (it is matched through `fnmatch`).
    """
    for current_dir, _subdirs, names in os.walk(source_dir):
        matching = fnmatch.filter(names, file_name)
        yield from (os.path.join(current_dir, name) for name in matching)


@dataclass
class Cache:
    """A cached article, built from the metadata and body of a markdown file."""

    title: str
    content: str
    url: str
    hash_url: str

    @staticmethod
    def all(source_dir=CACHE_PATH):
        """Yield a `Cache` for each markdown file found under `source_dir`."""
        for md_path in each_markdown_from(source_dir):
            title, html, meta = parse_markdown(md_path)
            yield Cache(
                title=title,
                content=html,
                url=meta["url"][0],
                hash_url=meta["hash_url"][0],
            )

    @staticmethod
    def one(hash_url):
        """Return the first `Cache` found in the `hash_url` sub-folder."""
        caches = Cache.all(source_dir=CACHE_PATH / hash_url)
        return next(caches)


def extract_page(url):
    """Fetch `url` and return its `(title, content)` through Readability.

    The title is the one returned by Readability's `short_title` method.
    The content is an HTML fragment without `<body>` tags, so that it can
    be embedded in the template and rendered as is.
    """
    # Download the page and hand its text over to Readability.
    html = httpx.get(url).text
    readable = Document(html)

    # Prefer the short title: more concise and readable.
    title = readable.short_title()
    summary = readable.summary(html_partial=True)

    # Drop the 5 leading and 6 trailing characters (the added <div>
    # wrapper), then the surrounding spaces.
    return title, summary[5:-6].strip()


def create(hash_url):
    """Turn new MD file into HTML file.

    :hash_url: Name of the folder, under `CACHE_PATH`, whose markdown file
               is rendered; the resulting `index.html` is written there.
    """
    template = environment.get_template("cache_article.html")
    cache = Cache.one(hash_url)
    page = template.render(cache=cache)
    cache_target = CACHE_PATH / hash_url
    # `exist_ok` replaces the racy "check for existence, then create" dance.
    cache_target.mkdir(parents=True, exist_ok=True)
    # `write_text` closes the file once written, where a bare
    # `open(...).write(...)` leaves the handle to the garbage collector.
    (cache_target / "index.html").write_text(page)
    print(f"Done: http://larlet.test:8001/david/cache/{hash_url}/")


@cli
def generate():
    """Generate caches MD files into HTML files."""
    # One HTML page per cached article, written next to its markdown file.
    article_template = environment.get_template("cache_article.html")
    caches = []
    for cache in Cache.all():
        page = article_template.render(cache=cache)
        # `write_text` closes the file once written, where a bare
        # `open(...).write(...)` leaves the handle to the garbage collector.
        (CACHE_PATH / cache.hash_url / "index.html").write_text(page)
        caches.append(cache)

    # Then the archives page, listing every cache rendered above.
    archives_template = environment.get_template("cache_archives.html")
    page = archives_template.render(caches=caches)
    (CACHE_PATH / "index.html").write_text(page)
    print("Done: http://larlet.test:8001/david/cache/")


@cli
def new(url):
    """Turn the given URL into a MD and a HTML files.

    The markdown citation line pointing to both the original page and its
    cache is printed, then returned.

    :url: The URL of the page to put into cache.
    """
    # The MD5 digest is only used as a folder name derived from the URL.
    hash_url = hashlib.md5(url.encode("utf-8")).hexdigest()
    url_cache = f"/david/cache/{hash_url}/"
    link_line = f"]({url}) ([cache]({url_cache}))"
    print(link_line)
    try:
        title, content = extract_page(url)
    except (lxml.etree.XMLSyntaxError, httpx.exceptions.HTTPError,) as e:
        # Best effort: an empty cache entry is still created on failure.
        print(f"WARNING: {e}")
        title, content = "", ""
    cache_path = CACHE_PATH / hash_url
    # `exist_ok` replaces the racy "check for existence, then create" dance.
    cache_path.mkdir(parents=True, exist_ok=True)
    # Caching a markdown file.
    template = environment.get_template("cache_article.md")
    page = template.render(title=title, content=content, url=url, hash_url=hash_url)
    result_path = cache_path / "index.md"
    # `write_text` closes the file before `create` reads it back.
    result_path.write_text(page)
    # Generating the HTML file.
    create(hash_url)
    md_line = f"> <cite>*[{title}]({url})* ([cache]({url_cache}))</cite>"
    print(md_line)
    # Hand the markdown file to the `subl` command (presumably the Sublime
    # Text launcher) without waiting for it nor checking its outcome.
    os.popen(f'subl "{result_path}"')
    return md_line


@wrap
def perf_wrapper():
    """Print the time spent between the two halves of this generator."""
    started_at = perf_counter()
    yield
    # Resumed after the `yield`: measure and report the elapsed time.
    elapsed = perf_counter() - started_at
    print(f"Done in {elapsed:.5f} seconds.")


# Only when executed as a script: hand control over to minicli's `run`
# (presumably dispatching to the `@cli` commands defined above).
if __name__ == "__main__":
    run()

+ 0
- 31
david/templates/cache_archives.html View File

{% extends "base_2019.html" %}
{% block title %}Articles archivés{% endblock %}
{% block extra_head %}
<meta name="robots" content="noindex, nofollow">
<!-- Canonical URL for SEO purposes -->
<link rel="canonical" href="https://larlet.fr/david/cache/">
{% endblock %}
{% block content %}
<h1>
<span><a id="jumper" href="#jumpto" title="Un peu perdu ?">?</a></span>
Articles archivés
<time>Pour la pérennité des contenus liés. Non-indexé, retrait sur simple email.</time>
</h1>
<section>
<article>
<ul>
{% for cache in caches %}
<li><a href="/david/cache/{{ cache.hash_url }}/" title="Accès à l'article caché">{{ cache.title }}</a> (<a href="{{ cache.url }}" title="Accès à l'article original">original</a>)</li>
{% endfor %}
</ul>
</article>
</section>
{% endblock content %}
{% block nav %}
<nav id="jumpto">
<p>
<a href="/david/blog/">Accueil du blog</a> |
<a href="/david/stream/2019/">Accueil du flux</a>
</p>
</nav>
{% endblock nav %}

+ 0
- 30
david/templates/cache_article.html View File

{% extends "base_2019.html" %}
{% block title %}{{ cache.title }} (archive){% endblock %}
{% block extra_head %}
<meta name="robots" content="noindex, nofollow">
<meta content="origin-when-cross-origin" name="referrer">
<!-- Canonical URL for SEO purposes -->
<link rel="canonical" href="{{ cache.url }}">
{% endblock %}
{% block content %}
<h1>
<span><a id="jumper" href="#jumpto" title="Un peu perdu ?">?</a></span>
{{ cache.title }} (archive)
<time>Pour la pérennité des contenus liés. Non-indexé, retrait sur simple email.</time>
</h1>
<section>
<article>
<h3><a href="{{ cache.url }}">Source originale du contenu</a></h3>
{{ cache.content }}
</article>
</section>
{% endblock content %}
{% block nav %}
<nav id="jumpto">
<p>
<a href="/david/blog/">Accueil du blog</a> |
<a href="{{ cache.url }}">Source originale</a> |
<a href="/david/stream/2019/">Accueil du flux</a>
</p>
</nav>
{% endblock nav %}

+ 0
- 5
david/templates/cache_article.md View File

title: {{ title }}
url: {{ url }}
hash_url: {{ hash_url }}

{{ content }}

+ 0
- 2
requirements.txt View File

Jinja2==2.10.3
Markdown==3.1.1
httpx==0.7.8
minicli==0.4.4
readability-lxml==0.7.1

Loading…
Cancel
Save