|
|
|
|
|
|
|
|
#!/usr/bin/env python3 |
|
|
|
|
|
|
|
|
|
|
|
import codecs |
|
|
|
|
|
import fnmatch |
|
|
|
|
|
import hashlib |
|
|
|
|
|
import os |
|
|
|
|
|
from dataclasses import dataclass |
|
|
|
|
|
from pathlib import Path |
|
|
|
|
|
from time import perf_counter |
|
|
|
|
|
|
|
|
|
|
|
import httpx |
|
|
|
|
|
import lxml |
|
|
|
|
|
import markdown |
|
|
|
|
|
from jinja2 import Environment as Env |
|
|
|
|
|
from jinja2 import FileSystemLoader |
|
|
|
|
|
from minicli import cli, run, wrap |
|
|
|
|
|
from readability.readability import Document |
|
|
|
|
|
|
|
|
|
|
|
# Filesystem layout; every path is relative to the current working directory.
HERE = Path(".")
DAVID = HERE.joinpath("david")
CACHE_PATH = DAVID.joinpath("cache")
DOMAIN = "https://larlet.fr"

# Jinja2 environment loading its templates from the `david/templates` folder.
environment = Env(
    loader=FileSystemLoader(str(DAVID.joinpath("templates"))),
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def parse_markdown(file_path):
    """Extract title, (HTML) content and metadata from a markdown file.

    The file is decoded as UTF-8 and converted with the `meta` extension
    enabled.

    :file_path: Path of the markdown file to parse.
    :return: A `(title, content, metadata)` tuple. `title` is the first
        value of the `title` metadata key, or an empty string when that
        key (or the metadata as a whole) is missing. `metadata` is the
        parser's `Meta` attribute, or `None` when the parser has none.
    """
    parser = markdown.Markdown(extensions=["meta"])
    # Decode explicitly as UTF-8 instead of relying on the locale default,
    # so that the result does not depend on the machine running the script.
    with open(file_path, encoding="utf-8") as source:
        content = parser.convert(source.read())
    metadata = getattr(parser, "Meta", None)
    # A file without a `title` key (or with empty metadata) gets an empty
    # title rather than raising a `KeyError`.
    title_values = metadata.get("title") if metadata else None
    title = title_values[0] if title_values else ""
    return title, content, metadata
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def each_markdown_from(source_dir, file_name="index.md"):
    """Yield the path of each file below `source_dir` matching `file_name`.

    The tree is walked recursively with `os.walk`; `file_name` is used as
    an `fnmatch` pattern against the file names of every visited folder.
    """
    for directory, _subdirs, names in os.walk(source_dir):
        matches = fnmatch.filter(names, file_name)
        yield from (os.path.join(directory, match) for match in matches)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class Cache:
    """A cached article, built from a markdown file of the cache folder."""

    # Title returned by `parse_markdown` for the file.
    title: str
    # Converted (HTML) body returned by `parse_markdown`.
    content: str
    # First value of the `url` metadata key.
    url: str
    # First value of the `hash_url` metadata key.
    hash_url: str

    @staticmethod
    def all(source_dir=CACHE_PATH):
        """Yield a `Cache` for every markdown file found under `source_dir`."""
        for md_path in each_markdown_from(source_dir):
            parsed_title, html, meta = parse_markdown(md_path)
            yield Cache(
                title=parsed_title,
                content=html,
                url=meta["url"][0],
                hash_url=meta["hash_url"][0],
            )

    @staticmethod
    def one(hash_url):
        """Return the first `Cache` found in the `hash_url` cache folder.

        Raises `StopIteration` when that folder yields no markdown file.
        """
        found = Cache.all(source_dir=CACHE_PATH / hash_url)
        return next(found)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_page(url):
    """From an URL, extract title and content using Readability.

    The title is shortened through the `short_title` native method.
    The content doesn't contain `<body>` tags to be directly
    embeddable in the template and rendered as is.

    :url: Address of the page to fetch.
    :return: A `(title, content)` tuple.
    """
    # Retrieves the resource and turns it into a Readability doc.
    response = httpx.get(url)
    document = Document(response.text)

    # The short title is more concise and readable.
    title = document.short_title()
    content = document.summary(html_partial=True).strip()

    # Remove the added wrapping <div>, but only when it is really there:
    # slicing a fixed number of characters blindly would otherwise eat
    # actual content.
    opening, closing = "<div>", "</div>"
    if content.startswith(opening) and content.endswith(closing):
        content = content[len(opening):-len(closing)].strip()
    return title, content
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def create(hash_url):
    """Turn new MD file into HTML file.

    Renders the `cache_article.html` template with the cache entry found
    under `hash_url` and writes the page as `index.html` in that folder.

    :hash_url: Name of the cache folder, below `CACHE_PATH`, to render.
    """
    template = environment.get_template("cache_article.html")
    cache = Cache.one(hash_url)
    page = template.render(cache=cache)
    cache_target = CACHE_PATH / hash_url
    # `exist_ok` replaces the racy "check then create" sequence.
    os.makedirs(cache_target, exist_ok=True)
    # The context manager closes the file deterministically, and the
    # explicit encoding keeps the output independent from the locale.
    with open(cache_target / "index.html", "w", encoding="utf-8") as html_file:
        html_file.write(page)
    print(f"Done: http://larlet.test:8001/david/cache/{hash_url}/")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@cli
def generate():
    """Generate caches MD files into HTML files."""
    caches = []
    # One HTML page per cached article, written next to its markdown file.
    template = environment.get_template("cache_article.html")
    for cache in Cache.all():
        page = template.render(cache=cache)
        article_path = CACHE_PATH / cache.hash_url / "index.html"
        # Context manager + explicit encoding: the handle is always closed
        # and the output does not depend on the locale.
        with open(article_path, "w", encoding="utf-8") as html_file:
            html_file.write(page)
        caches.append(cache)

    # Archive page listing every cache collected above.
    template = environment.get_template("cache_archives.html")
    page = template.render(caches=caches)
    with open(CACHE_PATH / "index.html", "w", encoding="utf-8") as html_file:
        html_file.write(page)
    print("Done: http://larlet.test:8001/david/cache/")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@cli
def new(url):
    """Turn the given URL into a MD and a HTML files.

    :url: The URL of the page to put into cache.
    """
    # The MD5 digest of the URL only serves as the cache folder name.
    hash_url = hashlib.md5(url.encode("utf-8")).hexdigest()
    url_cache = f"/david/cache/{hash_url}/"
    link_line = f"]({url}) ([cache]({url_cache}))"
    print(link_line)
    # Extraction is best effort: on failure an empty entry is still cached.
    # `httpx.HTTPError` is the top-level exception name; the
    # `httpx.exceptions` module is not available in current httpx releases.
    try:
        title, content = extract_page(url)
    except (lxml.etree.XMLSyntaxError, httpx.HTTPError) as e:
        print(f"WARNING: {e}")
        title, content = "", ""
    cache_path = os.path.join(CACHE_PATH, hash_url)
    # `exist_ok` replaces the racy "check then create" sequence.
    os.makedirs(cache_path, exist_ok=True)
    # Caching a markdown file.
    template = environment.get_template("cache_article.md")
    page = template.render(title=title, content=content, url=url, hash_url=hash_url)
    result_path = os.path.join(cache_path, "index.md")
    # Written as UTF-8 and closed before `create` reads the file back.
    with open(result_path, "w", encoding="utf-8") as md_file:
        md_file.write(page)
    # Generating the HTML file.
    create(hash_url)
    md_line = f"> <cite>*[{title}]({url})* ([cache]({url_cache}))</cite>"
    print(md_line)
    # Launches `subl` on the markdown file (presumably the Sublime Text
    # command line launcher, to review the result by hand).
    os.popen(f'subl "{result_path}"')
    return md_line
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@wrap
def perf_wrapper():
    """Print the time elapsed between entering and resuming this wrapper."""
    started_at = perf_counter()
    # Control is handed back here; presumably minicli runs the command
    # before resuming the generator.
    yield
    print(f"Done in {perf_counter() - started_at:.5f} seconds.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: hand control over to minicli's `run` when the file
# is executed directly (not when it is imported).
if __name__ == "__main__":
    run()
|
|
|