#!/usr/bin/env python3
import codecs
import fnmatch
import hashlib
import os
from dataclasses import dataclass
from datetime import date
from pathlib import Path
from time import perf_counter

import httpx
import lassie
import lxml.etree  # imported explicitly so `lxml.etree.XMLSyntaxError` resolves in `new()` below
import markdown
from jinja2 import Environment as Env
from jinja2 import FileSystemLoader
from minicli import cli, run, wrap
from readability.readability import Document

HERE = Path(".")
YEAR = "2024"
CACHE_PATH = HERE / "cache" / YEAR
LOCAL_DOMAIN = "http://larlet.test:3579"
environment = Env(loader=FileSystemLoader(str(HERE / "templates")))


def parse_markdown(file_path):
    """Extract title, (HTML) content and metadata from a markdown file."""
    parser = markdown.Markdown(extensions=["meta"])
    with codecs.open(file_path, "r") as source:
        content = parser.convert(source.read())
        # The `meta` extension exposes each header as a list of strings.
        metadata = parser.Meta if hasattr(parser, "Meta") else None
        title = metadata.get("title", [""])[0] if metadata else ""
    return title, content, metadata


def each_markdown_from(source_dir, file_name="index.md"):
    """Walk across `source_dir` and yield the markdown file paths."""
    for root, dirnames, filenames in os.walk(source_dir):
        for filename in fnmatch.filter(filenames, file_name):
            yield os.path.join(root, filename)


@dataclass
class Cache:
    title: str
    content: str
    url: str
    hash_url: str
    archive_date: str
    og_image: str
    description: str

    @staticmethod
    def all(source_dir=CACHE_PATH):
        for file_path in each_markdown_from(source_dir):
            title, content, metadata = parse_markdown(file_path)
            url = metadata["url"][0]
            hash_url = metadata["hash_url"][0]
            archive_date = metadata["archive_date"][0]
            og_image = metadata.get("og_image", [""])[0]
            description = metadata.get("description", [""])[0]
            yield Cache(
                title, content, url, hash_url, archive_date, og_image, description
            )

    @staticmethod
    def one(hash_url):
        return next(Cache.all(source_dir=CACHE_PATH / hash_url))
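# For reference, a sketch of the `index.md` layout that `parse_markdown`
# and `Cache.all` expect: `meta` extension headers (one `key: value` per
# line, terminated by a blank line), followed by the archived HTML. All
# values below are hypothetical:
#
#     title: Example article title
#     url: https://example.com/article
#     hash_url: 5eb63bbbe01eeed093cb22bb8f5acdc3
#     archive_date: 2024-01-15
#     og_image: https://example.com/og.png
#     description: An example description.
#
#     <p>Archived content…</p>
#
# The extension stores every value as a list, hence the `[0]` accesses.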
def extract_page(url):
    """From a URL, extract title and content using Readability.

    The title is shortened through the `short_title` native method.
    The content doesn't contain the surrounding `<html>`/`<body>` tags,
    to be directly embeddable in the template and rendered as is.
    """
    # Retrieves the resource and turns it into a Readability doc.
    response = httpx.get(url)
    document = Document(response.text)

    # The short title is more concise and readable.
    title = document.short_title()
    content = document.summary(html_partial=True)

    # Removing the added `<div>`/`</div>` wrapper (5 and 6 characters) and spaces.
    content = content[5:-6].strip()
    return title, content


def create(hash_url):
    """Turn a new MD file into an HTML file."""
    template = environment.get_template("cache_article.html")
    cache = Cache.one(hash_url)
    page = template.render(cache=cache)
    cache_target = CACHE_PATH / hash_url
    if not os.path.exists(cache_target):
        os.makedirs(cache_target)
    open(cache_target / "index.html", "w").write(page)
    print(f"Done: {LOCAL_DOMAIN}/david/cache/{YEAR}/{hash_url}/")


def save(
    title, content, url, hash_url, archive_date, cache_path, og_image, description
):
    template = environment.get_template("cache_article.md")
    page = template.render(
        title=title,
        content=content,
        url=url,
        hash_url=hash_url,
        archive_date=archive_date,
        og_image=og_image,
        description=description,
    )
    result_path = os.path.join(cache_path, "index.md")
    open(result_path, "w").write(page)
    return result_path


@cli
def generate():
    """Generate HTML files from the cached MD files."""
    cache_list = []
    template = environment.get_template("cache_article.html")
    for cache in Cache.all():
        page = template.render(cache=cache)
        open(CACHE_PATH / cache.hash_url / "index.html", "w").write(page)
        cache_list.append(cache)
    template = environment.get_template("cache_archives.html")
    page = template.render(cache_list=cache_list)
    open(CACHE_PATH / "index.html", "w").write(page)
    print(f"Done: {LOCAL_DOMAIN}/david/cache/{YEAR}/")


def fetch_metadata(title, url, description):
    """Fetch additional metadata (og:image, description) with lassie."""
    data = lassie.fetch(url)
    og_image = ""
    # `images` may be absent from the lassie payload, hence the default.
    for image in data.get("images", []):
        if image.get("type") == "og:image":
            og_image = image["src"]
            break
    if data.get("title") != title:
        print(data.get("title"), "vs.", title, url)
    description = description or data.get("description", "")
    return og_image, description


@cli
def metadata():
    """Fetch additional metadata for existing archives."""
    for cache in Cache.all():
        if cache.description or cache.url.startswith(
            (
                "https://www.la-grange.net",
                "https://tw5.immateriel.fr",
                "https://gilest.org",
                "https://vasilis.nl",
                "https://www.danmcquillan.org",
                "https://public-inbox.org",
            )
        ):
            print("Skipping", cache.url)
            continue
        print("Fetching metadata for", cache.url, cache.title)
        og_image, description = fetch_metadata(
            cache.title, cache.url, cache.description
        )
        save(
            cache.title,
            cache.content,
            cache.url,
            cache.hash_url,
            cache.archive_date,
            os.path.join(CACHE_PATH, cache.hash_url),
            og_image,
            description,
        )


@cli
def new(url):
    """Turn the given URL into MD and HTML files.

    :url: The URL of the page to put into cache.
    """
    hash_url = hashlib.md5(url.encode("utf-8")).hexdigest()
    try:
        title, content = extract_page(url)
    except (
        lxml.etree.XMLSyntaxError,
        httpx.HTTPError,
        httpx.ReadTimeout,
    ) as e:
        print(f"WARNING: {e}")
        title, content = "", ""
    cache_path = os.path.join(CACHE_PATH, hash_url)
    if not os.path.exists(cache_path):
        os.makedirs(cache_path)
    archive_date = date.today()
    # Caching a markdown file.
    og_image, description = fetch_metadata(title, url, "")
    result_path = save(
        title, content, url, hash_url, archive_date, cache_path, og_image, description
    )
    # Generating the HTML file.
    create(hash_url)
    md_line = f"> *[{title}]({url})*"
    print(md_line)
    # Open the generated markdown file in Sublime Text for a final check.
    os.popen(f'subl "{result_path}"')


@wrap
def perf_wrapper():
    """Time every command run through minicli."""
    start = perf_counter()
    yield
    elapsed = perf_counter() - start
    print(f"Done in {elapsed:.5f} seconds.")


if __name__ == "__main__":
    run()
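# Usage sketch, assuming this script is saved as `cache.py` (the file name
# is hypothetical): minicli exposes each function decorated with `@cli` as
# a subcommand named after the function, and runs the `@wrap` generator
# around every invocation.
#
#     python3 cache.py new "https://example.com/article"
#     python3 cache.py generate
#     python3 cache.py metadata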