#!/usr/bin/env python3
import codecs
import fnmatch
import hashlib
import os
from dataclasses import dataclass
from datetime import date
from pathlib import Path
from time import perf_counter
from urllib.parse import urlparse

import httpx
import lassie
import lxml.etree
import markdown
from jinja2 import Environment as Env
from jinja2 import FileSystemLoader
from minicli import cli, run, wrap
from readability.readability import Document

HERE = Path(".")
YEAR = "2024"
CACHE_PATH = HERE / "cache" / YEAR
LOCAL_DOMAIN = "http://larlet.test:3579"
environment = Env(loader=FileSystemLoader(str(HERE / "templates")))


def parse_markdown(file_path):
    """Extract title, (HTML) content and metadata from a markdown file."""
    parser = markdown.Markdown(extensions=["meta"])
    with codecs.open(file_path, "r") as source:
        content = parser.convert(source.read())
        metadata = parser.Meta if hasattr(parser, "Meta") else None
        # `Meta` values are lists; take the first item, defaulting to "".
        title = metadata.get("title", [""])[0] if metadata else ""
    return title, content, metadata


def each_markdown_from(source_dir, file_name="index.md"):
    """Walk across the `source_dir` and yield the md file paths."""
    for root, dirnames, filenames in os.walk(source_dir):
        for filename in fnmatch.filter(filenames, file_name):
            yield os.path.join(root, filename)


@dataclass
class Cache:
    title: str
    content: str
    url: str
    hash_url: str
    archive_date: str
    og_image: str
    description: str
    favicon: str
    language: str

    @staticmethod
    def all(source_dir=CACHE_PATH):
        for file_path in each_markdown_from(source_dir):
            title, content, metadata = parse_markdown(file_path)
            url = metadata["url"][0]
            hash_url = metadata["hash_url"][0]
            archive_date = metadata["archive_date"][0]
            og_image = metadata.get("og_image", [""])[0]
            description = metadata.get("description", [""])[0]
            favicon = metadata.get("favicon", [""])[0]
            language = metadata.get("language", [""])[0]
            yield Cache(
                title,
                content,
                url,
                hash_url,
                archive_date,
                og_image,
                description,
                favicon,
                language,
            )

    @staticmethod
    def one(hash_url):
        return next(Cache.all(source_dir=CACHE_PATH / hash_url))


def extract_page(url):
    """From a URL, extract title and content using Readability.

    The title is shortened through the `short_title` native method.
    The content doesn't contain `<body>` tags to be directly
    embeddable in the template and rendered as is.
    """
    # Retrieves the resource and turns it into a Readability doc.
    response = httpx.get(url)
    document = Document(response.text)

    # The short title is more concise and readable.
    title = document.short_title()
    content = document.summary(html_partial=True)

    # Removing the added `<div>` wrapper and surrounding spaces.
    content = content[5:-6].strip()

    return title, content
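
# Why the `[5:-6]` slicing in `extract_page` works: with
# `html_partial=True`, readability-lxml returns the extracted article
# wrapped in a single attribute-free `<div>` element, e.g.
# (illustrative output, not a documented API guarantee):
#
#     Document("<html><body><p>Hi</p></body></html>").summary(html_partial=True)
#     # -> '<div><p>Hi</p></div>'
#
# Dropping 5 characters from the start (`<div>`) and 6 from the end
# (`</div>`) keeps only the inner HTML.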


def create(hash_url):
    """Turn new MD file into HTML file."""
    template = environment.get_template("cache_article.html")
    cache = Cache.one(hash_url)
    page = template.render(cache=cache)
    cache_target = CACHE_PATH / hash_url
    if not os.path.exists(cache_target):
        os.makedirs(cache_target)
    open(cache_target / "index.html", "w").write(page)
    print(f"Done: {LOCAL_DOMAIN}/david/cache/{YEAR}/{hash_url}/")


def save(
    title,
    content,
    url,
    hash_url,
    archive_date,
    cache_path,
    og_image,
    description,
    favicon,
    language,
):
    """Render and write the markdown cache file, returning its path."""
    template = environment.get_template("cache_article.md")
    page = template.render(
        title=title,
        content=content,
        url=url,
        hash_url=hash_url,
        archive_date=archive_date,
        og_image=og_image,
        description=description,
        favicon=favicon,
        language=language,
    )
    result_path = os.path.join(cache_path, "index.md")
    open(result_path, "w").write(page)
    return result_path


@cli
def generate():
    """Generate cached MD files into HTML files."""
    cache_list = []
    template = environment.get_template("cache_article.html")
    for cache in Cache.all():
        page = template.render(cache=cache)
        open(CACHE_PATH / cache.hash_url / "index.html", "w").write(page)
        cache_list.append(cache)
    template = environment.get_template("cache_archives.html")
    page = template.render(cache_list=cache_list)
    open(CACHE_PATH / "index.html", "w").write(page)
    print(f"Done: {LOCAL_DOMAIN}/david/cache/{YEAR}/")


def fetch_metadata(title, url, description, language):
    """Fetch additional metadata."""
    parsed_url = urlparse(url)
    root_url = f"{parsed_url.scheme}://{parsed_url.netloc}/"
    data = lassie.fetch(url, all_images=True)
    og_image = ""
    favicon = ""
    for image in data.get("images", []):
        image_type = image.get("type")
        image_src = image.get("src")
        if image_src == root_url:
            # A src equal to the site root is not a usable image;
            # reset and skip.
            og_image = ""
            continue
        elif image_type == "og:image" and not og_image:
            og_image = image_src
            continue
        elif image_type == "twitter:image" and not og_image:
            og_image = image_src
            continue
        elif image_type == "favicon":
            # Prefer non-.ico favicons when available.
            if not favicon:
                favicon = image_src
            elif ".ico" in favicon and ".ico" not in image_src:
                favicon = image_src

    # Fallback on the first suitable body image.
    if not og_image:
        for image in data.get("images", []):
            image_type = image.get("type")
            if image_type == "body_image":
                image_src = image.get("src")
                if image_src and "favicon" not in image_src:
                    og_image = image_src
                    break

    # Fallback on server's default.
    if not favicon:
        favico_url = f"{root_url}favicon.ico"
        response = httpx.get(favico_url)
        if response.status_code == 200:
            favicon = favico_url

    if data.get("title") != title:
        print(data.get("title"), "vs.", title, url)

    description = description or data.get("description", "")
    language = language or data.get("locale", "")
    return og_image, description, favicon, language
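
# For reference, the lassie payload consumed above has roughly this
# shape (a sketch limited to the keys this script reads; the values
# are made-up examples, see lassie's docs for the full schema):
#
#     {
#         "title": "Page title",
#         "description": "Meta description",
#         "locale": "en_US",
#         "images": [
#             {"type": "og:image", "src": "https://example.com/cover.png"},
#             {"type": "favicon", "src": "https://example.com/favicon.ico"},
#             {"type": "body_image", "src": "https://example.com/photo.jpg"},
#         ],
#     }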


@cli
def metadata():
    """Fetch additional metadata for existing archives."""
    for cache in Cache.all():
        # That one is taking way too long.
        if cache.url.startswith("https://tw5.immateriel.fr"):
            print("Skipping (too long)", cache.url)
            continue
        if cache.og_image and cache.description and cache.favicon and cache.language:
            print("Skipping (all good)", cache.url)
            continue
        if cache.url.startswith(
            (
                "https://www.la-grange.net",
                "https://gilest.org",
                "https://vasilis.nl",
                "https://www.danmcquillan.org",
                "https://public-inbox.org",
            )
        ) and (cache.og_image or cache.description or cache.favicon or cache.language):
            print("Skipping (known missing info)", cache.url)
            continue
        print("Fetching metadata for", cache.url, cache.title)
        og_image, description, favicon, language = fetch_metadata(
            cache.title, cache.url, cache.description, cache.language
        )
        save(
            cache.title,
            cache.content,
            cache.url,
            cache.hash_url,
            cache.archive_date,
            os.path.join(CACHE_PATH, cache.hash_url),
            og_image,
            description,
            favicon,
            language,
        )


@cli
def new(url):
    """Turn the given URL into MD and HTML files.

    :url: The URL of the page to put into cache.
    """
    hash_url = hashlib.md5(url.encode("utf-8")).hexdigest()
    try:
        title, content = extract_page(url)
    except (
        lxml.etree.XMLSyntaxError,
        httpx.HTTPError,
        httpx.ReadTimeout,
    ) as e:
        print(f"WARNING: {e}")
        title, content = "", ""
    cache_path = os.path.join(CACHE_PATH, hash_url)
    if not os.path.exists(cache_path):
        os.makedirs(cache_path)
    archive_date = date.today()
    # Caching a markdown file.
    og_image, description, favicon, language = fetch_metadata(title, url, "", "")
    result_path = save(
        title,
        content,
        url,
        hash_url,
        archive_date,
        cache_path,
        og_image,
        description,
        favicon,
        language,
    )
    # Generating the HTML file.
    create(hash_url)
    md_line = f"> *[{title}]({url})*"
    print(md_line)
    os.popen(f'subl "{result_path}"')


@wrap
def perf_wrapper():
    start = perf_counter()
    yield
    elapsed = perf_counter() - start
    print(f"Done in {elapsed:.5f} seconds.")


if __name__ == "__main__":
    run()
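
# Typical invocations, given minicli's sub-command dispatch (assuming
# the script is saved as `cache.py`; the file name is illustrative):
#
#     python3 cache.py new https://example.com/some/article/
#     python3 cache.py generate
#     python3 cache.py metadata
#
# Each `@cli`-decorated function above becomes a sub-command, and the
# `@wrap` generator times every run.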