Browse Source

Indexing contents from previous years, trade-offs

master
David Larlet 8 months ago
parent
commit
d406f1e401
5 changed files with 2931 additions and 14 deletions
  1. 1
    1
      david/index.html
  2. 2791
    1
      david/recherche/index.html
  3. 1
    1
      david/templates/profil.html
  4. 1
    1
      david/templates/recherche.html
  5. 137
    10
      site.py

+ 1
- 1
david/index.html View File

@@ -192,7 +192,7 @@

<h2>Recherche</h2>

<p>Les contenus de ces dernières années sont indexés.</p>
<p>Les contenus de ces dernières années sont indexés.</p>

<form action="/david/recherche/" method="get">
<label for="input-search">Termes de votre recherche :</label>

+ 2791
- 1
david/recherche/index.html
File diff suppressed because it is too large
View File


+ 1
- 1
david/templates/profil.html View File

@@ -141,7 +141,7 @@

<h2>Recherche</h2>

<p>Les contenus de ces dernières années sont indexés.</p>
<p>Les contenus de ces dernières années sont indexés.</p>

<form action="/david/recherche/" method="get">
<label for="input-search">Termes de votre recherche :</label>

+ 1
- 1
david/templates/recherche.html View File

@@ -263,7 +263,7 @@
`
}).join('')
} else {
searchStatus.innerHTML = '<p>Aucune publication n’a été trouvée 😢<br>Seuls les écrits des 3 dernières années sont indexés.</p>'
searchStatus.innerHTML = '<p>Aucune publication n’a été trouvée 😢<br>Seuls les écrits de ces dernières années sont indexés.</p>'
resultList.innerHTML = ''
}
}

+ 137
- 10
site.py View File

@@ -1,8 +1,6 @@
#!/usr/bin/env python3
import fnmatch
import json
import locale
import os
from collections import defaultdict
from dataclasses import dataclass
from datetime import datetime, timedelta
@@ -239,10 +237,25 @@ def neighborhood(iterable, first=None, last=None):
yield (previous, current, last)


def each_markdown_from(source_dir, file_name="*.md"):
    """Yield the entry names of `source_dir` that match `file_name`.

    Only the directory itself is listed (no recursion) and bare names are
    produced, not full paths; the caller joins them with `source_dir`.
    The default pattern selects markdown files.
    """
    entry_names = os.listdir(source_dir)
    yield from fnmatch.filter(entry_names, file_name)
def each_file_from(source_dir, pattern="*", exclude=None):
    """Yield the files of `source_dir` whose name matches `pattern`.

    Candidates come from `_each_path_from`, which also drops the entries
    whose name is listed in `exclude`; only those for which `is_file()`
    is true are kept.
    """
    candidates = _each_path_from(source_dir, pattern=pattern, exclude=exclude)
    yield from (candidate for candidate in candidates if candidate.is_file())


def each_folder_from(source_dir, exclude=None):
    """Yield the folders found directly in `source_dir`.

    Candidates come from `_each_path_from` (default pattern), which also
    drops the entries whose name is listed in `exclude`; anything that is
    not a directory is skipped.
    """
    for entry in _each_path_from(source_dir, exclude=exclude):
        if not entry.is_dir():
            continue
        yield entry


def _each_path_from(source_dir, pattern="*", exclude=None):
    """Yield, in sorted order, the paths of `source_dir` matching `pattern`.

    `pattern` is handed to `Path.glob`. When `exclude` is given, any path
    whose `name` is found in it is left out.
    """
    excluded_names = () if exclude is None else exclude
    matches = list(Path(source_dir).glob(pattern))
    matches.sort()
    yield from (match for match in matches if match.name not in excluded_names)


@dataclass
@@ -305,8 +318,8 @@ class Page:
"""Retrieve all pages sorted by desc."""
page_list = []
md = markdown_with_h2_anchors if with_h2_anchors else markdown_with_img_sizes
for file_name in sorted(each_markdown_from(source)):
result = md.read(source / file_name)
for file_path in sorted(each_file_from(source, pattern="*.md")):
result = md.read(file_path)
result = widont(result, html=True)
# Extract (and remove) the title from the generated page.
title, content = result.split("</h1>", 1)
@@ -330,7 +343,7 @@ class Page:
except IndexError:
# It happens for old contents, parsed for the search index.
pass
page = Page(title, content, tags, file_name)
page = Page(title, content, tags, file_path.name)
pages_by_url[page.url] = page
if not page.is_draft:
all_tags.update(tags)
@@ -420,7 +433,21 @@ def search():
page_list_2020 = Page.all(
source=DAVID / "2020", only_published=True, with_h2_anchors=False
)
page_list = page_list_2022 + page_list_2021 + page_list_2020
blog_page_list_2019 = BlogPage.all(source=DAVID / "blog" / "2019")
blog_page_list_2018 = BlogPage.all(source=DAVID / "blog" / "2018")
blog_page_list_2017 = BlogPage.all(source=DAVID / "blog" / "2017")
stream_page_list_2019 = StreamPage.all(source=DAVID / "stream" / "2019")
stream_page_list_2018 = StreamPage.all(source=DAVID / "stream" / "2018")
page_list = (
page_list_2022
+ page_list_2021
+ page_list_2020
+ blog_page_list_2019
+ blog_page_list_2018
+ blog_page_list_2017
+ stream_page_list_2019
+ stream_page_list_2018
)
search_index = json.dumps([page.search_data for page in page_list], indent=2)
content = template.render(search_index=search_index)
open(DAVID / "recherche" / "index.html", "w").write(content)
@@ -447,5 +474,105 @@ def perf_wrapper():
print(f"Done in {elapsed:.5f} seconds.")


# Below are legacy blog and stream contents, still useful for the search index.
@dataclass
class BlogPage:
    """Legacy blog entry, loaded only to feed the search index.

    `__post_init__` derives three extra attributes from the fields:
    `date` (parsed from `date_str`), `url` (built from `file_path`) and
    `search_data` (the dict that ends up in the search index).
    """

    title: str
    # Rendered content, as returned by `markdown_with_img_sizes` in `all()`.
    content: str
    # Location used to build the URL; `all()` passes the parent folder of the
    # markdown file (a `Path`), which the f-string below converts to text.
    file_path: str
    # Publication date, expected as YYYY-MM-DD.
    date_str: str

    def __post_init__(self):
        """Compute `date`, `url` and the `search_data` payload.

        Raises:
            ValueError: if `date_str` does not match `%Y-%m-%d`.
        """
        self.date = datetime.strptime(self.date_str, "%Y-%m-%d").date()
        self.url = f"/{self.file_path}/"
        # Create the index for the search: tags are stripped, then the
        # "(cache)" suffixes and apostrophes are blanked out and any
        # remaining angle brackets are escaped.
        self.search_data = {
            "title": self.title,
            "url": self.url,
            "date": self.date_str,
            "content": do_striptags(self.content)
            .replace("\u00a0(cache)", " ")
            .replace("'", " ")
            .replace("<", "&lt;")
            .replace(">", "&gt;"),
        }

    def __eq__(self, other):
        """Two blog pages are equal when they share the same URL."""
        # Same guard as `__lt__`: comparing with a foreign object must not
        # blow up on a missing `url` attribute.
        if not isinstance(other, self.__class__):
            return NotImplemented
        return self.url == other.url

    def __lt__(self, other: "BlogPage"):
        """Order blog pages chronologically (older pages sort first)."""
        if not isinstance(other, self.__class__):
            return NotImplemented
        return self.date < other.date

    @staticmethod
    def all(source: Path):
        """Return the blog pages found under `source`, most recent first.

        Every direct sub-folder of `source` is scanned for `*.md` files.
        Each file starts with a header block, separated from the body by a
        blank line, holding the title on its first line and the date on
        its third one; other header lines (including an optional trailing
        `lang:` line) are ignored.

        Raises:
            ValueError: if a file has no blank line after its header, or
                if its date does not match `%Y-%m-%d`.
            IndexError: if a header has fewer than three lines.
        """
        page_list = []
        for folder in each_folder_from(source):
            for path in each_file_from(folder, pattern="*.md"):
                # NOTE(review): read_text() uses the locale's default
                # encoding; confirm the sources are always read under a
                # UTF-8 locale or pass an explicit encoding.
                metadata, content = path.read_text().split("\n\n", 1)
                # Pick the needed lines by position rather than unpacking a
                # fixed number of fields: the header length varies and the
                # other fields are not used here.
                header_lines = metadata.split("\n")
                title_line, date_line = header_lines[0], header_lines[2]
                title = title_line[len("title: ") :].strip()
                date_str = date_line[len("date: ") :].strip()
                content = markdown_with_img_sizes(content)
                page_list.append(BlogPage(title, content, path.parent, date_str))
        return sorted(page_list, reverse=True)


@dataclass
class StreamPage:
    """Legacy stream entry, loaded only to feed the search index.

    `__post_init__` derives three extra attributes from the fields:
    `date` (parsed from `date_str`), `url` (built from `file_path`) and
    `search_data` (the dict that ends up in the search index).
    """

    title: str
    # Rendered content, as returned by `markdown_with_img_sizes` in `all()`.
    content: str
    # Location used to build the URL; `all()` passes the parent folder of the
    # markdown file (a `Path`), which the f-string below converts to text.
    file_path: str
    # Publication date, expected as YYYY/MM/DD (taken from the folder layout).
    date_str: str

    def __post_init__(self):
        """Compute `date`, `url` and the `search_data` payload.

        Raises:
            ValueError: if `date_str` does not match `%Y/%m/%d`.
        """
        self.date = datetime.strptime(self.date_str, "%Y/%m/%d").date()
        self.url = f"/{self.file_path}/"
        # Create the index for the search: tags are stripped, then the
        # "(cache)" suffixes and apostrophes are blanked out and any
        # remaining angle brackets are escaped.
        self.search_data = {
            "title": self.title,
            "url": self.url,
            "date": self.date.isoformat(),
            "content": do_striptags(self.content)
            .replace("\u00a0(cache)", " ")
            .replace("'", " ")
            .replace("<", "&lt;")
            .replace(">", "&gt;"),
        }

    def __eq__(self, other):
        """Two stream pages are equal when they share the same URL."""
        # Same guard as `__lt__`: comparing with a foreign object must not
        # blow up on a missing `url` attribute.
        if not isinstance(other, self.__class__):
            return NotImplemented
        return self.url == other.url

    def __lt__(self, other: "StreamPage"):
        """Order stream pages chronologically (older pages sort first)."""
        if not isinstance(other, self.__class__):
            return NotImplemented
        return self.date < other.date

    @staticmethod
    def all(source: Path):
        """Return the stream pages found under `source`, most recent first.

        `source` is walked two folder levels deep and every `*.md` file
        found there becomes a page. Each file starts with a header block,
        separated from the body by a blank line, whose first line holds
        the title; an optional `lang:` line after it is ignored. The date
        is not read from the file: it is rebuilt from the last three
        components of the folder path (year/month/day).

        Raises:
            ValueError: if a file has no blank line after its header, or
                if the folder path does not end with a valid date.
        """
        page_list = []
        for folder in each_folder_from(source):
            for subfolder in each_folder_from(folder):
                for path in each_file_from(subfolder, pattern="*.md"):
                    # NOTE(review): read_text() uses the locale's default
                    # encoding; confirm the sources are always read under
                    # a UTF-8 locale or pass an explicit encoding.
                    metadata, content = path.read_text().split("\n\n", 1)
                    # Only the first header line matters, whether or not a
                    # `lang:` line follows it.
                    title_line = metadata.strip().split("\n", 1)[0]
                    title = title_line[len("title: ") :].strip()
                    # Join the path components explicitly so the result
                    # uses "/" whatever the OS path separator is.
                    date_str = "/".join(path.parent.parts[-3:])
                    content = markdown_with_img_sizes(content)
                    page_list.append(
                        StreamPage(title, content, path.parent, date_str)
                    )
        return sorted(page_list, reverse=True)


if __name__ == "__main__":
run()

Loading…
Cancel
Save