
Indexing contents from previous years, trade-offs

David Larlet · 2 years ago · branch master · commit d406f1e401
5 changed files with 2931 additions and 14 deletions
1. david/index.html (+1 / -1)
2. david/recherche/index.html (+2791 / -1)
3. david/templates/profil.html (+1 / -1)
4. david/templates/recherche.html (+1 / -1)
5. site.py (+137 / -10)

david/index.html (+1 / -1)

  <h2>Recherche</h2>

- <p>Les contenus de ces 3 dernières années sont indexés.</p>
+ <p>Les contenus de ces dernières années sont indexés.</p>

  <form action="/david/recherche/" method="get">
  <label for="input-search">Termes de votre recherche :</label>

david/recherche/index.html (+2791 / -1)

(File diff suppressed because it is too large.)

david/templates/profil.html (+1 / -1)

  <h2>Recherche</h2>

- <p>Les contenus de ces 3 dernières années sont indexés.</p>
+ <p>Les contenus de ces dernières années sont indexés.</p>

  <form action="/david/recherche/" method="get">
  <label for="input-search">Termes de votre recherche :</label>

david/templates/recherche.html (+1 / -1)

            `
          }).join('')
        } else {
-         searchStatus.innerHTML = '<p>Aucune publication n’a été trouvée 😢<br>Seuls les écrits des 3 dernières années sont indexés.</p>'
+         searchStatus.innerHTML = '<p>Aucune publication n’a été trouvée 😢<br>Seuls les écrits de ces dernières années sont indexés.</p>'
          resultList.innerHTML = ''
        }
      }

site.py (+137 / -10)

  #!/usr/bin/env python3
- import fnmatch
  import json
  import locale
- import os
  from collections import defaultdict
  from dataclasses import dataclass
  from datetime import datetime, timedelta
@@ … @@
      yield (previous, current, last)


- def each_markdown_from(source_dir, file_name="*.md"):
-     """Walk across the `source_dir` and return the md file paths."""
-     for filename in fnmatch.filter(os.listdir(source_dir), file_name):
-         yield filename
+ def each_file_from(source_dir, pattern="*", exclude=None):
+     """Walk across the `source_dir` and return the `pattern` file paths."""
+     for path in _each_path_from(source_dir, pattern=pattern, exclude=exclude):
+         if path.is_file():
+             yield path
+
+
+ def each_folder_from(source_dir, exclude=None):
+     """Walk across the `source_dir` and return the folder paths."""
+     for path in _each_path_from(source_dir, exclude=exclude):
+         if path.is_dir():
+             yield path
+
+
+ def _each_path_from(source_dir, pattern="*", exclude=None):
+     for path in sorted(Path(source_dir).glob(pattern)):
+         if exclude is not None and path.name in exclude:
+             continue
+         yield path
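
A quick usage sketch of the new helpers; the directory layout and the exclude value here are hypothetical, not part of this commit:

    from pathlib import Path

    # Hypothetical layout: david/blog/2019/<slug>/*.md
    for folder in each_folder_from(Path("david") / "blog" / "2019"):
        for path in each_file_from(folder, pattern="*.md", exclude=("draft.md",)):
            print(path)  # pathlib.Path objects, sorted by name within each folder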




@@ … @@
  @dataclass
@@ … @@
          """Retrieve all pages sorted by desc."""
          page_list = []
          md = markdown_with_h2_anchors if with_h2_anchors else markdown_with_img_sizes
-         for file_name in sorted(each_markdown_from(source)):
-             result = md.read(source / file_name)
+         for file_path in sorted(each_file_from(source, pattern="*.md")):
+             result = md.read(file_path)
              result = widont(result, html=True)
              # Extract (and remove) the title from the generated page.
              title, content = result.split("</h1>", 1)
@@ … @@
              except IndexError:
                  # It happens for old contents, parsed for the search index.
                  pass
-             page = Page(title, content, tags, file_name)
+             page = Page(title, content, tags, file_path.name)
              pages_by_url[page.url] = page
              if not page.is_draft:
                  all_tags.update(tags)
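
The hunk above is driven by a type change: each_markdown_from yielded bare filename strings that had to be joined with source, while each_file_from yields pathlib.Path objects, hence md.read(file_path) and file_path.name. A minimal illustration, with an invented path:

    from pathlib import Path

    file_path = Path("david/2022/example-note.md")  # hypothetical file
    file_path.name  # "example-note.md" — what Page receives now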
@@ … @@
      page_list_2020 = Page.all(
          source=DAVID / "2020", only_published=True, with_h2_anchors=False
      )
-     page_list = page_list_2022 + page_list_2021 + page_list_2020
+     blog_page_list_2019 = BlogPage.all(source=DAVID / "blog" / "2019")
+     blog_page_list_2018 = BlogPage.all(source=DAVID / "blog" / "2018")
+     blog_page_list_2017 = BlogPage.all(source=DAVID / "blog" / "2017")
+     stream_page_list_2019 = StreamPage.all(source=DAVID / "stream" / "2019")
+     stream_page_list_2018 = StreamPage.all(source=DAVID / "stream" / "2018")
+     page_list = (
+         page_list_2022
+         + page_list_2021
+         + page_list_2020
+         + blog_page_list_2019
+         + blog_page_list_2018
+         + blog_page_list_2017
+         + stream_page_list_2019
+         + stream_page_list_2018
+     )
      search_index = json.dumps([page.search_data for page in page_list], indent=2)
      content = template.render(search_index=search_index)
      open(DAVID / "recherche" / "index.html", "w").write(content)
@@ … @@
      print(f"Done in {elapsed:.5f} seconds.")




@@ … @@
+ # Below are legacy blog contents, still useful for search indexation.
+ @dataclass
+ class BlogPage:
+     title: str
+     content: str
+     file_path: str
+     date_str: str
+
+     def __post_init__(self):
+         self.date = datetime.strptime(self.date_str, "%Y-%m-%d").date()
+         self.url = f"/{self.file_path}/"
+         # Create the index for the search.
+         self.search_data = {
+             "title": self.title,
+             "url": self.url,
+             "date": self.date_str,
+             "content": do_striptags(self.content)
+             .replace("\u00a0(cache)", " ")
+             .replace("'", " ")
+             .replace("<", "&lt;")
+             .replace(">", "&gt;"),
+         }
+
+     def __eq__(self, other):
+         return self.url == other.url
+
+     def __lt__(self, other: "BlogPage"):
+         if not isinstance(other, self.__class__):
+             return NotImplemented
+         return self.date < other.date
+
+     @staticmethod
+     def all(source: Path):
+         """Retrieve all pages sorted by desc."""
+         page_list = []
+         for folder in each_folder_from(source):
+             for path in each_file_from(folder, pattern="*.md"):
+                 metadata, content = path.read_text().split("\n\n", 1)
+                 if "lang:" in metadata:
+                     title, slug, date_, chapo, lang = metadata.split("\n")
+                 else:
+                     title, slug, date_, chapo = metadata.split("\n")
+                 title = title[len("title: ") :].strip()
+                 date_str = date_[len("date: ") :].strip()
+                 content = markdown_with_img_sizes(content)
+                 page = BlogPage(title, content, path.parent, date_str)
+                 page_list.append(page)
+         return sorted(page_list, reverse=True)
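
BlogPage.all assumes each legacy post file starts with a metadata header (four lines, five when a lang: line is present) separated from the Markdown body by a blank line, roughly like this invented example:

    title: Example post
    slug: example-post
    date: 2019-05-04
    chapo: A one-line summary.

    The Markdown body starts here.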


+ @dataclass
+ class StreamPage:
+     title: str
+     content: str
+     file_path: str
+     date_str: str
+
+     def __post_init__(self):
+         self.date = datetime.strptime(self.date_str, "%Y/%m/%d").date()
+         self.url = f"/{self.file_path}/"
+         # Create the index for the search.
+         self.search_data = {
+             "title": self.title,
+             "url": self.url,
+             "date": self.date.isoformat(),
+             "content": do_striptags(self.content)
+             .replace("\u00a0(cache)", " ")
+             .replace("'", " ")
+             .replace("<", "&lt;")
+             .replace(">", "&gt;"),
+         }
+
+     def __eq__(self, other):
+         return self.url == other.url
+
+     def __lt__(self, other: "StreamPage"):
+         if not isinstance(other, self.__class__):
+             return NotImplemented
+         return self.date < other.date
+
+     @staticmethod
+     def all(source: Path):
+         """Retrieve all pages sorted by desc."""
+         page_list = []
+         for folder in each_folder_from(source):
+             for subfolder in each_folder_from(folder):
+                 for path in each_file_from(subfolder, pattern="*.md"):
+                     metadata, content = path.read_text().split("\n\n", 1)
+                     if "lang:" in metadata:
+                         title, lang = metadata.split("\n")
+                     else:
+                         title = metadata.strip()
+                     title = title[len("title: ") :].strip()
+                     date_str = str(path.parent)[-len("YYYY/MM/DD") :]
+                     content = markdown_with_img_sizes(content)
+                     page = StreamPage(title, content, path.parent, date_str)
+                     page_list.append(page)
+         return sorted(page_list, reverse=True)
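
The stream date is recovered from the directory layout itself, e.g. with an invented path:

    from pathlib import Path

    path = Path("david/stream/2019/05/04/index.md")  # hypothetical
    date_str = str(path.parent)[-len("YYYY/MM/DD") :]  # -> "2019/05/04"
    # __post_init__ then parses it with datetime.strptime(date_str, "%Y/%m/%d")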


@@ … @@
  if __name__ == "__main__":
      run()
