Repository with the sources and generator of https://larlet.fr/david/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

typography.py 2.7KB

3 年之前
3 年之前
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394
  1. from dataclasses import dataclass
  2. import html.entities
  3. import unicodedata
  4. import regex # pour le support de "\p{}"
  5. @dataclass
  6. class Caractere:
  7. unicode: str
  8. html: str
  9. def __init__(self, name: str):
  10. self.unicode = unicodedata.lookup(name)
  11. codepoint = ord(self.unicode)
  12. html_name = html.entities.codepoint2name.get(codepoint, f"#{codepoint}")
  13. self.html = f"&{html_name};"
  14. ESPACE_INSECABLE = Caractere(name="NO-BREAK SPACE")
  15. ESPACE_FINE_INSECABLE = Caractere(name="NARROW NO-BREAK SPACE")
  16. def assemble_regexes(*regexes):
  17. return "|".join(regexes)
  18. def build_regex(avant, apres):
  19. # \p{} permet de reconnaître un caractère par sa catégorie Unicode
  20. # "Zs" est la catégorie "Separator, space".
  21. return (
  22. rf"((?P<avant>{avant})"
  23. + rf"(\p{{Zs}}|{ESPACE_INSECABLE.html})"
  24. + rf"(?P<apres>{apres}))"
  25. + r"(?!(.(?!<svg))*<\/svg>)"
  26. )
  27. RE_ESPACE_FINE_INSECABLE = regex.compile(
  28. assemble_regexes(
  29. build_regex(r"\w?", r"[;\?!]"), # Ponctuations doubles.
  30. build_regex(
  31. r"\d", r"([ghj]|min|sec|images|mm|hab|kg|mg|µg|L|km|°C|GHz)(\b|$)"
  32. ), # Unités.
  33. build_regex(r"\d", r"(Mo|Ko|Go|Mb|Kb|Gb)(\b|$)"), # Tailles de fichiers.
  34. build_regex(r"\d", r"%"), # Pourcentages.
  35. build_regex(r"\d", r"€"), # Symboles monétaires.
  36. build_regex(r"\d", r"\d"), # Séparateurs de milliers.
  37. )
  38. )
  39. def insere_espaces_fines_insecables(texte):
  40. return RE_ESPACE_FINE_INSECABLE.sub(
  41. r"\g<avant>" + ESPACE_FINE_INSECABLE.unicode + r"\g<apres>", texte
  42. )
  43. RE_ESPACE_INSECABLE = regex.compile(
  44. assemble_regexes(
  45. build_regex(r"\w?", r":"), # Deux points.
  46. build_regex(r"«", r""), # Guillemets en chevrons.
  47. build_regex(r"", r"»"), # Guillemets en chevrons.
  48. build_regex(
  49. rf"\b(\d|{ESPACE_FINE_INSECABLE.html})+", r"(?!\d)\w"
  50. ), # Nombre suivi de lettres.
  51. build_regex(r"(M\.|Mme)", r"\w"), # Titres (Monsieur, Madame).
  52. )
  53. )
  54. def insere_espaces_insecables(texte):
  55. return RE_ESPACE_INSECABLE.sub(
  56. r"\g<avant>" + ESPACE_INSECABLE.unicode + r"\g<apres>", texte
  57. )
  58. def encode_espaces_insecables_en_html(texte):
  59. for caractere in (ESPACE_INSECABLE, ESPACE_FINE_INSECABLE):
  60. texte = texte.replace(caractere.unicode, caractere.html)
  61. return texte
  62. def typographie(texte, html=False):
  63. """
  64. Utilise les espaces insécables fines ou normales lorsque c’est approprié
  65. https://fr.wikipedia.org/wiki/Espace_ins%C3%A9cable#En_France
  66. """
  67. res = insere_espaces_fines_insecables(insere_espaces_insecables(texte))
  68. if html:
  69. res = encode_espaces_insecables_en_html(res)
  70. return res