Repository with sources and generator of https://larlet.fr/david/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

widont.py 2.7KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990
  1. import re
  2. from dataclasses import dataclass
  3. import regex # for the support of "\p{}"
  4. @dataclass
  5. class Charactere:
  6. unicode: str
  7. html: str
  8. NON_BREAKABLE_SPACE = Charactere(unicode="\u00a0", html=" ")
  9. # TODISCUSS: part of the configuration?
  10. COMPUTED_BLOCKS = "|".join(["p", "h[1-6]", "li", "dt", "dd"])
  11. # Complete list:
  12. # https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elements#list_of_inline_elements
  13. # TODISCUSS: special treatment for `abbr` and `acronym`?
  14. # Intentionnaly excluded: audio, bdi, bdo, br, button, canvas, code, datalist,
  15. # embed, iframe, img, input, kbd, map, meter, noscript, object, picture,
  16. # progress, ruby, samp, script, select, slot, svg, template, textarea, time,
  17. # tt, var, video, wbr.
  18. ALLOWED_INLINES = "|".join(
  19. [
  20. "a",
  21. "b",
  22. "big",
  23. "cite",
  24. "data",
  25. "del",
  26. "dfn",
  27. "em",
  28. "i",
  29. "ins",
  30. "label",
  31. "mark",
  32. "output",
  33. "q",
  34. "s",
  35. "small",
  36. "span",
  37. "strong",
  38. "sub",
  39. "sup",
  40. "u",
  41. ]
  42. )
# Compiled pattern locating the space that precedes the last word of a block
# (or of the whole string).
#
# \p{} allows to match a character by its Unicode category;
# "Zs" is the category "Separator, space". The braces are doubled in the
# pattern ({{Zs}}) because the pattern is an f-string.
#
# The pattern is compiled with the VERBOSE flag, so the `#` comments written
# inside the string belong to the pattern text itself.
#
# Capture groups used by the replacement:
#   group 1: what precedes the space, either an inline tag from
#            ALLOWED_INLINES (`</?` accepts opening and closing tags alike)
#            or a single character that is not a tag delimiter, whitespace or
#            one of `. , ; : ! ?`.
#   group 2: the last word, optional whitespace, any closing inline tags and
#            finally a closing tag from COMPUTED_BLOCKS or the end of string.
widont_finder = regex.compile(
    rf"""(
# must be preceded by:
(?:</? # a closing tag
(?:{ALLOWED_INLINES})[^>]*>) # an approved inline opening
|[^<>\s\.,;:!\?] # or a nontag/nonspace and not punctuation
)
\p{{Zs}}+ # the space to replace
([^<>\s]+ # must be followed by non-tag non-space characters
\s* # optional white space!
(</({ALLOWED_INLINES})>\s*)* # optional closing inline tags with optional white space after each
((</({COMPUTED_BLOCKS})>)|$)) # end with a closing block or the end of the string
""",
    re.VERBOSE,
)
  60. def widont(text, html=False):
  61. """Replaces the space between the last two words with non-breakable space
  62. Works in block tags from COMPUTED_BLOCKS and also accounts for
  63. potential closing inline elements in ALLOWED_INLINES.
  64. Adapted from:
  65. https://github.com/mintchaos/typogrify/blob/
  66. 20f693cbbb232ebc27733d9f1721a2cf1e7b25e3/typogrify/filters.py#L315-L368
  67. Improvements:
  68. * allow the unbreakable unicode character (new option)
  69. * add the mark element as a potential inline
  70. * avoid insertion of a non-breakable space if the sentence is a single word
  71. * externalized test suite
  72. """
  73. spacer = NON_BREAKABLE_SPACE.html if html else NON_BREAKABLE_SPACE.unicode
  74. output = widont_finder.sub(rf"\1{spacer}\2", text)
  75. return output