Repository with sources and generator of https://larlet.fr/david/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

widont.py 2.0KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758
  1. import re
  2. from dataclasses import dataclass
  3. import regex # for the support of "\p{}"
  @dataclass
  class Charactere:
      """Pair the two spellings of a single typographic character.

      Keeps the plain string form next to the form intended for HTML output,
      so that callers can pick whichever one suits their target.
      """

      # The character itself, as a Python string.
      unicode: str
      # The spelling to emit when producing HTML markup.
      html: str
  # Both spellings of the non-breakable space: the raw character and the HTML
  # entity (the entity must stay spelled out, otherwise the HTML variant is
  # indistinguishable from a bare space character).
  NON_BREAKABLE_SPACE = Charactere(unicode="\u00a0", html="&nbsp;")

  # TODISCUSS: part of the configuration?
  # Regex alternations of the block-level tags that may end a match and of the
  # inline tags tolerated around the last words.
  COMPUTED_BLOCKS = "|".join(["p", "h[1-6]", "li", "dt", "dd"])
  ALLOWED_INLINES = "|".join(["a", "em", "span", "strong", "i", "b", "mark"])

  # \p{} allows to match a character by its Unicode category;
  # "Zs" is the category "Separator, space". This is why the third-party
  # `regex` module is used (and its own VERBOSE flag) instead of `re`.
  # Group 1 is what precedes the space to replace, group 2 is everything that
  # follows it up to the end of the block/string.
  widont_finder = regex.compile(
      rf"""(
          # must be preceded by:
          (?:</?                        # an opening or closing tag
          (?:{ALLOWED_INLINES})[^>]*>)  # of an approved inline element
          |[^<>\s\.]                    # or a nontag/nonspace and not a point
      )
      \p{{Zs}}+                         # the space to replace
      ([^<>\s]+                         # must be followed by non-tag non-space characters
      \s*                               # optional white space!
      (</({ALLOWED_INLINES})>\s*)*      # optional closing inline tags with optional white space after each
      ((</({COMPUTED_BLOCKS})>)|$))     # end with a closing block or the end of the string
      """,
      regex.VERBOSE,
  )
  def widont(text: str, html: bool = False) -> str:
      """Replace the space between the last two words with a non-breakable space.

      Works in block tags from COMPUTED_BLOCKS and also accounts for
      potential closing inline elements in ALLOWED_INLINES.

      Args:
          text: The text (possibly containing HTML markup) to process.
          html: When true, insert ``NON_BREAKABLE_SPACE.html``; otherwise
              insert ``NON_BREAKABLE_SPACE.unicode``.

      Returns:
          The text with every space matched by ``widont_finder`` replaced by
          the selected spacer; text without any match is returned unchanged.

      Adapted from:
      https://github.com/mintchaos/typogrify/blob/
      20f693cbbb232ebc27733d9f1721a2cf1e7b25e3/typogrify/filters.py#L315-L368

      Improvements:
      * allow the unbreakable unicode character (new option)
      * add the mark element as a potential inline
      * avoid insertion of a non-breakable space if the sentence is a single word
      * externalized test suite
      """
      spacer = NON_BREAKABLE_SPACE.html if html else NON_BREAKABLE_SPACE.unicode

      def _glue(match):
          # Build the replacement by hand rather than through a template
          # string, so the spacer is always inserted literally and can never
          # be read as a backreference or an escape sequence.
          return f"{match.group(1)}{spacer}{match.group(2)}"

      return widont_finder.sub(_glue, text)