Repository with sources and generator of https://larlet.fr/david/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697
  1. import re
  2. from dataclasses import dataclass
  3. import regex # for the support of "\p{}"
  4. @dataclass
  5. class Charactere:
  6. unicode: str
  7. html: str
  8. NON_BREAKABLE_SPACE = Charactere(unicode="\u00a0", html=" ")
  9. # TODISCUSS: part of the configuration?
  10. # Complete list:
  11. # https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements
  12. # Intentionnaly excluded: address, article, aside, blockquote,
  13. # dialog, div, dl, fieldset, figure, footer, form, hgroup, hr, main,
  14. # nav, ol, pre, section, table, ul.
  15. COMPUTED_BLOCKS = "|".join(["details", "dt", "dd", "figcaption", "h[1-6]", "li", "p"])
  16. # Complete list:
  17. # https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elements#list_of_inline_elements
  18. # TODISCUSS: special treatment for `abbr` and `acronym`?
  19. # Intentionnaly excluded: audio, bdi, bdo, br, button, canvas, code, datalist,
  20. # embed, iframe, img, input, kbd, map, meter, noscript, object, picture,
  21. # progress, ruby, samp, script, select, slot, svg, template, textarea, time,
  22. # tt, var, video, wbr.
  23. ALLOWED_INLINES = "|".join(
  24. [
  25. "a",
  26. "b",
  27. "big",
  28. "cite",
  29. "data",
  30. "del",
  31. "dfn",
  32. "em",
  33. "i",
  34. "ins",
  35. "label",
  36. "mark",
  37. "output",
  38. "q",
  39. "s",
  40. "small",
  41. "span",
  42. "strong",
  43. "sub",
  44. "summary",
  45. "sup",
  46. "u",
  47. ]
  48. )
  49. # \p{} allows to match a character by its Unicode category
  50. # "Zs" is the category "Separator, space".
  51. widont_finder = regex.compile(
  52. rf"""(
  53. # must be preceded by:
  54. (?:</? # a closing tag
  55. (?:{ALLOWED_INLINES})[^>]*>) # an approved inline opening
  56. |[^<>\s\.,;:!\?] # or a nontag/nonspace and not punctuation
  57. )
  58. \p{{Zs}}+ # the space to replace
  59. ([^<>\s]+ # must be followed by non-tag non-space characters
  60. \s* # optional white space!
  61. (</({ALLOWED_INLINES})>\s*)* # optional closing inline tags with optional white space after each
  62. ((</({COMPUTED_BLOCKS})>)|$)) # end with a closing block or the end of the string
  63. """,
  64. re.VERBOSE,
  65. )
  66. def widont(text, html=False):
  67. """Replaces the space between the last two words with non-breakable space
  68. Works in block tags from COMPUTED_BLOCKS and also accounts for
  69. potential closing inline elements in ALLOWED_INLINES.
  70. Adapted from:
  71. https://github.com/mintchaos/typogrify/blob/
  72. 20f693cbbb232ebc27733d9f1721a2cf1e7b25e3/typogrify/filters.py#L315-L368
  73. Improvements:
  74. * allow the unbreakable unicode character (new option)
  75. * consider the support of all pertinent inline and block elements
  76. * avoid insertion of a non-breakable space if the sentence is a single word
  77. * externalized test suite
  78. """
  79. spacer = NON_BREAKABLE_SPACE.html if html else NON_BREAKABLE_SPACE.unicode
  80. output = widont_finder.sub(rf"\1{spacer}\2", text)
  81. return output