Repository with sources and generator of https://larlet.fr/david/ https://larlet.fr/david/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

widont.py 3.0KB

2 weeks ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899
  1. import re
  2. from dataclasses import dataclass
  3. import regex # for the support of "\p{}"
  4. @dataclass
  5. class Charactere:
  6. unicode: str
  7. html: str
  8. NON_BREAKABLE_SPACE = Charactere(unicode="\u00a0", html=" ")
  9. # TODISCUSS: part of the configuration?
  10. # Complete list:
  11. # https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements
  12. # Intentionnaly excluded: address, article, aside, blockquote,
  13. # dialog, div, dl, fieldset, figure, footer, form, hgroup, hr, main,
  14. # nav, ol, pre, section, table, ul.
  15. COMPUTED_BLOCKS = "|".join(["details", "dt", "dd", "figcaption", "h[1-6]", "li", "p"])
  16. # Complete list:
  17. # https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elements#list_of_inline_elements
  18. # TODISCUSS: special treatment for `abbr` and `acronym`?
  19. # Intentionnaly excluded: audio, bdi, bdo, br, button, canvas, code, datalist,
  20. # embed, iframe, img, input, kbd, map, meter, noscript, object, picture,
  21. # progress, ruby, samp, script, select, slot, svg, template, textarea, time,
  22. # tt, var, video, wbr.
  23. ALLOWED_INLINES = "|".join(
  24. [
  25. "a",
  26. "b",
  27. "big",
  28. "cite",
  29. "data",
  30. "del",
  31. "dfn",
  32. "em",
  33. "i",
  34. "ins",
  35. "label",
  36. "mark",
  37. "output",
  38. "q",
  39. "s",
  40. "small",
  41. "span",
  42. "strong",
  43. "sub",
  44. "summary",
  45. "sup",
  46. "u",
  47. ]
  48. )
  49. # \p{} allows to match a character by its Unicode category
  50. # "Zs" is the category "Separator, space".
  51. widont_finder = regex.compile(
  52. rf"""(
  53. # must be preceded by:
  54. (?:</? # a closing tag
  55. (?:{ALLOWED_INLINES})[^>]*>) # an approved inline opening
  56. |[^<>\s\.,;:!\?] # or a nontag/nonspace and not punctuation
  57. )
  58. \p{{Zs}}+ # the space to replace
  59. ([^<>\s]+ # must be followed by non-tag non-space characters
  60. \s* # optional white space!
  61. (</({ALLOWED_INLINES})>\s*)* # optional closing inline tags
  62. # with optional white space after each
  63. ((</({COMPUTED_BLOCKS})>)|$)) # end with a closing block or the end of
  64. # the string
  65. """,
  66. re.VERBOSE,
  67. )
  68. def widont(text, html=False):
  69. """Replaces the space between the last two words with non-breakable space
  70. Works in block tags from COMPUTED_BLOCKS and also accounts for
  71. potential closing inline elements in ALLOWED_INLINES.
  72. Adapted from:
  73. https://github.com/mintchaos/typogrify/blob/
  74. 20f693cbbb232ebc27733d9f1721a2cf1e7b25e3/typogrify/filters.py#L315-L368
  75. Improvements:
  76. * allow the unbreakable unicode character (new option)
  77. * consider the support of all pertinent inline and block elements
  78. * avoid insertion of a non-breakable space if the sentence is a single word
  79. * externalized test suite
  80. """
  81. spacer = NON_BREAKABLE_SPACE.html if html else NON_BREAKABLE_SPACE.unicode
  82. output = widont_finder.sub(rf"\1{spacer}\2", text)
  83. return output