12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758 |
- import re
- from dataclasses import dataclass
-
- import regex # for the support of "\p{}"
-
-
@dataclass
class Charactere:
    """Hold the two textual renderings of one typographic character."""

    # Rendering as a plain Unicode character.
    unicode: str
    # Rendering as HTML markup.
    html: str
-
-
# The no-break space in both forms: the raw U+00A0 character, and the HTML
# named character reference used when the output is markup. The HTML form
# must be the entity itself, not a decoded space, otherwise replacing a space
# "with HTML" is a no-op.
NON_BREAKABLE_SPACE = Charactere(unicode="\u00a0", html="&nbsp;")

# TODISCUSS: part of the configuration?
# Both constants are regex alternations of tag names (hence "h[1-6]"); they
# are interpolated as-is into the `widont_finder` pattern.
COMPUTED_BLOCKS = "|".join(["p", "h[1-6]", "li", "dt", "dd"])
ALLOWED_INLINES = "|".join(["a", "em", "span", "strong", "i", "b", "mark"])
-
# \p{} matches a character by its Unicode category; "Zs" is the category
# "Separator, space". This syntax is the reason the `regex` module is used
# here instead of the standard `re`.
#
# Group 1 is what precedes the space(s) to replace, group 2 is the last word
# together with everything up to the closing block tag or the end of the
# string; the substitution re-emits both around the non-breakable space.
#
# The `\b` after the inline tag name makes the whole name match: without it,
# `<br>`, `<img ...>` or `<abbr>` were accepted through the one-letter
# alternatives `b`, `i` and `a`.
widont_finder = regex.compile(
    rf"""(
        # must be preceded by:
        (?:</?                      # an opening or closing tag...
        (?:{ALLOWED_INLINES})\b     # ...whose full name is an approved inline
        [^>]*>)                     # with optional attributes
        |[^<>\s\.]                  # or a nontag/nonspace and not a point
    )
    \p{{Zs}}+                       # the space(s) to replace
    ([^<>\s]+                       # must be followed by non-tag non-space characters
    \s*                             # optional white space!
    (</({ALLOWED_INLINES})>\s*)*    # optional closing inline tags, each with optional white space
    ((</({COMPUTED_BLOCKS})>)|$))   # end with a closing block or the end of the string
    """,
    # Use the flag from the module that compiles the pattern rather than
    # relying on `re` and `regex` sharing the same flag value.
    regex.VERBOSE,
)
-
-
def widont(text: str, html: bool = False) -> str:
    """Tie the last two words together with a non-breakable space.

    Every place matched by ``widont_finder`` has its separating space(s)
    replaced by ``NON_BREAKABLE_SPACE``: its ``html`` form when ``html`` is
    true, its ``unicode`` form otherwise. The pattern targets the end of
    the block tags listed in COMPUTED_BLOCKS (or the end of the string) and
    tolerates closing inline elements from ALLOWED_INLINES after the last
    word.

    Adapted from:
    https://github.com/mintchaos/typogrify/blob/
    20f693cbbb232ebc27733d9f1721a2cf1e7b25e3/typogrify/filters.py#L315-L368

    Differences with the original implementation:

    * the Unicode non-breakable character can be emitted (new option)
    * the mark element is accepted as an inline
    * no non-breakable space is inserted when the sentence is a single word
    * the test suite lives outside of the docstring

    Args:
        text: the plain or HTML text to process.
        html: emit the HTML form of the spacer instead of the Unicode one.

    Returns:
        The text with the matched spaces replaced.
    """
    if html:
        spacer = NON_BREAKABLE_SPACE.html
    else:
        spacer = NON_BREAKABLE_SPACE.unicode

    # \1 and \2 re-emit what surrounds the replaced space(s).
    return widont_finder.sub(rf"\1{spacer}\2", text)
|