"""Typographic helper preventing "widows" (a lone last word on its own line)."""

import re
from dataclasses import dataclass

try:
    import regex  # for the support of "\p{}"
except ImportError:  # optional: fall back on the standard library below
    regex = None


@dataclass
class Charactere:
    """A typographic character available in two output flavours."""

    unicode: str  # the literal character, for plain-text output
    html: str  # the HTML entity, for markup output


NON_BREAKABLE_SPACE = Charactere(unicode="\u00a0", html="&nbsp;")

# TODISCUSS: part of the configuration?
COMPUTED_BLOCKS = "|".join(["p", "h[1-6]", "li", "dt", "dd"])
ALLOWED_INLINES = "|".join(["a", "em", "span", "strong", "i", "b", "mark"])

# \p{} allows to match a character by its Unicode category
# "Zs" is the category "Separator, space".
# The standard library `re` has no \p{} support, so when `regex` is not
# installed the code points of that category are spelled out explicitly
# (as escapes, because literal spaces are ignored in verbose patterns).
if regex is not None:
    _engine = regex
    _SPACE_SEPARATOR = r"\p{Zs}"
else:
    _engine = re
    _SPACE_SEPARATOR = r"[\x20\xa0\u1680\u2000-\u200a\u202f\u205f\u3000]"

# Group 1 is what precedes the space, group 2 is everything after it up to
# the end of the block (or of the string); only the space itself is dropped.
widont_finder = _engine.compile(
    rf"""(                                  # must be preceded by:
        (?:</?(?:{ALLOWED_INLINES})[^>]*>)  # an approved inline opening or closing tag
        |[^<>\s\.]                          # or a nontag/nonspace and not a point
    )
    {_SPACE_SEPARATOR}+                     # the space to replace
    ([^<>\s]+                               # must be followed by non-tag non-space characters
    \s*                                     # optional white space!
    (</({ALLOWED_INLINES})>\s*)*            # optional closing inline tags with optional white space after each
    ((</({COMPUTED_BLOCKS})>)|$))           # end with a closing block or the end of the string
    """,
    re.VERBOSE,
)


def widont(text: str, html: bool = False) -> str:
    """Replaces the space between the last two words with non-breakable space

    Works in block tags from COMPUTED_BLOCKS and also accounts for
    potential closing inline elements in ALLOWED_INLINES.

    Adapted from:
    https://github.com/mintchaos/typogrify/blob/
    20f693cbbb232ebc27733d9f1721a2cf1e7b25e3/typogrify/filters.py#L315-L368

    Improvements:

    * allow the unbreakable unicode character (new option)
    * add the mark element as a potential inline
    * avoid insertion of a non-breakable space if the sentence
      is a single word
    * externalized test suite

    Args:
        text: the plain text or HTML fragment to process.
        html: when true, insert the ``&nbsp;`` entity; otherwise insert the
            U+00A0 character itself.

    Returns:
        The text with the last space of each matched block (or of the whole
        string) replaced; the input is returned unchanged when nothing
        matches (single word, space right after a point, ...).
    """
    spacer = NON_BREAKABLE_SPACE.html if html else NON_BREAKABLE_SPACE.unicode
    # \1 and \2 re-emit what surrounds the matched space untouched.
    output = widont_finder.sub(rf"\1{spacer}\2", text)
    return output