"""Avoid typographic widows by gluing the last two words of a block together."""

import re
from dataclasses import dataclass

try:
    import regex  # for the support of "\p{}"
except ImportError:  # the third-party engine is optional, see the fallback below
    regex = None


@dataclass
class Charactere:
    """A character available both as a raw Unicode string and as HTML markup."""

    unicode: str
    html: str


# The HTML form must be the entity itself: a literal whitespace character here
# would make the `html=True` mode of `widont` indistinguishable from the
# default Unicode mode.
NON_BREAKABLE_SPACE = Charactere(unicode="\u00a0", html="&nbsp;")

# TODISCUSS: part of the configuration?
# Complete list:
# https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements
# Intentionally excluded: address, article, aside, blockquote,
# dialog, div, dl, fieldset, figure, footer, form, hgroup, hr, main,
# nav, ol, pre, section, table, ul.
COMPUTED_BLOCKS = "|".join(["details", "dt", "dd", "figcaption", "h[1-6]", "li", "p"])

# Complete list:
# https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elements#list_of_inline_elements
# TODISCUSS: special treatment for `abbr` and `acronym`?
# Intentionally excluded: audio, bdi, bdo, br, button, canvas, code, datalist,
# embed, iframe, img, input, kbd, map, meter, noscript, object, picture,
# progress, ruby, samp, script, select, slot, svg, template, textarea, time,
# tt, var, video, wbr.
ALLOWED_INLINES = "|".join(
    [
        "a",
        "b",
        "big",
        "cite",
        "data",
        "del",
        "dfn",
        "em",
        "i",
        "ins",
        "label",
        "mark",
        "output",
        "q",
        "s",
        "small",
        "span",
        "strong",
        "sub",
        "summary",
        "sup",
        "u",
    ]
)

# \p{} allows to match a character by its Unicode category and
# "Zs" is the category "Separator, space". The stdlib `re` engine has no
# \p{} support, so when `regex` is not installed we spell out the code points
# of that category explicitly (escapes only: the pattern is compiled in
# VERBOSE mode).
if regex is not None:
    _engine = regex
    _SPACE_SEPARATOR = r"\p{Zs}"
else:
    _engine = re
    _SPACE_SEPARATOR = r"[\u0020\u00a0\u1680\u2000-\u200a\u202f\u205f\u3000]"

# Only groups 1 and 2 are capturing: they are the ones re-emitted by `widont`.
widont_finder = _engine.compile(
    rf"""(                                  # must be preceded by:
        (?:</?(?:{ALLOWED_INLINES})[^>]*>)  # an approved inline opening/closing tag
        |[^<>\s\.,;:!\?]                    # or a nontag/nonspace and not punctuation
    )
    {_SPACE_SEPARATOR}+                     # the space to replace
    ([^<>\s]+                               # must be followed by non-tag non-space characters
    \s*                                     # optional white space!
    (?:</(?:{ALLOWED_INLINES})>\s*)*        # optional closing inline tags
                                            # with optional white space after each
    (?:</(?:{COMPUTED_BLOCKS})>|$))         # end with a closing block or the end of
                                            # the string
    """,
    re.VERBOSE,
)


def widont(text: str, html: bool = False) -> str:
    """Replaces the space between the last two words with non-breakable space

    Works in block tags from COMPUTED_BLOCKS and also accounts for
    potential closing inline elements in ALLOWED_INLINES.

    Adapted from:
    https://github.com/mintchaos/typogrify/blob/
    20f693cbbb232ebc27733d9f1721a2cf1e7b25e3/typogrify/filters.py#L315-L368

    Improvements:

    * allow the unbreakable unicode character (new option)
    * consider the support of all pertinent inline and block elements
    * avoid insertion of a non-breakable space if the sentence is a single word
    * externalized test suite

    Args:
        text: plain text or HTML fragment to process.
        html: when true, insert the `&nbsp;` entity instead of the raw
            U+00A0 character.

    Returns:
        The text where the run of spaces before the last word of each
        computed block (or of the whole string) is replaced by a single
        non-breakable space. Text without such a spot is returned unchanged.
    """
    spacer = NON_BREAKABLE_SPACE.html if html else NON_BREAKABLE_SPACE.unicode
    # \1 is what precedes the space, \2 the last word up to the block end.
    return widont_finder.sub(rf"\1{spacer}\2", text)