123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899 |
- import re
- from dataclasses import dataclass
-
- import regex # for the support of "\p{}"
-
-
@dataclass
class Charactere:
    """A typographic character in its two output representations."""

    # The literal Unicode character, for plain-text output.
    unicode: str
    # The markup to emit instead when the output is HTML.
    html: str


# The HTML form must be the named entity, not a literal space: otherwise the
# ``html=True`` mode of ``widont`` cannot produce HTML markup at all.
NON_BREAKABLE_SPACE = Charactere(unicode="\u00a0", html="&nbsp;")
-
# TODISCUSS: part of the configuration?

# Block-level elements whose closing tag marks the end of a run of text.
# Complete list:
# https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements
# Intentionally excluded: address, article, aside, blockquote,
# dialog, div, dl, fieldset, figure, footer, form, hgroup, hr, main,
# nav, ol, pre, section, table, ul.
_COMPUTED_BLOCK_TAGS = ("details", "dt", "dd", "figcaption", "h[1-6]", "li", "p")

# Regex alternation of the tag names above (``h[1-6]`` is itself a pattern).
COMPUTED_BLOCKS = "|".join(_COMPUTED_BLOCK_TAGS)
-
# Inline elements that may wrap (or close right after) the last words.
# Complete list:
# https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elements#list_of_inline_elements
# TODISCUSS: special treatment for `abbr` and `acronym`?
# Intentionally excluded: audio, bdi, bdo, br, button, canvas, code, datalist,
# embed, iframe, img, input, kbd, map, meter, noscript, object, picture,
# progress, ruby, samp, script, select, slot, svg, template, textarea, time,
# tt, var, video, wbr.
_ALLOWED_INLINE_TAGS = (
    "a",
    "b",
    "big",
    "cite",
    "data",
    "del",
    "dfn",
    "em",
    "i",
    "ins",
    "label",
    "mark",
    "output",
    "q",
    "s",
    "small",
    "span",
    "strong",
    "sub",
    "summary",
    "sup",
    "u",
)

# Regex alternation of the tag names above.
ALLOWED_INLINES = "|".join(_ALLOWED_INLINE_TAGS)
-
# Pattern locating the space separator(s) that precede the last word of a
# block (or of the whole string).
#
# \p{} allows to match a character by its Unicode category
# "Zs" is the category "Separator, space".
#
# Capture groups used by the substitution:
#   * group 1: what precedes the space (an inline tag, or a single character
#     that is neither a tag delimiter, whitespace, nor punctuation);
#   * group 2: the last word, any trailing closing inline tags, and the
#     closing block tag (or end of string).
#
# NOTE(review): in the first alternative the tag name is not anchored, so
# `(?:a|b|...)[^>]*>` also accepts tags that merely start with an allowed
# name (e.g. `<br>` via `b`, `<img>` via `i`). Left as-is to preserve the
# current matching behaviour -- confirm whether this is intended.
widont_finder = regex.compile(
    rf"""(
    # must be preceded by:
    (?:</?                         # a closing tag
    (?:{ALLOWED_INLINES})[^>]*>)   # an approved inline opening
    |[^<>\s\.,;:!\?]               # or a nontag/nonspace and not punctuation
    )
    \p{{Zs}}+                      # the space to replace
    ([^<>\s]+                      # must be followed by non-tag non-space characters
    \s*                            # optional white space!
    (</({ALLOWED_INLINES})>\s*)*   # optional closing inline tags
                                   # with optional white space after each
    ((</({COMPUTED_BLOCKS})>)|$))  # end with a closing block or the end of
                                   # the string
    """,
    re.VERBOSE,
)
-
-
def widont(text, html=False):
    """Join the last two words of a text with a non-breakable space.

    The substitution is driven by ``widont_finder``: it applies at the end
    of the block tags listed in COMPUTED_BLOCKS (or at the end of the
    string) and tolerates closing inline elements from ALLOWED_INLINES
    after the last word.

    When ``html`` is true the HTML form of NON_BREAKABLE_SPACE is inserted,
    otherwise its unicode form. The transformed text is returned.

    Adapted from:
    https://github.com/mintchaos/typogrify/blob/
    20f693cbbb232ebc27733d9f1721a2cf1e7b25e3/typogrify/filters.py#L315-L368

    Improvements over the original:

    * allow the unbreakable unicode character (new option)
    * consider the support of all pertinent inline and block elements
    * avoid insertion of a non-breakable space if the sentence is a single word
    * externalized test suite
    """
    if html:
        spacer = NON_BREAKABLE_SPACE.html
    else:
        spacer = NON_BREAKABLE_SPACE.unicode

    # Keep both captured sides untouched; only the separator between them
    # is swapped for the chosen spacer.
    return widont_finder.sub(rf"\1{spacer}\2", text)
|