# davidbgk / larlet-fr-david

import re
from dataclasses import dataclass

import regex  # for the support of "\p{}"


@dataclass(frozen=True)
class Charactere:
    """A character described by its two textual representations.

    Instances are frozen: they are meant to be shared as module-level
    constants, so attribute assignment is rejected and instances are
    hashable (usable in sets and as dictionary keys).
    """

    # The literal Unicode character (e.g. "\u00a0").
    unicode: str
    # The HTML representation of the same character (e.g. "&nbsp;").
    html: str


# The non-breakable space, available both as a raw character and as an
# HTML entity.
NON_BREAKABLE_SPACE = Charactere(html="&nbsp;", unicode="\u00a0")

# TODISCUSS: part of the configuration?
# Regex alternations of tag names, interpolated into the widont pattern:
# block elements whose closing tag may end a match...
COMPUTED_BLOCKS = "|".join(("p", "h[1-6]", "li", "dt", "dd"))
# ...and inline elements tolerated around the last word.
ALLOWED_INLINES = "|".join(("a", "em", "span", "strong", "i", "b", "mark"))

# Pattern locating the run of spaces that precedes the last word of a block.
#
# \p{} matches a character by its Unicode category;
# "Zs" is the category "Separator, space". The braces are doubled
# ("{{Zs}}") because the pattern is an f-string in which ALLOWED_INLINES and
# COMPUTED_BLOCKS are interpolated.
#
# Capture group 1 is the context right before the spaces: either an allowed
# inline tag (the "</?" prefix accepts opening as well as closing tags) or a
# single character that is not "<", ">", whitespace or a period.
# Capture group 2 is everything after the spaces: the last word, optional
# whitespace, optional closing inline tags, then a closing block tag or the
# end of the string.
widont_finder = regex.compile(
    rf"""(
        # must be preceded by:
        (?:</?                            # a closing tag
            (?:{ALLOWED_INLINES})[^>]*>)  # an approved inline opening
            |[^<>\s\.]                    # or a nontag/nonspace and not a point
        )
        \p{{Zs}}+                      # the space to replace
        ([^<>\s]+                      # must be followed by non-tag non-space characters
        \s*                            # optional white space!
        (</({ALLOWED_INLINES})>\s*)*   # optional closing inline tags with optional white space after each
        ((</({COMPUTED_BLOCKS})>)|$))  # end with a closing block or the end of the string
    """,
    re.VERBOSE,
)


def widont(text: str, html: bool = False) -> str:
    """Glue the last two words of a block with a non-breakable space.

    Every match of ``widont_finder`` in ``text`` has its run of space
    separators replaced by a single non-breakable space.  Matches end at
    the closing tag of a block listed in COMPUTED_BLOCKS or at the end of
    the string; closing inline elements from ALLOWED_INLINES located after
    the last word are accounted for.

    Args:
        text: the text (possibly containing HTML tags) to process.
        html: when true, insert the ``&nbsp;`` entity; otherwise insert the
            Unicode character U+00A0.

    Returns:
        The text with the matched spaces replaced.

    Adapted from:
    https://github.com/mintchaos/typogrify/blob/
    20f693cbbb232ebc27733d9f1721a2cf1e7b25e3/typogrify/filters.py#L315-L368

    Differences with the original implementation:

    * the unbreakable Unicode character can be emitted (new option)
    * the mark element is accepted as an inline
    * no non-breakable space is inserted if the sentence is a single word
    * the test suite is externalized
    """
    if html:
        spacer = NON_BREAKABLE_SPACE.html
    else:
        spacer = NON_BREAKABLE_SPACE.unicode

    # Group 1 is the context before the spaces, group 2 the trailing word
    # and tags; only the spaces in between are swapped for the spacer.
    return widont_finder.sub(rf"\1{spacer}\2", text)