# davidbgk/larlet-fr-david

import re
from dataclasses import dataclass

import regex  # for the support of "\p{}"


@dataclass
class Charactere:
    """One character described by two equivalent spellings.

    Instances are plain records; callers pick whichever spelling suits the
    output they are producing (raw text or HTML markup).
    """

    # The character itself, as a Python string (e.g. "\u00a0").
    unicode: str
    # The HTML spelling of the same character (e.g. "&nbsp;").
    html: str


# The non-breakable space (U+00A0), available as a raw character and as
# its HTML entity.
NON_BREAKABLE_SPACE = Charactere(
    unicode="\u00a0",
    html="&nbsp;",
)

# TODISCUSS: part of the configuration?

# Regex alternation of the block-level tag names whose closing tag ends a
# run of text in `widont_finder`.
# Complete list:
# https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements
# Intentionally excluded: address, article, aside, blockquote,
# dialog, div, dl, fieldset, figure, footer, form, hgroup, hr, main,
# nav, ol, pre, section, table, ul.
COMPUTED_BLOCKS = "|".join(
    (
        "details",
        "dt",
        "dd",
        "figcaption",
        "h[1-6]",
        "li",
        "p",
    )
)

# Regex alternation of the inline tag names that `widont_finder` tolerates
# around the last words of a text run.
# Complete list:
# https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elements#list_of_inline_elements
# TODISCUSS: special treatment for `abbr` and `acronym`?
# Intentionally excluded: audio, bdi, bdo, br, button, canvas, code, datalist,
# embed, iframe, img, input, kbd, map, meter, noscript, object, picture,
# progress, ruby, samp, script, select, slot, svg, template, textarea, time,
# tt, var, video, wbr.
ALLOWED_INLINES = "|".join(
    """
    a b big cite data del dfn em i ins label mark output q s small span
    strong sub summary sup u
    """.split()
)

# Locates the last inter-word gap of a text run so that `widont` can replace
# it with a non-breakable space.
#
# Capture groups used by the substitution in `widont`:
#   1 -- what precedes the gap (an allowed inline tag or an ordinary character)
#   2 -- the last word, any closing allowed inline tags, and the terminator
#
# \p{} allows to match a character by its Unicode category
# "Zs" is the category "Separator, space".
widont_finder = regex.compile(
    rf"""(
        # must be preceded by:
        (?:</?                            # an opening or closing tag
            (?:{ALLOWED_INLINES})         # whose name is an approved inline
            (?=[\s/>])                    # and stops here, so "b" cannot match "br"
            [^>]*>)                       # then optional attributes up to the ">"
            |[^<>\s\.,;:!\?]              # or a nontag/nonspace and not punctuation
        )
        \p{{Zs}}+                      # the space to replace
        ([^<>\s]+                      # must be followed by non-tag non-space characters
        \s*                            # optional white space!
        (</({ALLOWED_INLINES})>\s*)*   # optional closing inline tags with optional white space after each
        ((</({COMPUTED_BLOCKS})>)|$))  # end with a closing block or the end of the string
    """,
    re.VERBOSE,
)


def widont(text, html=False):
    """Join the last two words of each text run with a non-breakable space.

    Every gap located by ``widont_finder`` (one or more space separators
    sitting before the final word) is replaced by a single non-breakable
    space. A run ends at the closing tag of a block from COMPUTED_BLOCKS or
    at the end of the string; closing inline tags from ALLOWED_INLINES may
    stand between the final word and that end.

    Args:
        text: the string to process (plain text or an HTML fragment).
        html: when true, the ``&nbsp;`` entity is inserted; otherwise the
            U+00A0 character itself is used.

    Returns:
        The processed string.

    Adapted from:
    https://github.com/mintchaos/typogrify/blob/
    20f693cbbb232ebc27733d9f1721a2cf1e7b25e3/typogrify/filters.py#L315-L368

    Differences from that implementation:

    * the unbreakable unicode character can be emitted (new option)
    * all pertinent inline and block elements are taken into account
    * no non-breakable space is inserted when the sentence is a single word
    * the test suite lives outside of the docstring
    """
    if html:
        nbsp = NON_BREAKABLE_SPACE.html
    else:
        nbsp = NON_BREAKABLE_SPACE.unicode

    # \1 is whatever precedes the gap and \2 is the tail of the run; only the
    # gap between them is swapped.
    return widont_finder.sub(rf"\1{nbsp}\2", text)