2 years ago · cecc422af1
--- a/site.py
+++ b/site.py
@@ -290,7 +290,7 @@ class Page:
        md = markdown_with_h2_anchors if with_h2_anchors else markdown_with_img_sizes
        for file_name in sorted(each_markdown_from(source)):
            result = md.read(source / file_name)
            result = widont(result)
            result = widont(result, html=True)
            # Extract (and remove) the title from the generated page.
            title, content = result.split("</h1>", 1)
            h1_opening_size = len("<h1>")
--- a/test_widont.py
+++ b/test_widont.py
@@ -0,0 +1,76 @@
 import pytest


@pytest.mark.parametrize(
    "in_,out_unicode, out_html",
    [
        # Empty and whitespace-only inputs come back unchanged.
        ("", "", ""),
        (" ", " ", " "),
        # Plain text: the space before the last word becomes non-breaking.
        ("A very simple test", "A very simple\u00a0test", "A very simple&nbsp;test"),
        # Single-word items shouldn't be changed.
        ("Test", "Test", "Test"),
        (" Test", " Test", " Test"),
        # Single word wrapped in (mismatched) list markup: still unchanged.
        (
            "<ul><li>Test</p></li><ul>",
            "<ul><li>Test</p></li><ul>",
            "<ul><li>Test</p></li><ul>",
        ),
        (
            "<ul><li> Test</p></li><ul>",
            "<ul><li> Test</p></li><ul>",
            "<ul><li> Test</p></li><ul>",
        ),
        # Each paragraph gets its own substitution.
        (
            "<p>In a couple of paragraphs</p><p>paragraph two</p>",
            "<p>In a couple of\u00a0paragraphs</p><p>paragraph\u00a0two</p>",
            "<p>In a couple of&nbsp;paragraphs</p><p>paragraph&nbsp;two</p>",
        ),
        # Closing inline tags (and white space) after the last word are tolerated.
        (
            '<h1><a href="#">In a link inside a heading</i> </a></h1>',
            '<h1><a href="#">In a link inside a\u00a0heading</i> </a></h1>',
            '<h1><a href="#">In a link inside a&nbsp;heading</i> </a></h1>',
        ),
        # Text following an inline element inside a heading.
        (
            '<h1><a href="#">In a link</a> followed by other text</h1>',
            '<h1><a href="#">In a link</a> followed by other\u00a0text</h1>',
            '<h1><a href="#">In a link</a> followed by other&nbsp;text</h1>',
        ),
        # Empty HTML elements shouldn't error.
        (
            '<h1><a href="#"></a></h1>',
            '<h1><a href="#"></a></h1>',
            '<h1><a href="#"></a></h1>',
        ),
        # div and pre are not processed...
        (
            "<div>Divs get no love!</div>",
            "<div>Divs get no love!</div>",
            "<div>Divs get no love!</div>",
        ),
        (
            "<pre>Neither do PREs</pre>",
            "<pre>Neither do PREs</pre>",
            "<pre>Neither do PREs</pre>",
        ),
        # ...but a paragraph nested in a div is.
        (
            "<div><p>But divs with paragraphs do!</p></div>",
            "<div><p>But divs with paragraphs\u00a0do!</p></div>",
            "<div><p>But divs with paragraphs&nbsp;do!</p></div>",
        ),
        # <mark> counts as an inline element.
        (
            "<p>Avec <mark>mon ami Marc.</mark></p>",
            "<p>Avec <mark>mon ami\u00a0Marc.</mark></p>",
            "<p>Avec <mark>mon ami&nbsp;Marc.</mark></p>",
        ),
        # A space that follows a full stop is left alone.
        (
            "Vraiment. Bien.",
            "Vraiment. Bien.",
            "Vraiment. Bien.",
        ),
    ],
 )
 def test_widont(in_, out_unicode, out_html):
    """Check widont() against each table row, in both output modes.

    For every row: the default call must yield ``out_unicode``, running the
    default call again on that result must leave it unchanged, and the
    ``html=True`` call must yield ``out_html``.
    """
    from widont import widont

    assert widont(in_) == out_unicode
    assert widont(out_unicode) == out_unicode
    assert widont(in_, html=True) == out_html
    # TODO: also check that HTML mode leaves its own output unchanged
    # (the assertion below is currently disabled).
    # assert widont(out_html, html=True) == out_html
--- a/widont.py
+++ b/widont.py
@@ -1,63 +1,58 @@
 import re
 from dataclasses import dataclass

 import regex  # for the support of "\p{}"

 def widont(text):
    """Replaces the space between the last two words in a string with ``&nbsp;``
    Works in these block tags ``(h1-h6, p, li, dd, dt)`` and also accounts for
    potential closing inline elements ``a, em, strong, span, b, i, mark``

    Extracted from:
@dataclass
 class Charactere:
    """A single character described by its two spellings: Unicode and HTML."""

    # The literal character, used when plain (non-HTML) output is requested.
    unicode: str
    # The HTML spelling of the same character (e.g. an entity such as "&nbsp;").
    html: str


 # The non-breaking space in both spellings: the U+00A0 character itself and
 # its HTML entity.
 NON_BREAKABLE_SPACE = Charactere(unicode="\u00a0", html="&nbsp;")

 # Tag names, pre-joined with "|" so they can be dropped into the pattern below
 # as regex alternations:
 # * COMPUTED_BLOCKS: block elements whose closing tag may terminate a match;
 # * ALLOWED_INLINES: inline elements tolerated before the space and as
 #   closing tags after the last word.
 # TODISCUSS: should these be part of the configuration?
 COMPUTED_BLOCKS = "|".join(["p", "h[1-6]", "li", "dt", "dd"])
 ALLOWED_INLINES = "|".join(["a", "em", "span", "strong", "i", "b", "mark"])

 # \p{} matches a character by its Unicode category;
 # "Zs" is the category "Separator, space".
 # The braces are doubled in the pattern because it is an f-string.
 #
 # Capture groups used by the caller:
 # * group 1: what precedes the space (an allowed inline tag, or one character
 #   that is neither a tag delimiter, white space nor a full stop);
 # * group 2: the last word, optional trailing white space and closing inline
 #   tags, up to the closing block tag or the end of the string.
 # Note that `</?` makes the slash optional, so both opening and closing
 # inline tags qualify as the token preceding the space.
 # NOTE(review): the flag constant comes from `re` while the pattern is compiled
 # by `regex`; this presumably relies on both modules sharing flag values.
 widont_finder = regex.compile(
    rf"""(
        # must be preceded by:
        (?:</?                            # a closing tag
            (?:{ALLOWED_INLINES})[^>]*>)  # an approved inline opening
            |[^<>\s\.]                    # or a nontag/nonspace and not a point
        )
        \p{{Zs}}+                      # the space to replace
        ([^<>\s]+                      # must be followed by non-tag non-space characters
        \s*                            # optional white space!
        (</({ALLOWED_INLINES})>\s*)*   # optional closing inline tags with optional white space after each
        ((</({COMPUTED_BLOCKS})>)|$))  # end with a closing block or the end of the string
    """,
    re.VERBOSE,
 )


 def widont(text, html=False):
    """Replace the space between the last two words with a non-breaking space.

    Works in the block tags listed in COMPUTED_BLOCKS and also accounts for
    potential closing inline elements listed in ALLOWED_INLINES. Matching is
    done by the module-level ``widont_finder`` pattern, so the tag lists live
    in a single place and the pattern is compiled only once.

    Adapted from:
    https://github.com/mintchaos/typogrify/blob/
    20f693cbbb232ebc27733d9f1721a2cf1e7b25e3/typogrify/filters.py#L315-L368

    Improvements over the original filter:

    * allow the non-breaking Unicode character as output (``html`` option)
    * add the mark element as a potential inline
    * avoid insertion of a non-breaking space if the sentence is a single word
    * externalized test suite (see test_widont.py)

    Args:
        text: The plain-text or HTML string to process.
        html: When true, insert the ``&nbsp;`` entity; otherwise (default)
            insert the U+00A0 character itself.

    Returns:
        A new string with the relevant spaces replaced; the input is returned
        unchanged when nothing matches.

    Examples (HTML mode, so that the result is visible in plain ASCII):

    >>> widont('A very simple test', html=True)
    'A very simple&nbsp;test'
    >>> widont('Test', html=True)
    'Test'

    Note: re-running HTML mode over text that already contains ``&nbsp;`` is
    not covered by the test suite yet.
    """
    spacer = NON_BREAKABLE_SPACE.html if html else NON_BREAKABLE_SPACE.unicode
    # Group 1 is what precedes the space; group 2 is the last word together
    # with any trailing closing tags. Only the space between them is swapped.
    return widont_finder.sub(rf"\1{spacer}\2", text)