Browse Source

Rewrite a good part of widont

master
David Larlet 2 years ago
parent
commit
cecc422af1
3 changed files with 124 additions and 53 deletions
  1. 1
    1
      site.py
  2. 76
    0
      test_widont.py
  3. 47
    52
      widont.py

+ 1
- 1
site.py View File

@@ -290,7 +290,7 @@ class Page:
md = markdown_with_h2_anchors if with_h2_anchors else markdown_with_img_sizes
for file_name in sorted(each_markdown_from(source)):
result = md.read(source / file_name)
result = widont(result)
result = widont(result, html=True)
# Extract (and remove) the title from the generated page.
title, content = result.split("</h1>", 1)
h1_opening_size = len("<h1>")

+ 76
- 0
test_widont.py View File

@@ -0,0 +1,76 @@
import pytest


@pytest.mark.parametrize(
    "in_,out_unicode, out_html",
    [
        # Empty / whitespace-only / single-word inputs are left untouched.
        ("", "", ""),
        (" ", " ", " "),
        ("A very simple test", "A very simple\u00a0test", "A very simple&nbsp;test"),
        ("Test", "Test", "Test"),
        (" Test", " Test", " Test"),
        (
            "<ul><li>Test</p></li><ul>",
            "<ul><li>Test</p></li><ul>",
            "<ul><li>Test</p></li><ul>",
        ),
        (
            "<ul><li> Test</p></li><ul>",
            "<ul><li> Test</p></li><ul>",
            "<ul><li> Test</p></li><ul>",
        ),
        (
            "<p>In a couple of paragraphs</p><p>paragraph two</p>",
            "<p>In a couple of\u00a0paragraphs</p><p>paragraph\u00a0two</p>",
            "<p>In a couple of&nbsp;paragraphs</p><p>paragraph&nbsp;two</p>",
        ),
        (
            '<h1><a href="#">In a link inside a heading</i> </a></h1>',
            '<h1><a href="#">In a link inside a\u00a0heading</i> </a></h1>',
            '<h1><a href="#">In a link inside a&nbsp;heading</i> </a></h1>',
        ),
        (
            '<h1><a href="#">In a link</a> followed by other text</h1>',
            '<h1><a href="#">In a link</a> followed by other\u00a0text</h1>',
            '<h1><a href="#">In a link</a> followed by other&nbsp;text</h1>',
        ),
        # Empty HTML elements must not raise.
        (
            '<h1><a href="#"></a></h1>',
            '<h1><a href="#"></a></h1>',
            '<h1><a href="#"></a></h1>',
        ),
        # Blocks that are not handled stay unchanged.
        (
            "<div>Divs get no love!</div>",
            "<div>Divs get no love!</div>",
            "<div>Divs get no love!</div>",
        ),
        (
            "<pre>Neither do PREs</pre>",
            "<pre>Neither do PREs</pre>",
            "<pre>Neither do PREs</pre>",
        ),
        (
            "<div><p>But divs with paragraphs do!</p></div>",
            "<div><p>But divs with paragraphs\u00a0do!</p></div>",
            "<div><p>But divs with paragraphs&nbsp;do!</p></div>",
        ),
        # The mark element is accepted as a closing inline.
        (
            "<p>Avec <mark>mon ami Marc.</mark></p>",
            "<p>Avec <mark>mon ami\u00a0Marc.</mark></p>",
            "<p>Avec <mark>mon ami&nbsp;Marc.</mark></p>",
        ),
        # A single word following a period is not glued to the previous sentence.
        (
            "Vraiment. Bien.",
            "Vraiment. Bien.",
            "Vraiment. Bien.",
        ),
    ],
)
def test_widont(in_, out_unicode, out_html):
    """Check `widont` in both output modes against each parametrized case.

    Args:
        in_: the raw input string.
        out_unicode: expected result of the default call (U+00A0 character).
        out_html: expected result with ``html=True`` (``&nbsp;`` entity).
    """
    from widont import widont

    assert widont(in_) == out_unicode
    # Re-running on the Unicode output must not change it.
    assert widont(out_unicode) == out_unicode
    assert widont(in_, html=True) == out_html
    # TODO: the HTML mode is not idempotent yet. NOTE(review): presumably a
    # second pass sees "&nbsp;" as ordinary word characters and replaces the
    # previous space as well -- confirm before enabling the assertion below.
    # assert widont(out_html, html=True) == out_html

+ 47
- 52
widont.py View File

@@ -1,63 +1,58 @@
import re
from dataclasses import dataclass

import regex # for the support of "\p{}"

def widont(text):
"""Replaces the space between the last two words in a string with ``&nbsp;``
Works in these block tags ``(h1-h6, p, li, dd, dt)`` and also accounts for
potential closing inline elements ``a, em, strong, span, b, i, mark``

Extracted from:
@dataclass
class Charactere:
    """A character described by its two textual representations."""

    # The character itself, as a Python string (e.g. "\u00a0").
    unicode: str
    # The HTML entity standing for the same character (e.g. "&nbsp;").
    html: str


# The non-breakable space in both representations.
NON_BREAKABLE_SPACE = Charactere(unicode="\u00a0", html="&nbsp;")

# Regex alternations: block tags whose last space gets replaced, and inline
# tags tolerated around the last word.
# TODISCUSS: should these be part of the configuration?
COMPUTED_BLOCKS = "|".join(
    (
        "p",
        "h[1-6]",
        "li",
        "dt",
        "dd",
    )
)
ALLOWED_INLINES = "|".join(
    (
        "a",
        "em",
        "span",
        "strong",
        "i",
        "b",
        "mark",
    )
)

# Pattern locating the space(s) between the last two words of a block.
#
# \p{} matches a character by its Unicode category;
# "Zs" is the category "Separator, space".
#
# Capture groups:
#   \1 - what precedes the space: an opening OR closing tag from
#        ALLOWED_INLINES (the `</?` makes the slash optional, despite the
#        wording of the in-pattern comments), or a single character that is
#        not `<`, `>`, whitespace or a period.
#   \2 - everything after the space: the last word, optional whitespace,
#        optional closing inline tags, then a closing tag from
#        COMPUTED_BLOCKS or the end of the string.
widont_finder = regex.compile(
rf"""(
# must be preceded by:
(?:</? # a closing tag
(?:{ALLOWED_INLINES})[^>]*>) # an approved inline opening
|[^<>\s\.] # or a nontag/nonspace and not a point
)
\p{{Zs}}+ # the space to replace
([^<>\s]+ # must be followed by non-tag non-space characters
\s* # optional white space!
(</({ALLOWED_INLINES})>\s*)* # optional closing inline tags with optional white space after each
((</({COMPUTED_BLOCKS})>)|$)) # end with a closing block or the end of the string
""",
# NOTE(review): the flag comes from the stdlib `re` module while the pattern
# is compiled by `regex` -- confirm both modules share the same flag value.
re.VERBOSE,
)


def widont(text, html=False):
    """Replace the space between the last two words with a non-breakable space.

    Works in block tags from COMPUTED_BLOCKS and also accounts for
    potential closing inline elements in ALLOWED_INLINES.

    Adapted from:
    https://github.com/mintchaos/typogrify/blob/
    20f693cbbb232ebc27733d9f1721a2cf1e7b25e3/typogrify/filters.py#L315-L368

    Improvements:

    * allow the non-breakable Unicode character (new ``html`` option)
    * add the mark element as a potential inline
    * avoid insertion of a non-breakable space if the sentence is a single word
    * externalized test suite

    Args:
        text: the plain-text or HTML string to process.
        html: when true, insert the ``&nbsp;`` entity; otherwise (the
            default) insert the U+00A0 character.

    Returns:
        The string with every space matched by ``widont_finder`` replaced.
    """
    # Use the module-level compiled pattern; the text around the space is
    # captured in groups 1 and 2 and put back on each side of the spacer.
    spacer = NON_BREAKABLE_SPACE.html if html else NON_BREAKABLE_SPACE.unicode
    return widont_finder.sub(rf"\1{spacer}\2", text)

Loading…
Cancel
Save