|
|
|
|
|
|
|
|
import re |
|
|
import re |
|
|
|
|
|
from dataclasses import dataclass |
|
|
|
|
|
|
|
|
|
|
|
import regex # for the support of "\p{}" |
|
|
|
|
|
|
|
|
def widont(text): |
|
|
|
|
|
"""Replaces the space between the last two words in a string with `` `` |
|
|
|
|
|
Works in these block tags ``(h1-h6, p, li, dd, dt)`` and also accounts for |
|
|
|
|
|
potential closing inline elements ``a, em, strong, span, b, i, mark`` |
|
|
|
|
|
|
|
|
|
|
|
Extracted from: |
|
|
|
|
|
|
|
|
@dataclass
class Charactere:
    """A single character described by its plain-text and HTML renderings."""

    unicode: str  # the character as a Unicode string, for plain-text output
    html: str  # the markup emitted instead when HTML output is requested


# The HTML rendering must be the named entity: a bare (non-breaking) space
# would make the `html` and `unicode` outputs indistinguishable.
NON_BREAKABLE_SPACE = Charactere(unicode="\u00a0", html="&nbsp;")
|
|
|
|
|
|
|
|
|
|
|
# TODISCUSS: part of the configuration?
# Both constants are regex alternations ("a|b|c") meant to be interpolated
# into a larger pattern, hence the "h[1-6]" character class for headings.

# Block-level tag names.
COMPUTED_BLOCKS = "|".join(["p", "h[1-6]", "li", "dt", "dd"])

# Inline tag names.
ALLOWED_INLINES = "|".join(["a", "em", "span", "strong", "i", "b", "mark"])
|
|
|
|
|
|
|
|
|
|
|
# \p{} allows to match a character by its Unicode category; it is supported
# by the third-party `regex` module, not by the standard `re` module.
# "Zs" is the category "Separator, space".
#
# Group 1 is the context before the space, group 2 is everything after it
# (last word, optional closing inline tags, closing block tag or end of
# string). The separator run itself is outside both groups so a substitution
# can rebuild the text as: group 1 + replacement + group 2.
widont_finder = regex.compile(
    rf"""(
        # must be preceded by:
        (?:</?                        # an opening or closing tag
        (?:{ALLOWED_INLINES})[^>]*>)  # of an approved inline element
        |[^<>\s\.]                    # or a nontag/nonspace and not a point
    )
    \p{{Zs}}+                         # the space to replace
    ([^<>\s]+                         # must be followed by non-tag non-space characters
    \s*                               # optional white space!
    (</({ALLOWED_INLINES})>\s*)*      # optional closing inline tags with optional white space after each
    ((</({COMPUTED_BLOCKS})>)|$))     # end with a closing block or the end of the string
    """,
    # Take the flag from the module that compiles the pattern.
    regex.VERBOSE,
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def widont(text: str, html: bool = False) -> str:
    r"""Replace the space between the last two words with a non-breakable space.

    The substitution is applied to every match of the module-level
    ``widont_finder`` pattern: before the last word at the end of the string
    and before the last word of each block closed by a tag from
    ``COMPUTED_BLOCKS``, also accounting for potential closing inline
    elements from ``ALLOWED_INLINES`` after that word.

    Args:
        text: The string (plain text or HTML fragment) to process.
        html: When true, insert ``NON_BREAKABLE_SPACE.html``; otherwise
            (the default) insert ``NON_BREAKABLE_SPACE.unicode``.

    Returns:
        A new string with the matched spaces replaced; ``text`` unchanged
        when nothing matches.

    Examples:

    >>> widont('A very simple test')
    'A very simple\xa0test'

    Single word items shouldn't be changed

    >>> widont('Test')
    'Test'

    A space following a point is left alone

    >>> widont('Vraiment. Bien.')
    'Vraiment. Bien.'

    >>> widont('<p>In a couple of paragraphs</p><p>paragraph two</p>')
    '<p>In a couple of\xa0paragraphs</p><p>paragraph\xa0two</p>'

    Adapted from:
    https://github.com/mintchaos/typogrify/blob/
    20f693cbbb232ebc27733d9f1721a2cf1e7b25e3/typogrify/filters.py#L315-L368

    Improvements:

    * allow the unbreakable unicode character (new option)
    * add the mark element as a potential inline
    * avoid insertion of a non-breakable space if the sentence is a single word
    * externalized test suite
    """
    spacer = NON_BREAKABLE_SPACE.html if html else NON_BREAKABLE_SPACE.unicode
    # \g<n> keeps the group references unambiguous whatever the spacer
    # starts with (a plain \1 followed by a digit would read as another group).
    return widont_finder.sub(rf"\g<1>{spacer}\g<2>", text)