44import re
55from collections .abc import Callable , Iterable
66from datetime import date , datetime
7- from typing import Any
7+ from typing import Any , cast
88from urllib .parse import urlsplit
99
1010import babel
1111import babel .core
1212import babel .dates
1313import babel .numbers
14+ import genshi
15+ import genshi .filters
1416import web
1517from babel .core import Locale
16-
17- try :
18- import genshi
19- import genshi .filters
20- except ImportError :
21- genshi = None
22-
23- try :
24- from bs4 import BeautifulSoup
25- except ImportError :
26- BeautifulSoup = None
18+ from bs4 import BeautifulSoup
2719
2820from infogami import config
2921from infogami .infobase .client import Nothing
6052__docformat__ = "restructuredtext en"
6153
6254
63- def sanitize (html : str , encoding : str = 'utf8' ) -> str :
55+ def sanitize (html : str , encoding : str = 'utf8' , beautify : bool = True ) -> str :
6456 """Removes unsafe tags and attributes from html and adds
6557 ``rel="nofollow"`` attribute to all external links.
6658 Using encoding=None if passing Unicode strings.
6759 encoding="utf8" matches default format for earlier versions of Genshi
6860 https://genshi.readthedocs.io/en/latest/upgrade/#upgrading-from-genshi-0-6-x-to-the-development-version
6961 """
7062
71- # Can't sanitize unless genshi module is available
72- if genshi is None :
73- return html
74-
7563 def get_nofollow (name , event ):
7664 attrs = event [1 ][1 ]
7765
@@ -82,29 +70,25 @@ def get_nofollow(name, event):
8270 return 'nofollow'
8371
8472 try :
85- html = genshi .HTML (html , encoding = encoding )
73+ html_stream = genshi .HTML (html , encoding = encoding )
8674
8775 # except (genshi.ParseError, UnicodeDecodeError, UnicodeError) as e:
8876 # don't catch Unicode errors so we can tell if we're getting bytes
8977 except genshi .ParseError :
90- if BeautifulSoup :
91- # Bad html. Tidy it up using BeautifulSoup
78+ # Bad html. Tidy it up using BeautifulSoup
79+ if beautify :
9280 html = str (BeautifulSoup (html , "lxml" ))
93- try :
94- html = genshi .HTML (html )
95- except Exception :
96- # Failed to sanitize.
97- # We can't do any better than returning the original HTML, without sanitizing.
98- return html
81+ # Avoid infinite recursion by disabling beautify on the next call
82+ return sanitize (html , encoding = encoding , beautify = False )
9983 else :
10084 raise
10185
10286 stream = (
103- html
87+ html_stream
10488 | genshi .filters .HTMLSanitizer ()
10589 | genshi .filters .Transformer ("//a" ).attr ("rel" , get_nofollow )
10690 )
107- return stream .render ()
91+ return cast ( str , stream .render () )
10892
10993
11094class NothingEncoder (json .JSONEncoder ):
0 commit comments