"""
Functions for HTML sanitization used by csplugin.
"""
from typing import Optional
import html5lib
from html5lib.constants import namespaces
from html5lib.filters import sanitizer
from html5lib.filters.sanitizer import (
allowed_elements,
allowed_attributes,
allowed_css_properties,
)
from html5lib.serializer import HTMLSerializer
from lxml.html.clean import Cleaner
from tim_common.html_sanitize import sanitize_html, sanitize_with_cleaner
# Tag names handed to ``cs_cleaner`` as its ``allow_tags`` list:
# inline formatting, links and list markup.
cs_allowed_tags = [
    "em", "strong", "tt",
    "a", "b", "code",
    "i", "kbd", "span",
    "li", "ul", "ol",
]
# Attribute names handed to ``cs_cleaner`` as its ``safe_attrs`` set.
cs_allowed_attrs = frozenset(("href", "class"))
# lxml cleaner used by ``allow_minimal``: restricted to the tag and
# attribute allow-lists defined above.
cs_cleaner = Cleaner(
    # Allow-lists.
    allow_tags=cs_allowed_tags,
    safe_attrs=cs_allowed_attrs,
    # NOTE(review): lxml appears to reject ``allow_tags`` combined with
    # ``remove_unknown_tags=True`` -- confirm before changing this flag.
    remove_unknown_tags=False,
    # Explicitly switched-off cleaner features.
    comments=False,
    forms=False,
)
[docs]def allow_minimal(s: str | None) -> str | None:
if not s:
return s
return sanitize_with_cleaner(s, cs_cleaner)
# html5lib's default element allow-list extended with extra SVG elements
# (filter primitives, <filter> and <style>), each keyed by the SVG namespace.
cs_svg_tim_allowed_elements = set(allowed_elements) | {
    (namespaces["svg"], tag)
    for tag in (
        "feBlend",
        "feColorMatrix",
        "feGaussianBlur",
        "feOffset",
        "filter",
        "style",
    )
}
# html5lib's default attribute allow-list extended with extra
# un-namespaced (namespace ``None``) attributes.
cs_svg_allowed_attributes = set(allowed_attributes) | {
    (None, attr)
    for attr in (
        "filter",
        "in",
        "in2",
        "lengthAdjust",
        "mode",
        "result",
        "stdDeviation",
        "textLength",
    )
}
# html5lib's default CSS property allow-list plus ``stroke-dasharray``.
tim_allowed_css_props = set(allowed_css_properties) | {"stroke-dasharray"}
def svg_sanitize(s: str) -> str:
    """
    Sanitize a markup fragment with html5lib's sanitizer filter.

    The fragment is parsed into a DOM tree, streamed through
    ``sanitizer.Filter`` configured with the module-level allow-lists
    (``cs_svg_tim_allowed_elements``, ``cs_svg_allowed_attributes`` and
    ``tim_allowed_css_props``) and serialized back to a string.

    :param s: Markup fragment to sanitize.
    :return: The filtered markup, serialized with attribute values always quoted.
    """
    dom = html5lib.parseFragment(s, treebuilder="dom")
    walker = html5lib.getTreeWalker("dom")
    filtered = sanitizer.Filter(
        walker(dom),
        allowed_elements=cs_svg_tim_allowed_elements,
        allowed_attributes=cs_svg_allowed_attributes,
        allowed_css_properties=tim_allowed_css_props,
    )
    serializer = HTMLSerializer(quote_attr_values="always")
    return serializer.render(filtered)
def cs_min_sanitize(s: str) -> str:
    """
    Minimally sanitize a string for output as HTML.

    Input containing ``<svg`` is passed through ``svg_sanitize``. Any other
    input has its angle brackets replaced by character references so that
    no markup in it is interpreted as a tag.

    :param s: Text or markup to sanitize.
    :return: The sanitized SVG markup, or ``s`` with ``<`` and ``>`` escaped
             as ``&lt;`` and ``&gt;``.
    """
    if "<svg" in s:
        return svg_sanitize(s)
    # Escape angle brackets; replacing "<" with "<" would leave tags intact.
    return s.replace("<", "&lt;").replace(">", "&gt;")
[docs]def tim_sanitize(s: str | None) -> str | None:
if not s:
return s
return sanitize_html(s)