Source code for tim_common.html_sanitize

import re
from typing import Any

import lxml
import lxml.etree
from lxml.etree import ParserError
from lxml.html import tostring, fragment_fromstring, document_fromstring
from lxml.html.clean import Cleaner

    # plugin components:
    # raw AngularJS components:

    "*": ["class", "id", "align"],
    "video": ["src", "controls"],
    "abbr": ["title"],
    "acronym": ["title"],
    "img": ["src", "width", "height"],
    "a": ["href", "title", "target"],

TIM_SAFE_ATTRS = frozenset(
        # tim-rights-editor
        # tim-self-expire
        # tim-table
        # tim-add-member
        # tim-goto-link
        # viz and vars:
        # tim-message-list-admin
        # tim-archive-header
        # tim-archive-footer
        # tim-message-send
        # tim-search-button

c_no_style = Cleaner(

c_with_styles = Cleaner(
    allow_tags=TIM_SAFE_TAGS + ["style"],

# NOTE: lxml cleaner is a LOT faster than bleach.
def sanitize_html(html_string: str, allow_styles: bool = False) -> str:
    """
    Sanitizes an HTML string with one of the two module-level cleaners.

    :param html_string: HTML to sanitize
    :param allow_styles: If True, the cleaner that additionally allows the
        ``<style>`` tag (``c_with_styles``) is used; otherwise ``c_no_style``.
    :return: The result of ``sanitize_with_cleaner`` for the chosen cleaner
    """
    if allow_styles:
        return sanitize_with_cleaner(html_string, c_with_styles)
    return sanitize_with_cleaner(html_string, c_no_style)
# Taken from LXML: matches strings that start (after optional whitespace)
# with an <html> tag or a doctype declaration, ignoring case.
_FULL_HTML_PATTERN = re.compile(r"^\s*<(?:html|!doctype)", re.IGNORECASE)
looks_like_full_html = _FULL_HTML_PATTERN.match

# NOTE: lxml now always removes data:image/svg+xml URLs because of possible XSS.
# However, in TIM, data URLs are used for plugins and Tex2SVG math content.
# In our case, we can generally be pretty sure that the data URL contains only SVG.
# Moreover, scripts embedded in SVG as data URLs are not executed unless the user
# opens the image, which is enough for our use case.
# Because lxml does not provide a switch to disable removing data URLs
# (or even sanitizing them), we have to do it manually: the SVG media type is
# swapped for a placeholder before cleaning and swapped back afterwards.
_DATA_SVG_PATTERN = re.compile(r"data:image/svg\+xml;base64,", re.IGNORECASE)
_DATA_ESCAPED_PATTERN = re.compile(r"data:image/escaped;base64,", re.IGNORECASE)
replace_data_svg = _DATA_SVG_PATTERN.sub
replace_data_escaped = _DATA_ESCAPED_PATTERN.sub
def escape_data_svg(svg_string: str) -> str:
    """
    Converts data:image/svg+xml;base64, to data:image/escaped;base64,

    Every occurrence is replaced; matching ignores case.

    :param svg_string: String that may contain SVG data URLs
    :return: The string with the SVG media type swapped for the placeholder
    """
    escaped_prefix = "data:image/escaped;base64,"
    return replace_data_svg(escaped_prefix, svg_string)
def unescape_data_svg(svg_string: str) -> str:
    """
    Converts data:image/escaped;base64, back to data:image/svg+xml;base64,

    Every occurrence is replaced; matching ignores case.

    :param svg_string: String that may contain escaped SVG data URLs
    :return: The string with the placeholder swapped back to the SVG media type
    """
    svg_prefix = "data:image/svg+xml;base64,"
    return replace_data_escaped(svg_prefix, svg_string)
def fromstring(html_string: str) -> Any:
    """
    Parses a string into an LXML document or element.

    Unlike LXML's fromstring, this picks document_fromstring or
    fragment_fromstring depending on whether the string looks like a full
    document or just a fragment. A fragment that cannot be parsed as a single
    element is parsed again wrapped in a parent ``<div>``.

    :param html_string: String to parse
    :return: An LXML document or element
    """
    if not looks_like_full_html(html_string):
        try:
            return fragment_fromstring(html_string)
        except (ParserError, TypeError):
            # TypeError is a hack to deal with a bug in lxml
            return fragment_fromstring(html_string, create_parent="div")
    return document_fromstring(html_string)
def sanitize_with_cleaner(html_string: str, cleaner: Cleaner) -> str:
    """
    Sanitizes an HTML string with the given cleaner.

    SVG data URLs are escaped before parsing and restored after cleaning.
    The cleaned tree is serialized with the ASCII encoding, and a single
    wrapping ``<div>`` is removed from the result by ``strip_div``.

    :param html_string: HTML to sanitize
    :param cleaner: Cleaner that is applied to the parsed tree in place
    :return: Sanitized HTML, or an empty string if parsing or cleaning fails
        with one of the errors handled below
    """
    try:
        doc = fromstring(escape_data_svg(html_string))
        cleaner(doc)
        serialized = tostring(doc, encoding="ascii").decode("ascii")
        return strip_div(unescape_data_svg(serialized))
    except (
        lxml.etree.ParserError,  # Thrown if the HTML string is empty
        lxml.etree.XMLSyntaxError,  # Not yet sure why thrown
        ValueError,  # Thrown if XML has an encoding declaration
    ):
        return ""
def strip_div(s: str) -> str:
    """
    Removes one outer attribute-less ``<div>``...``</div>`` wrapper.

    :param s: String to unwrap
    :return: The text between the leading ``<div>`` and the trailing
        ``</div>`` when both are present; otherwise the string unchanged
    """
    opening, closing = "<div>", "</div>"
    if not (s.startswith(opening) and s.endswith(closing)):
        return s
    return s[len(opening) : -len(closing)]
# Matches the "<" that opens an <html ...> tag, in any letter case.
# The lookahead keeps the tag name itself out of the match so that its
# original casing is preserved in the output.
_HTML_TAG_OPEN = re.compile(r"<(?=html)", re.IGNORECASE)


def presanitize_html_body(html_string: str) -> str:
    """
    Apply basic <html> tag sanitization.

    This may be needed in cases where user-given yet un-sanitized HTML is
    parsed by lxml before proper sanitization.

    HTML tag names are case-insensitive, so the opening bracket is escaped
    for every casing of the tag (``<html``, ``<HTML``, ``<Html``, ...).
    Closing ``</html>`` tags are left as they are.

    :param html_string: HTML to sanitize
    :return: HTML string with <html> tag sanitized in a basic way for LXML to parse it
    """
    return _HTML_TAG_OPEN.sub("&lt;", html_string)