Source code for timApp.document.yamlblock

import re
from copy import deepcopy
from enum import Enum
from textwrap import shorten
from typing import Generator

import yaml
from yaml import YAMLError, CSafeLoader, Event, AliasEvent, NodeEvent

from timApp.util.utils import count_chars_from_beginning


[docs]class BlockEndMissingError(YAMLError):
    def __init__(self, end_str: str) -> None:
        super().__init__(f"Missing multiline terminator: {end_str}")
        self.end_str = end_str


[docs]class DuplicateKeyMergeHintError(YAMLError):
    def __init__(self, key: str):
        super().__init__(
            f'Using merge hints in a key ("{key}") having same name in different levels is not currently supported'
        )


[docs]class InvalidIndentError(YAMLError):
    def __init__(self, line: str):
        super().__init__(
            f'The line "{shorten(line, width=30, placeholder="...")}" '
            f"must be indented at least as much as the first line."
        )


[docs]class MergeStyle(Enum):
    Replace = "r"
    Append = "a"
    ReplaceIfNotExist = "r?"


YamlMergeInfo = dict[str, MergeStyle]

yaml_loader = CSafeLoader


[docs]class YamlBlock:
    def __init__(self, values: dict = None, merge_hints: YamlMergeInfo | None = None):
        self.values = values if values is not None else {}
        self.merge_hints = merge_hints

    def __eq__(self, o: object) -> bool:
        if isinstance(o, self.__class__):
            return self.__dict__ == o.__dict__
        elif isinstance(o, dict):
            return self.values == o
        return NotImplemented

    def __repr__(self):
        return f"{self.__class__.__name__}({self.__dict__})"

    def __setitem__(self, key: str, value):
        self.values.__setitem__(key, value)

    def __getitem__(self, item: str):
        return self.values.__getitem__(item)

[docs]    def get(self, key: str, default=None):
        return self.values.get(key, default)

[docs]    @staticmethod
    def from_markdown(md: str):
        md = strip_code_block(md)
        values, hints = parse_yaml(md)
        return YamlBlock(values=values, merge_hints=hints)

[docs]    def merge_with(self, other: "YamlBlock"):
        new_vals = deepcopy(self.values)
        merge(new_vals, other.values, other.merge_hints)
        return YamlBlock(values=new_vals, merge_hints=other.merge_hints)

[docs]    def to_markdown(self):
        return yaml.dump(self.values, default_flow_style=False)


missing_space_after_colon = re.compile(
    "^[ \t]*[^ :[\\]()'\"]*:[^ /=:]"
)  # kissa:istuu mutta ei http://koti tai a:=5
multiline_unindented_string = re.compile(
    r"""^( *)([^ :"']+): *(\|[+-]?)([0-9]*) *([^ 0-9+-]+[^ ]*)( (a|r|r\?))? *$"""
)  # program: ||| or  program: |!!!
normal_multiline_indented_string = re.compile(
    """^( *)([^ :"']+): *([|>][+-]?)([0-9]*) *$"""
)  # program: | or  program: |+2
multiline_unindented_obj_string = re.compile(
    """^( *)([^ :"']+): *@ *([^ 0-9+-]+[^ ]*)$"""
)  # object: @|| or  object: @!!!


[docs]def strip_code_block(md: str) -> str:
    code_block_marker = get_code_block_str(md)
    if len(code_block_marker) < 3:
        return md
    # Strip only after verifying we have a code block, otherwise we might break YAML spacing
    md = md.strip()
    return md.split("\n", 1)[1].rstrip(f"\n{code_block_marker}")


[docs]def get_code_block_str(md: str) -> str:
    md = md.lstrip()  # Strip to ensure correct count
    code_block_marker = "`" * count_chars_from_beginning(md, "`")
    return code_block_marker


[docs]def compare_same(s1: str, s2: str, n: int) -> bool:
    """
    :param s1: string than can contain max n spaces at the begining
    :param s2: string to oompare, no spaces in the begining
    :param n: how many spaces allowed to caintain still to be same
    :return: True is same False other
    """
    if not n:
        n = 0
    i = s1.find(s2)
    if i < 0 or i > n:  # No match or too far
        return False
    if i + len(s2) != len(s1):  # The end part is not exactly same
        return False
    return count_chars_from_beginning(s1, " ") <= n


[docs]def correct_obj(text: str) -> str:
    """
    Also gives an other way to write unindented object attributes, by starting
    the attribute like: `object: @!!`  (`!!` could any combinations of chars except space
    and ending it by `!!` in first column.

    :param text: Text to convert to proper yaml.
    :return: Text that is proper yaml obj
    """
    """
     Problem analyze:
        il = atribute line indent len,   fi = first line indent
        indent = string that must be inserted to every line, len = it's lenght

        |a1: @!            il = 0, fi = 0     il-fi+1  = 1 
        |a            => indent=" "   len 1 

        |a1: @!            il = 0, fi = 4     il-fi+1  = -3 
        |    a        => indent=""    len 0

        |a1: @!            il = 0, fi = 1     il-fi+1  = 2
        | a           => indent=""    len 0

        | a1: @!           il = 1  fi = 1     il-fi+1  = 2
        | a           => indent=" "   len 1

        |  a1: @!          il = 2  fi = 0     il-fi+1  = 2
        |a            => indent="   " len 3

        |  a1: @!          il = 2  fi = 1     il-fi+1  = 2  
        | a           => indent="  "  len 2

    Note: This is almost the same as correct_yaml-body but no hint handling.
    It may be possible to put them just one code but then it is even more difficult to
    understand what is happening.  One small difference is that in this code we look for @
    and do not put it back as we do with | in correct_yaml.

    And the idea is to run this as many times as there is no changes any more in object level.
    The objects could be nested, string can not be.  There is still a problem that if string
    includes object start, this function will destroy it, f.ex:

        |a1: |!!
        |cat
        |  o1: @!
        |a:1
        |!
        |!!

    even should be    

        |a1: |
        | cat
        |   o1: @!
        | a:1

    There are testcases for all these things in timApp/tests/unit/test_correct_yaml.py

    """
    # don't use splitlines here - it loses the possible last trailing newline character, and we don't want that.
    while True:  # repeat until n == 0
        lines = text.split("\n")
        n = 0  # count how many unindented object block found and how many handled
        s = ""
        multiline = False
        multiline_string = False
        end_str = ""
        indent = None
        multiline_first_indent = None
        original_indent_len = 0
        max_allowed_spaces = 0
        lf = ""
        for line in lines:
            line = line.rstrip()
            r2 = multiline_unindented_string.match(line)
            if (
                r2 and not multiline
            ):  # we have multiline string and we do nothing until it ends
                end_str = r2.group(5)
                indent = r2.group(1)
                max_allowed_spaces = original_indent_len = len(indent)
                indent = ""  # no changes while in multilinestring
                multiline_string = multiline = True
                multiline_first_indent = None
                s = s + lf + line
                lf = "\n"
                continue
            else:
                r = multiline_unindented_obj_string.match(line)
                if r and not multiline_string:
                    n += 1
                if r and not multiline:
                    end_str = r.group(3)
                    indent = r.group(1)
                    max_allowed_spaces = original_indent_len = len(indent)
                    multiline = True
                    multiline_first_indent = None
                    line, _ = line.split("@", 1)
                    s = s + lf + line.rstrip()
                    lf = "\n"
                    continue
            if multiline:
                if compare_same(line, end_str, max_allowed_spaces):
                    multiline = False
                    if multiline_string:
                        s = s + lf + line
                        multiline_string = False
                    else:  # one more unindented object handled
                        n -= 1
                    continue
                if multiline_first_indent is None:
                    multiline_first_indent = count_chars_from_beginning(line, " ")
                    needed_indent_length = max(
                        original_indent_len - multiline_first_indent + 1, 0
                    )
                    if (
                        not multiline_string
                    ):  # we do not touch multiline strings in this function
                        indent = " " * needed_indent_length
                    max_allowed_spaces = original_indent_len + multiline_first_indent
                else:
                    if line and multiline_first_indent > count_chars_from_beginning(
                        line, " "
                    ):
                        raise InvalidIndentError(line)
                line = indent + line
            s = s + lf + line
            lf = "\n"
        if multiline:
            raise BlockEndMissingError(end_str)
        if n == 0:
            return s
        text = s
    # until n = 0


[docs]def correct_yaml(text: str) -> tuple[str, YamlMergeInfo]:
    """Inserts missing spaces after `:` Like  `width:20` => `width: 20`
    Also gives an other way to write multiline attributes, by starting
    the multiline like: `program: |!!`  (`!!` could any combinations of chars except space
    and ending it by `!!` in first column.

    :param text: Text to convert to proper yaml.
    :return: Text that is proper yaml.
    """
    # don't use splitlines here - it loses the possible last trailing newline character, and we don't want that.
    if (
        text.find(":@") >= 0
    ):  # we suppose that using this is so rare that it is cheaper to avoid
        text = correct_obj(text)  # this call as much as possible
    lines = text.split("\n")
    s = ""
    multiline = False
    end_str = ""
    indent = None
    merge_hints = {}
    encountered_keys = set()
    multiline_first_indent = None
    original_indent_len = 0
    max_allowed_spaces = 0
    lf = ""
    end_match = None
    for line in lines:
        line = line.rstrip()
        if end_match:  # to protect : space insertion while in normal | or >
            if not end_match.match(line):
                s = s + lf + line
                lf = "\n"
                continue
            end_match = None
        if missing_space_after_colon.match(line) and not multiline:
            line = line.replace(":", ": ", 1)
        r = normal_multiline_indented_string.match(line)
        if r and not multiline:
            indent = r.group(1)
            spacereg = ""
            spaces = len(indent)
            if spaces != 0:
                spacereg = " {0," + str(spaces) + "}"
            em = "^" + spacereg + "[^ ]+.*$"
            end_match = re.compile(em)
            s = s + lf + line
            lf = "\n"
            continue
        r = multiline_unindented_string.match(line)
        if r and not multiline:
            end_str = r.group(5)
            indent = r.group(1)
            multiline_first_indent = None
            fls = 0
            if r.group(4):
                fls = int(r.group(4))
                multiline_first_indent = 0
                needed_indent_length = max(original_indent_len - fls + 1, 0)
                indent = " " * needed_indent_length
            original_indent_len = len(indent)
            max_allowed_spaces = original_indent_len + fls
            multiline = True
            line, _ = line.split("|", 1)
            key = r.group(2)
            hint = r.group(7)
            if hint in ("a", "r", "r?"):
                if key in encountered_keys:
                    raise DuplicateKeyMergeHintError(key)
                merge_hints[key] = MergeStyle(hint)
            s = s + lf + line + r.group(3) + r.group(4)
            lf = "\n"
            encountered_keys.add(key)
            continue
        if multiline:
            if compare_same(line, end_str, max_allowed_spaces):
                multiline = False
                continue
            if multiline_first_indent is None:
                multiline_first_indent = count_chars_from_beginning(line, " ")
                needed_indent_length = max(
                    original_indent_len - multiline_first_indent + 1, 0
                )
                indent = " " * needed_indent_length
                max_allowed_spaces = original_indent_len + multiline_first_indent
            else:
                if line and multiline_first_indent > count_chars_from_beginning(
                    line, " "
                ):
                    raise InvalidIndentError(line)
            line = indent + line
        else:
            key = line.split(":", 1)[0].strip()
            if key:
                if key in encountered_keys and key in merge_hints:
                    raise DuplicateKeyMergeHintError(key)
                encountered_keys.add(key)
        s = s + lf + line
        lf = "\n"
    if multiline:
        raise BlockEndMissingError(end_str)
    return s, merge_hints


[docs]def verify_anchor_depth(text: str, max_depth=3) -> None:
    """
    Verifies that the given YAML file does not include too deep anchor references.

    Anchor references can be used for quadratic growth DoS attacks when YAML is being iterated through.
    The method verifies that the maximum reference depth is within the provided value. Default max depth is 3.

    If YAML includes deep references, YAMLError is thrown.

    :param text: YAML to check
    :param max_depth: Maximum anchor reference depth
    """
    parser: Generator[Event] = yaml.parse(text, yaml_loader)
    context_depths = {}
    current_context = None
    for p in parser:
        if isinstance(p, AliasEvent) and current_context:
            depths = context_depths[current_context]
            if p.anchor not in depths:
                depth = max([*context_depths[p.anchor].values(), 0]) + 1
                if depth > max_depth:
                    raise YAMLError("Markup includes too deep anchor references")
                depths[p.anchor] = depth
            continue
        if isinstance(p, NodeEvent) and p.anchor is not None:
            context_depths[p.anchor] = {}
            current_context = p.anchor


[docs]def parse_yaml(text: str) -> tuple[dict, YamlMergeInfo]:
    """Parses the specified text as (customized) YAML.

    :param text: The text to parse.
    :return: The parsed YAML as a dict.
    """

    text, hints = correct_yaml(text)
    verify_anchor_depth(text)

    values = yaml.load(text, yaml_loader)
    if isinstance(values, str):
        raise YAMLError("Markup must not be a mere string.")
    # empty YAML is equal to null, so we avoid that by returning {} in that case
    return (values or {}), hints


[docs]def merge(a: dict, b: dict, merge_info: YamlMergeInfo | None = None):
    """Merges two dictionaries recursively. Stores the result in the first dictionary.

    :param merge_info: The merge hints to use while merging.
    :param a: The first dictionary.
    :param b: The second dictionary.

    """
    return __merge_helper(a, b, 0, merge_info=merge_info)


default_append_keys = {"css", "themes"}


def __merge_helper(
    a: dict, b: dict, depth: int = 0, merge_info: YamlMergeInfo | None = None
):
    for key in b:
        if key in a:
            if isinstance(a[key], dict) and isinstance(b[key], dict):
                __merge_helper(a[key], b[key], depth + 1, merge_info)
            elif a[key] == b[key]:
                pass
            elif type(a[key]) != type(b[key]):
                a[key] = b[key]
            else:
                m = (
                    MergeStyle.Append
                    if key in default_append_keys
                    else MergeStyle.Replace
                )
                if merge_info:
                    m = merge_info.get(key, m)
                if m == MergeStyle.Replace:
                    a[key] = b[key]
                elif m == MergeStyle.Append:
                    a[key] += b[key]
        else:
            a[key] = b[key]
TIM documentation

Source code for timApp.document.yamlblock