# Source code for timApp.document.documentparser

import re
from typing import Optional

from timApp.document.attributeparser import AttributeParser
from timApp.document.documentparseroptions import DocumentParserOptions
from timApp.document.randutils import hashfunc, random_id, is_valid_id
from timApp.document.validationresult import (
    ValidationResult,
    AttributesAtEndOfCodeBlock,
    DuplicateParagraphId,
    InvalidParagraphId,
    DuplicateTaskId,
    MultipleAreasWithSameName,
    ZeroLengthArea,
    OverlappingClassedArea,
    AreaEndWithoutStart,
    DuplicateAreaEnd,
    AreaWithoutEnd,
)
from timApp.util.utils import count_chars_from_beginning


class DocReader:
    """Forward-only cursor over a list of lines with one-line lookahead.

    :ivar lines: The lines being read.
    :ivar current_line: Zero-based index of the line that will be returned
        by the next call to :meth:`peek_line` or
        :meth:`get_line_and_advance`.
    """

    def __init__(self, lines, i=0):
        """
        :param lines: The lines to read.
        :type lines: list[str]
        :param i: Index of the line at which reading starts.
        :type i: int
        """
        self.lines = lines
        self.current_line = i

    def peek_line(self):
        """Return the current line without moving the cursor.

        Callers are expected to check :meth:`has_more_lines` first; indexing
        past the end of ``lines`` is not guarded here.

        :rtype: str
        :return: The line at ``current_line``.
        """
        return self.lines[self.current_line]

    def get_line_and_advance(self):
        """Return the current line and move the cursor to the next one.

        :rtype: str
        :return: The line that was current before advancing.
        """
        result = self.peek_line()
        self.current_line += 1
        return result

    def has_more_lines(self):
        """Tell whether the cursor still points at an existing line.

        :rtype: bool
        """
        return self.current_line < len(self.lines)


class DocumentParser:
    """Splits documents into paragraphs.

    Every parsed paragraph ("block") is a dict that always has the keys
    ``md`` (the paragraph text), ``type`` (one of ``"code"``, ``"atom"``,
    ``"header"``, ``"normal"`` or ``"autonormal"``) and ``attrs`` (a dict of
    the remaining attributes, possibly empty). The attributes ``id`` and
    ``t`` are stored as top-level keys of the block when present.

    :type _blocks: list[dict] | None
    :type _doc_text: str
    :type _last_setting: DocumentParserOptions | None

    """

    def __init__(self, doc_text="", options: DocumentParserOptions | None = None):
        """
        :param doc_text: The full text of the document to parse.
        :type doc_text: str
        :param options: Parsing options; a default-constructed
            ``DocumentParserOptions`` is used when omitted.
        """
        self._doc_text = doc_text
        self._blocks = None
        self._break_on_empty_line = False
        # The options the current ``_blocks`` were parsed with; used to skip
        # re-parsing when nothing changed.
        self._last_setting: DocumentParserOptions | None = None
        self.options: DocumentParserOptions = (
            options if options is not None else DocumentParserOptions()
        )

    def get_blocks(self):
        """Parse the document if needed and return its blocks.

        :rtype: list[dict]
        :return: The parser's internal block list (not a copy).
        """
        self._parse_document()
        return self._blocks

    def add_missing_attributes(
        self, hash_func=hashfunc, id_func=random_id, force_new_ids=False
    ):
        """Recompute the ``t`` hash of every block and fill in missing ids.

        :param hash_func: Called as ``hash_func(md, attrs)`` to produce ``t``.
        :param id_func: Called without arguments to produce a new id.
        :param force_new_ids: If true, every block gets a new id even if it
            already has one.
        :return: This parser, to allow call chaining.
        """
        self._parse_document()
        for block in self._blocks:
            block["t"] = hash_func(block["md"], block["attrs"])
            if force_new_ids or not block.get("id"):
                block["id"] = id_func()
        return self

    @staticmethod
    def _code_block_ends_with_attributes(md):
        """Tell whether the closing fence line of a code block has attributes.

        :param md: Markdown of a block whose type is ``"code"``.
        :type md: str
        :rtype: bool
        """
        try:
            # rindex raises ValueError for a single-line block; that case is
            # treated as "no attributes" by the handler below.
            last_line = md[md.rindex("\n") + 1 :]
            num_ticks = count_chars_from_beginning(md, "`")
            if last_line.startswith("`" * num_ticks):
                _, start_index = AttributeParser(last_line).get_attributes()
                return start_index is not None
        except ValueError:
            pass
        return False

    def validate_structure(self) -> ValidationResult:
        """Check the parsed blocks for structural problems.

        The checks cover attributes on the closing line of a code block,
        duplicate or invalid paragraph ids, duplicate task ids, and
        inconsistencies between ``area`` and ``area_end`` attributes.

        :return: A ``ValidationResult`` holding every issue that was found.
        """
        self._parse_document()
        found_ids = set()
        found_tasks = set()
        found_areas = set()
        # Names of started areas that have classes, in the order they started.
        classed_areas = []
        found_area_ends = set()
        result = ValidationResult()
        for block in self._blocks:
            if block["type"] == "code" and self._code_block_ends_with_attributes(
                block["md"]
            ):
                result.add_issue(AttributesAtEndOfCodeBlock(block.get("id")))

            curr_id = block.get("id")
            if curr_id is not None:
                if curr_id in found_ids:
                    result.add_issue(DuplicateParagraphId(curr_id))
                found_ids.add(curr_id)
                if not is_valid_id(curr_id):
                    result.add_issue(InvalidParagraphId(curr_id))

            attrs = block.get("attrs", {})

            task_id = attrs.get("taskId")
            if task_id:
                if task_id in found_tasks:
                    result.add_issue(DuplicateTaskId(curr_id, task_id))
                found_tasks.add(task_id)

            area = attrs.get("area")
            if area:
                if area in found_areas:
                    result.add_issue(MultipleAreasWithSameName(curr_id, area))
                if len(attrs.get("classes", [])) > 0:
                    classed_areas.append(area)
                found_areas.add(area)

            area_end = attrs.get("area_end")
            if area_end:
                if area_end == area:
                    result.add_issue(ZeroLengthArea(curr_id, area))
                if area_end in classed_areas:
                    if area_end != classed_areas[-1]:
                        result.add_issue(
                            OverlappingClassedArea(curr_id, classed_areas[-1], area_end)
                        )
                    # NOTE(review): this removes the most recently started
                    # classed area even when it is not ``area_end`` - confirm
                    # that this is the intended bookkeeping for overlaps.
                    classed_areas.pop()
                # Report the name of the area being ended. The paragraph's own
                # ``area`` attribute is usually absent on an area-end
                # paragraph, so it cannot be used to name the problem.
                if area_end not in found_areas:
                    result.add_issue(AreaEndWithoutStart(curr_id, area_end))
                if area_end in found_area_ends:
                    result.add_issue(DuplicateAreaEnd(curr_id, area_end))
                found_area_ends.add(area_end)

        for unended_area in found_areas - found_area_ends:
            result.add_issue(
                AreaWithoutEnd(None, unended_area)
            )  # TODO get the par id of the start
        return result

    def _merges_into_previous(self, block, options):
        """Tell whether a freshly parsed block continues the previous block.

        A block is merged only if the option for its type disables breaking,
        it has no attributes of its own, and there is a previous block that
        is neither a plugin nor an atom.

        :param block: The block that was just parsed.
        :type block: dict
        :param options: The options of the current parsing run.
        :rtype: bool
        """
        block_type = block["type"]
        if block_type == "code":
            breaks = options.break_on_code_block
        elif block_type == "header":
            breaks = options.break_on_header
        elif block_type == "autonormal":
            breaks = options.break_on_normal
        else:
            # "normal" and "atom" blocks always start a new paragraph.
            return False
        if breaks or block.get("attrs") or len(self._blocks) == 0:
            return False
        previous = self._blocks[-1]
        if previous.get("attrs", {}).get("plugin"):
            return False
        return previous["type"] != "atom"

    def _parse_document(self):
        """Split the document text into blocks and store them in ``_blocks``.

        Does nothing if the document was already parsed with options equal to
        the current ones.
        """
        if self._last_setting == self.options:
            return
        self._blocks = []
        options = self.options
        self._break_on_empty_line = options.break_on_empty_line
        self._last_setting = options
        doc = DocReader(self._doc_text.splitlines())
        # Tried in order; parse_normal_block is the fallback and always
        # produces a block.
        parsers = [
            self.try_parse_code_block,
            self.try_parse_header_block,
            self.parse_normal_block,
        ]
        while True:
            self.eat_whitespace(doc)
            if not doc.has_more_lines():
                break
            for parse in parsers:
                result = parse(doc)
                if not result:
                    continue
                result["md"] = result["md"].rstrip().strip("\r\n")
                if self._merges_into_previous(result, options):
                    self._blocks[-1]["md"] += "\n\n" + result["md"]
                else:
                    if not result.get("attrs"):
                        result["attrs"] = {}
                    self._blocks.append(result)
                break

    def is_beginning_of_code_block(self, doc):
        """Check whether the current line opens a fenced code block.

        :type doc: DocReader
        :rtype: tuple
        :return: ``(True, marker)`` where ``marker`` is the full run of
            backticks or tildes at the start of the line, or
            ``(False, None)`` if the line does not start with three of them.
        """
        current = doc.peek_line()
        if current.startswith("```"):
            code_start_char = "`"
        elif current.startswith("~~~"):
            code_start_char = "~"
        else:
            return False, None
        marker = re.match("^" + code_start_char + "+", current).group(0)
        return True, marker

    def is_beginning_of_header_block(self, doc):
        """Check whether the current line starts with ``#``.

        :type doc: DocReader
        :rtype: bool
        """
        return doc.peek_line().startswith("#")

    def is_empty_line(self, doc):
        """Check whether the current line is empty or whitespace only.

        :type doc: DocReader
        :rtype: bool
        """
        current = doc.peek_line()
        return current.isspace() or current == ""

    def try_parse_code_block(self, doc):
        """Parse a fenced code block starting at the current line, if any.

        If the opening line carries the ``atom`` attribute, the result has
        type ``"atom"`` and its markdown excludes the fence lines; otherwise
        the result has type ``"code"`` and the fences are kept, with a missing
        or incomplete closing fence filled in.

        :type doc: DocReader
        :rtype: dict
        :return: The parsed block, or ``None`` if the current line does not
            open a code block.
        """
        is_code_block, code_block_marker = self.is_beginning_of_code_block(doc)
        if not is_code_block:
            return None
        start_line = doc.get_line_and_advance()
        block_lines = []
        tokens, start = AttributeParser(start_line).get_attributes()
        is_atom = tokens.get("atom", False)
        if is_atom:
            tokens.pop("atom")
        else:
            # Keep the opening fence, without the attribute part of the line.
            block_lines.append(start_line[:start].strip())

        # Collect lines up to (but not including) the closing fence. ``line``
        # stays None only if the document ended right after the opening line.
        line = None
        while doc.has_more_lines():
            line = doc.get_line_and_advance()
            if line.startswith(code_block_marker):
                break
            block_lines.append(line)

        if not is_atom:
            if line is None:
                # If the document ended abruptly, we insert the code block end marker automatically
                block_lines.append(code_block_marker)
            elif line.startswith(code_block_marker):
                block_lines.append(line)
            else:
                # Fill an incomplete code block end marker if needed.
                # For example, the paragraph
                #
                # ```
                # a
                # `
                #
                # becomes
                #
                # ```
                # a
                # ```
                #
                single_mark = code_block_marker[0]
                last_line_code_chars = count_chars_from_beginning(
                    block_lines[-1], single_mark
                )
                if (last_line_code_chars > 0 or len(line) == 0) and len(
                    line.strip()
                ) == last_line_code_chars:
                    block_lines[-1] = (
                        single_mark * (len(code_block_marker) - last_line_code_chars)
                        + block_lines[-1]
                    )
                else:
                    block_lines.append(code_block_marker)

        result = {"md": "\n".join(block_lines), "type": "atom" if is_atom else "code"}
        self.extract_attrs(result, tokens)
        return result

    def try_parse_header_block(self, doc):
        """Parse a block that starts with a ``#`` line, if any.

        A line starting with ``#-`` only carries attributes: it is left out
        of the markdown and the block gets type ``"normal"``. Any other ``#``
        line is kept (without its attribute part) and the block gets type
        ``"header"``. In both cases the following normal lines are included.

        :type doc: DocReader
        :rtype: dict
        :return: The parsed block, or ``None`` if the current line does not
            start with ``#``.
        """
        if not self.is_beginning_of_header_block(doc):
            return None
        header_line = doc.get_line_and_advance()
        block_lines = []
        tokens, start = AttributeParser(header_line).get_attributes()
        block_type = "normal"
        if not header_line.startswith("#-"):
            block_type = "header"
            block_lines.append(header_line[:start].strip())
        block_lines.append(self.parse_normal_block(doc)["md"])
        result = {"md": "\n".join(block_lines), "type": block_type}
        self.extract_attrs(result, tokens)
        return result

    def parse_normal_block(self, doc):
        """Consume lines until the next header, code block or paragraph break.

        An empty line ends the block only when ``_break_on_empty_line`` is
        set.

        :type doc: DocReader
        :rtype: dict
        :return: A block of type ``"autonormal"`` with the consumed lines.
        """
        block_lines = []
        while doc.has_more_lines():
            if (
                self.is_beginning_of_header_block(doc)
                or self.is_beginning_of_code_block(doc)[0]
                or (self._break_on_empty_line and self.is_empty_line(doc))
            ):
                break
            block_lines.append(doc.get_line_and_advance())
        return {"md": "\n".join(block_lines), "type": "autonormal"}

    def extract_attrs(self, result, tokens):
        """Move parsed attributes into a block dict.

        ``id`` and ``t`` are removed from ``tokens`` and stored as top-level
        keys of ``result``; whatever remains is stored under ``attrs`` if it
        is non-empty.

        :param result: The block to update in place.
        :type result: dict
        :param tokens: The parsed attributes; modified in place.
        :type tokens: dict
        """
        for builtin in ("id", "t"):
            if builtin in tokens:
                result[builtin] = tokens.pop(builtin)
        if len(tokens) > 0:
            result["attrs"] = tokens

    def eat_whitespace(self, doc):
        """Skip over empty and whitespace-only lines.

        :rtype: NoneType
        :type doc: DocReader
        """
        while doc.has_more_lines() and self.is_empty_line(doc):
            doc.get_line_and_advance()
        return None
# TIM documentation
#
# Source code for timApp.document.documentparser