# Source code for timApp.document.editing.proofread

import json
from collections import defaultdict
from dataclasses import dataclass, field

import requests
from bs4 import BeautifulSoup
from bs4.element import PageElement, Comment, Tag

from timApp.document.prepared_par import PreparedPar


@dataclass
class Word:
    """A single word together with replacement suggestions for it."""

    # The word exactly as it appeared in the checked text.
    word: str
    # Replacement candidates offered for the word.
    suggestions: list[str]


@dataclass
class SpellCheckResult:
    """Outcome of spell checking one piece of HTML."""

    # The words that were reported as misspelled.
    words: list[Word]
    # The HTML after the spell checker has annotated it.
    new_html: str


def proofread_pars(pars: list[PreparedPar]) -> list[SpellCheckResult]:
    """Spell-check the ``output`` of every given paragraph.

    :param pars: Paragraphs to check; each paragraph's ``output`` attribute is
        passed to ``process_spelling_errors``.
    :return: One result per paragraph, in the same order as ``pars``.
    """
    return [process_spelling_errors(p.output) for p in pars]


# Tag names: text nested inside any of these elements is skipped by is_banned.
banned_tags: set[str] = {
    "code",
}
# CSS classes: text nested inside an element carrying any of these is skipped too.
banned_classes: set[str] = {
    "math",
    "mathp",
    "nospell",
}


def is_banned(e: PageElement) -> bool:
    """Tell whether an element must be left out of spell checking.

    An element is banned when it is an HTML comment, or when any of its
    ancestors either has a tag name listed in ``banned_tags`` or carries at
    least one class listed in ``banned_classes``.

    :param e: The element (typically a text node) to examine.
    :return: True if the element should not be spell-checked.
    """
    if isinstance(e, Comment):
        return True
    return any(
        p.name in banned_tags
        or not banned_classes.isdisjoint(p.get("class", []))
        for p in e.parents
    )


@dataclass
class VoikkoToken:
    """One token of a tokenized text.

    The camelCase field names are kept as they are because they are part of
    this class's public interface.
    """

    # Numeric token kind.
    tokenType: int
    # The exact text of the token.
    tokenText: str


@dataclass
class VoikkoClient:
    """Provides an API to libvoikko that exists in another container as an HTTP service.
    The API is intended to be identical to libvoikko.

    All texts in ``data`` are posted to the service in one request when the
    instance is created. ``tokens``, ``spell`` and ``suggest`` afterwards only
    read the stored response and perform no further requests.
    """

    # Texts to analyse; each of them becomes a key of ``phrase_data``.
    data: list[str]
    # Maps a text from ``data`` to the tokens the service returned for it.
    phrase_data: dict[str, list[VoikkoToken]] = field(init=False)
    # Per-string spelling info: index 0 is the verdict returned by ``spell``,
    # index 1 is the suggestion list returned by ``suggest``.
    spelling: dict[str, tuple[bool, list[str]]] = field(init=False)

    def __post_init__(self):
        """Send ``data`` to the proofreading service and store its answer."""
        # NOTE(review): the request has no timeout and the HTTP status is not
        # checked, so a hung service blocks here and an error response only
        # surfaces as a JSON/KeyError below - consider handling both.
        r = requests.post("http://oiko:5000/api/v1/proofread", json=self.data)
        results = r.json()
        self.spelling = results["spelling"]
        # Pair every submitted text with the token list at the same position;
        # presumably the service answers in request order - verify against
        # the service implementation.
        self.phrase_data = {
            text: [VoikkoToken(tokenText=t[0], tokenType=t[1]) for t in tokenlist]
            for text, tokenlist in zip(self.data, results["tokenlists"])
        }

    def tokens(self, s: str) -> list[VoikkoToken]:
        """Return the stored tokens of the text ``s``.

        :raises KeyError: If ``s`` has no entry in ``phrase_data``.
        """
        return self.phrase_data[s]

    def spell(self, s: str) -> bool:
        """Return the stored spelling verdict for ``s``.

        :raises KeyError: If ``s`` has no entry in ``spelling``.
        """
        return self.spelling[s][0]

    def suggest(self, s: str) -> list[str]:
        """Return the stored suggestions for ``s``.

        :raises KeyError: If ``s`` has no entry in ``spelling``.
        """
        return self.spelling[s][1]


def _unwrap_html_body(bs: BeautifulSoup) -> None:
    """Replace a top-level ``<html>`` whose first child is ``<body>`` with the body's children."""
    # Iterate over a copy of the list since bs4 will manipulate the original one.
    for c in list(bs.contents):
        if not (isinstance(c, Tag) and c.name == "html"):
            continue
        # A childless <html> has nothing to unwrap; indexing it would raise.
        if not c.contents:
            continue
        body = c.contents[0]
        if isinstance(body, Tag) and body.name == "body":
            c.replace_with(*body.contents)


def process_spelling_errors(s: str) -> SpellCheckResult:
    """Spell-check the text of an HTML string and mark the misspelled words.

    Every text node that ``is_banned`` does not exclude is tokenized and
    checked through ``VoikkoClient``. Each misspelled word token is replaced
    with a ``<tim-spell-error>`` element whose ``bind-sugg`` attribute holds
    the JSON-encoded suggestions. From the second occurrence of the same word
    onward the element also gets a ``bind-count`` attribute with the running
    occurrence number.

    :param s: The HTML to check.
    :return: The misspelled words in encounter order (repeats included) and
        the annotated HTML with the outer html/body wrapper removed.
    """
    bs = BeautifulSoup(s, "lxml")
    words = []
    word_occurrence_counts = defaultdict(int)
    text_elements = [e for e in bs.find_all(string=True) if not is_banned(e)]
    # A single client instance covers all text nodes with one service request.
    voikko = VoikkoClient([str(e) for e in text_elements])

    for e in text_elements:
        parts = []
        has_errors = False
        for token in voikko.tokens(str(e)):
            # Token type 1 is word. Everything else, and correctly spelled
            # words, is carried over verbatim.
            if token.tokenType != 1 or voikko.spell(token.tokenText):
                parts.append(token.tokenText)
                continue
            w = Word(
                word=token.tokenText, suggestions=voikko.suggest(token.tokenText)
            )
            word_occurrence_counts[w.word] += 1
            count = word_occurrence_counts[w.word]
            words.append(w)
            se = bs.new_tag(
                "tim-spell-error", attrs={"bind-sugg": json.dumps(w.suggestions)}
            )
            if count > 1:
                se["bind-count"] = count
            se.string = w.word
            parts.append(se)
            has_errors = True
        # Text nodes without errors are left untouched in the tree.
        if has_errors:
            e.replace_with(*parts)

    # Unwrap html and body tags.
    _unwrap_html_body(bs)

    return SpellCheckResult(words=words, new_html=str(bs))
# TIM documentation

# Source code for timApp.document.editing.proofread