Source code for timApp.document.editing.proofread

import json
from collections import defaultdict
from dataclasses import dataclass, field

import requests
from bs4 import BeautifulSoup
from bs4.element import PageElement, Comment, Tag

from timApp.document.prepared_par import PreparedPar


[docs]@dataclass class Word: word: str suggestions: list[str]
[docs]@dataclass class SpellCheckResult: words: list[Word] new_html: str
[docs]def proofread_pars(pars: list[PreparedPar]) -> list[SpellCheckResult]: return [process_spelling_errors(p.output) for p in pars]
banned_tags = {"code"} banned_classes = {"math", "mathp", "nospell"}
[docs]def is_banned(e: PageElement) -> bool: return isinstance(e, Comment) or any( p.name in banned_tags or (set(p.get("class", [])) & banned_classes) for p in e.parents )
[docs]@dataclass class VoikkoToken: tokenType: int tokenText: str
[docs]@dataclass class VoikkoClient: """Provides an API to libvoikko that exists in another container as an HTTP service. The API is intended to be identical to libvoikko. """ data: list[str] phrase_data: dict[str, list[VoikkoToken]] = field(init=False) spelling: dict[str, tuple[bool, list[str]]] = field(init=False) def __post_init__(self): r = requests.post("http://oiko:5000/api/v1/proofread", json=self.data) results = r.json() self.spelling = results["spelling"] self.phrase_data = dict( zip( self.data, [ [VoikkoToken(tokenText=t[0], tokenType=t[1]) for t in tokenlist] for tokenlist in results["tokenlists"] ], ) )
[docs] def tokens(self, s: str) -> list[VoikkoToken]: return self.phrase_data[s]
[docs] def spell(self, s: str) -> bool: return self.spelling[s][0]
[docs] def suggest(self, s: str) -> list[str]: return self.spelling[s][1]
[docs]def process_spelling_errors(s: str) -> SpellCheckResult: bs = BeautifulSoup(s, "lxml") words = [] word_occurrence_counts = defaultdict(int) text_elements = [e for e in bs.find_all(text=True) if not is_banned(e)] voikko = VoikkoClient([str(e) for e in text_elements]) for e in text_elements: # type: PageElement parts = [] has_errors = False for word in voikko.tokens(str(e)): # Token type 1 is word. if word.tokenType == 1 and not voikko.spell(word.tokenText): w = Word( word=word.tokenText, suggestions=voikko.suggest(word.tokenText) ) count = word_occurrence_counts[w.word] count += 1 word_occurrence_counts[w.word] = count words.append(w) se = bs.new_tag( "tim-spell-error", attrs={"bind-sugg": json.dumps(w.suggestions)} ) if count > 1: se["bind-count"] = count se.string = w.word parts.append(se) has_errors = True else: parts.append(word.tokenText) if has_errors: n = BeautifulSoup("", "lxml") for f in parts: n.append(f) e.replace_with(n) # Unwrap html and body tags. # Use copy of the list since bs4 will manipulate the original one for c in list(bs.contents): if isinstance(c, Tag) and c.name == "html": body = c.contents[0] if isinstance(body, Tag) and body.name == "body": c.replace_with(*body.contents) new_html = str(bs) return SpellCheckResult(words=words, new_html=new_html)