Source code for timApp.document.translation.translator

"""
This module contains most notably the TranslationService-interface that
different machine translators must implement in order to be integrated into
TIM's machine translation feature.

Other notable things include a database model for the API-keys of machine
translator services and a processor/wrapper by which the different
translators can be used to translate text from one language to another.
"""

__authors__ = [
    "Noora Jokela",
    "Riku Lehkonen",
    "Vili Moisala",
    "Juho Tarkkanen",
    "Sami Viitanen",
]
__license__ = "MIT"
__date__ = "25.4.2022"

from dataclasses import dataclass

import pypandoc

from timApp.timdb.sqa import db
from timApp.user.usergroup import UserGroup
from timApp.document.docparagraph import DocParagraph
from timApp.document.translation.language import Language
from timApp.document.translation.translationparser import (
    NoTranslate,
    TranslateApproval,
    TranslationParser,
    Table,
    Translate,
)
from timApp.util import logger
from timApp.util.flask.requesthelper import RouteException


# Alias used in the signature of `TranslationService.translate`, which takes a
# list of these blocks. One block is a list of TranslateApproval items
# (presumably the parser's Translate / NoTranslate / Table objects that
# `TranslateProcessor` stores into such lists -- confirm in
# translationparser.py).
TranslateBlock = list[TranslateApproval]
"""Typedef to represent logically connected parts of non- and translatable text.
"""


@dataclass
class Usage:
    """Usage information of a translator service.

    Holds two integers: a character count and a character limit.
    `TranslateProcessor.translate` logs them as "count/limit".
    """

    character_count: int
    character_limit: int
@dataclass
class LanguagePairing:
    """Mapping from standardized (source) language codes to the lists of
    (target) Language objects available for them.
    """

    value: dict[str, list[Language]]

    def __getitem__(self, item: str) -> list[Language]:
        """
        Support the indexing operator [] directly on a LanguagePairing.

        :param item: The key (source language code) to look up.
        :return: The list stored in `value` under `item`.
        :raises KeyError: If `item` is not a key of `value`.
        """
        targets = self.value[item]
        return targets
class TranslationService(db.Model):
    """Database model and interface shared by all machine translators.

    Every method below raises NotImplementedError in this base class;
    concrete translator services are expected to override them.
    """

    __tablename__ = "translationservice"

    id = db.Column(db.Integer, primary_key=True)
    """Translation service identifier."""

    service_name = db.Column(db.Text, unique=True, nullable=False)
    """Human-readable name of the machine translator. Also used as an
    identifier."""

    def translate(
        self,
        texts: list[TranslateBlock],
        source_lang: Language,
        target_lang: Language,
        *,
        tag_handling: str = "",
    ) -> list[str]:
        """
        Translate texts from the source language into the target language.

        Implementations should return the (translated) texts in the same
        order as they appear in the `texts` parameter.

        :param texts: The texts, marked as translatable or not. By
            convention as much translatable text as possible is passed in one
            call in order to minimize the number of separate
            translation-calls.
        :param source_lang: Language to translate from.
        :param target_lang: Language to translate into.
        :param tag_handling: Tag representing a way to separate or otherwise
            control translated text with the translation service. A HACKY way
            to handle the special case of translating (html) tables.
        :return: The strings found inside the items of `texts`, translated
            and in the same order.
        """
        raise NotImplementedError

    def usage(self) -> Usage:
        """
        Report the usage status of the service.

        :return: The current usage of this TranslationService (for example
            the status of an API-key).
        """
        raise NotImplementedError

    def languages(self) -> LanguagePairing:
        """
        Report the language combinations the service can translate between.

        :return: The supported mapping of languages to translate to and from
            with this TranslationService.
        """
        raise NotImplementedError

    def supports(self, source_lang: Language, target_lang: Language) -> bool:
        """
        Tell whether a language combination is supported by the service.

        :param source_lang: Language to translate from.
        :param target_lang: Language to translate into.
        :return: True, if the service can translate from `source_lang` to
            `target_lang`.
        """
        raise NotImplementedError

    def supports_tag_handling(self, tag_type: str) -> bool:
        """
        Tell whether the service supports tag handling in translations.

        With for example XML-tags some services let the caller mark parts of
        the text that must be kept as-is by the machine translation:
        "My name is Dr. <protect>Oak</protect>."

        NOTE this is related to the kinda HACKY way of handling
        Markdown-tables in DeepL-translation.

        :param tag_type: Type of the tag. Some services for example support
            "xml" or "html".
        :return: True, if the tag type is supported.
        """
        raise NotImplementedError

    def get_languages(self, source_langs: bool) -> list[Language]:
        """
        List the languages supported by the TranslationService.

        :param source_langs: Whether source languages must be returned.
        :return: The list of supported source or target languages.
        """
        raise NotImplementedError

    # Polymorphism allows querying multiple objects by their class e.g.
    # `TranslationService.query`.
    __mapper_args__ = {"polymorphic_on": service_name}
class TranslationServiceKey(db.Model):
    """Represents an API-key (or any string value) that is needed for using a
    machine translator and that one or more users are in possession of.
    """

    __tablename__ = "translationservicekey"

    id = db.Column(db.Integer, primary_key=True)
    """Key identifier."""

    # TODO Come up with a better name?
    api_key = db.Column(db.Text, nullable=False)
    """The key needed for using related service."""

    group_id = db.Column(db.Integer, db.ForeignKey("usergroup.id"), nullable=False)
    group: UserGroup = db.relationship("UserGroup", uselist=False)
    """The group that can use this key."""

    service_id = db.Column(
        db.Integer,
        db.ForeignKey("translationservice.id"),
        nullable=False,
    )
    service: TranslationService = db.relationship("TranslationService", uselist=False)
    """The service that this key is used in."""

    @staticmethod
    def get_by_user_group(
        user_group: UserGroup | None,
    ) -> "TranslationServiceKey | None":
        """
        Query a key based on a group that could have access to it.

        :param user_group: The group that wants to use a key.
        :return: The first TranslationServiceKey whose `group_id` equals the
            id of `user_group`, or None if `user_group` is None or no key
            matches.
        """
        # `group_id` is non-nullable, so no stored key can belong to "no group".
        if user_group is None:
            return None
        # `Query.get()` looks rows up by primary key and does not accept a
        # filter expression, so the lookup by group must go through `filter()`.
        # The integer column is compared with the group's id (the column
        # referenced by the "usergroup.id" foreign key), not the group object.
        return TranslationServiceKey.query.filter(
            TranslationServiceKey.group_id == user_group.id
        ).first()

    def to_json(self) -> dict:
        """
        Create a JSON representation of data related to the
        TranslationServiceKey instance.

        NOTE the returned dict contains the API-key itself in plain text.

        :return: A dict with the related service's name under "translator"
            and the key value under "APIkey".
        """
        return {
            "translator": self.service.service_name,
            "APIkey": self.api_key,
        }
# PyCharm would otherwise want this class to implement the superclass methods.
# noinspection PyAbstractClass
class RegisteredTranslationService(TranslationService):
    """A translation service whose use is constrained by user group."""

    def register(self, user_group: UserGroup) -> None:
        """
        Set some state on the service object based on a user group.

        Raises NotImplementedError here; subclasses provide the behaviour.
        `TranslateProcessor` calls this before using the translator when a
        user group is given.

        :param user_group: The somehow related user group.
        :return: None.
        """
        raise NotImplementedError
@dataclass
class TranslationTarget:
    """Type that can be passed around in translations.

    Wraps either a plain string or a DocParagraph.
    """

    value: str | DocParagraph

    def get_text(self) -> str:
        """
        Get the text of the wrapped value.

        :return: The string itself for a str value, or the `md` attribute of
            a DocParagraph value.
        :raises Exception: If the value is neither a str nor a DocParagraph.
        """
        wrapped = self.value
        if isinstance(wrapped, str):
            return wrapped
        if isinstance(wrapped, DocParagraph):
            return wrapped.md
        raise Exception("Translation target had unexpected type")
class TranslateProcessor:
    """Wrapper for translating Markdown with one TranslationService and one
    fixed source/target language pair chosen at initialization.
    """

    def __init__(
        self,
        translator_code: str,
        s_lang: str,
        t_lang: str,
        user_group: UserGroup | None,
    ):
        """
        Based on a name, get the correct TranslationService from database and
        perform needed initializations on it.

        :param translator_code: Name that identifies the TranslationService
            being used.
        :param s_lang: Source language of translatable text.
        :param t_lang: Target language to translate text into.
        :param user_group: Identification of user, that can be allowed to use
            some TranslationServices (for example DeepL requires an API-key
            that the user sets to their account).
        :raises RouteException: If the translator does not support the
            language pair.
        """
        # `.one()` raises if there is not exactly one service with this name.
        translator = (
            TranslationService.query.with_polymorphic("*")
            .filter(TranslationService.service_name == translator_code)
            .one()
        )

        if user_group is not None and isinstance(
            translator, RegisteredTranslationService
        ):
            translator.register(user_group)

        source_lang_ = Language.query_by_code(s_lang)
        target_lang_ = Language.query_by_code(t_lang)

        if not translator.supports(source_lang_, target_lang_):
            raise RouteException(
                description=f"The language pair from {source_lang_} to {target_lang_} is not supported with {translator.service_name}"
            )

        self.translator = translator
        self.parser = TranslationParser()
        self.source_lang = source_lang_
        self.target_lang = target_lang_

    def _translate_table_md(self, table_md: str) -> str:
        """
        Translate a Markdown table by round-tripping it through HTML.

        NOTE Requires the translator to support tag handling in HTML; the
        caller checks this.

        :param table_md: Markdown source of the table.
        :return: The translated table converted back into Markdown.
        """
        # Turn the markdown into html.
        table_html: str = pypandoc.convert_text(table_md, to="html", format="md")
        # Translate as HTML.
        # TODO All document's tables could potentially be send to translator
        #  at once instead of one by one as done here.
        table_html_tr = self.translator.translate(
            [[Translate(table_html)]],
            self.source_lang,
            self.target_lang,
            tag_handling="html",
        )
        # Turn the html back into md.
        return pypandoc.convert_text(table_html_tr[0], to="md", format="html")

    def _translate_raw_texts(self, mds: list[str]) -> list[str]:
        """
        Most primitive of the translate-methods to translate texts between
        languages.

        :param mds: The texts to translate.
        :return: The translated texts in same order as input.
        """
        # Turn the text into lists of objects that describe whether they
        # can be translated or not.
        # TODO The flattening (calling `chain.from_iterable`) could
        #  probably be done in parser
        blocks: list[list[TranslateApproval]] = [
            self.parser.get_translate_approvals(md) for md in mds
        ]

        # Go over the blocks, picking the tables out for special translation
        # and leaving the rest for the normal handling below. Elements are
        # replaced in place by index; the lists never change length.
        for block in blocks:
            for i, elem in enumerate(block):
                if not isinstance(elem, Table):
                    continue
                if self.translator.supports_tag_handling("html"):
                    # Special (HACKY) case, where md-tables are translated
                    # as html (if supported).
                    # TODO Actually implement table_collect at
                    #  translationparser.py so that non-html-handling
                    #  translators can be used as well
                    # Mark the translated table as NoTranslate, so it
                    # doesn't get translated again when the list is passed
                    # on to mass-translation.
                    # TODO Adding this newline is kinda HACKY and not
                    #  thought out.
                    block[i] = NoTranslate("\n" + self._translate_table_md(elem.text))
                else:
                    # The table cannot be translated and is handled as is.
                    block[i] = NoTranslate(elem.text)

        # Pass object-lists with translatable text to the machine translator
        # object.
        # If supported, the translator protects and removes the protection
        # from the text (for example adding XML-ignore-tags in DeepL's case).
        translated_mds = self.translator.translate(
            blocks, self.source_lang, self.target_lang
        )

        # TODO what are the paragraphs separated by? "\n\n"? Seems like
        #  this would need more handling in regard to TIM's block
        #  separation and id's etc.
        # TODO Do some MD-elements (from parser) not include newline
        #  postfix and should this newline-addition then be placed into
        #  parser-module?
        return translated_mds

    def _plugin_attr_str(self, par: DocParagraph) -> str:
        """
        Build the attribute string that lets the parser identify a plugin's
        code block.

        Forms the Pandoc abstract syntax tree -representation of a
        code-block's Attr (identifier, classes, key-value pairs) from the
        paragraph's attributes, passes it to the parser's `attr_collect` and
        joins the `text` of the items in the first element of its result.

        :param par: The plugin paragraph whose attributes are used.
        :return: The attributes as a string of Markdown.
        """
        attrs = par.attrs or {}
        task_id = attrs.get("taskId", "")
        raw_classes = attrs.get("classes")
        classes: list[str] = raw_classes if isinstance(raw_classes, list) else []
        # NOTE(review): only "taskId" is excluded here, so a "classes" entry
        #  ends up both in `classes` and in the key-value pairs -- confirm
        #  this is what the parser expects.
        kv_pairs = [(k, v) for k, v in attrs.items() if k != "taskId"]
        collected = self.parser.attr_collect([task_id, classes, kv_pairs])[0]
        return "".join(part.text for part in collected)

    def _translate_paragraphs(self, targets: list[TranslationTarget]) -> list[str]:
        """
        Translate pieces of Markdown roughly the size of a generic paragraph.

        :param targets: The list of objects whose Markdown-text-value to
            parse.
        :return: List containing the translated pieces of markdown in same
            order as input.
        :raises RouteException: If anything fails during the translation; the
            original exception is kept as the cause.
        """
        mds = []
        for target in targets:
            md = target.get_text()
            if isinstance(target.value, DocParagraph) and target.value.is_plugin():
                # Add the attributes to the content so that parser can
                # identify the code block as a plugin.
                # NOTE that the parser should only use the attributes for
                # identification and deletes them from the translated
                # result ie. this is a special case!
                attr_str = self._plugin_attr_str(target.value)
                # Only the first code fence of the paragraph gets the
                # attributes.
                md = md.replace("```\n", f"``` {attr_str}\n", 1)
            mds.append(md)

        try:
            return self._translate_raw_texts(mds)
        except Exception as e:
            # Chain the original exception so its traceback is not lost.
            raise RouteException("Automatic translation failed: " + str(e)) from e

    def translate(self, pars: list[TranslationTarget]) -> list[str]:
        """
        Translate a list of text-containing items using the
        TranslationService-instance and languages set at initialization.

        Also writes each translated part and the translator's current usage
        to the debug log.

        :param pars: TIM-paragraphs containing Markdown to translate.
        :return: The translatable text contained in input paragraphs
            translated according to the processor-state (languages and the
            translator).
        """
        translated_texts = self._translate_paragraphs(pars)

        for i, part in enumerate(translated_texts):
            logger.log_debug(
                f"==== Part {i} ({len(part)} characters): ================================"
                f"{part}"
                "================================================"
            )

        usage = self.translator.usage()
        logger.log_debug(
            "Current usage: "
            + str(usage.character_count)
            + "/"
            + str(usage.character_limit)
        )

        return translated_texts
def replace_md_aliases(text: str) -> str:
    """
    Turn the HTML-style aliases back into the Markdown-syntax-characters they
    stand for.

    On some machine translators (tested with DeepL) the Markdown syntax
    characters break easier compared to their HTML-style counterparts. This
    is baked into the translation-parser, but must be converted back to
    Markdown-style in order to follow TIM's preferences.

    :param text: Text to replace the HTML-tags of.
    :return: Text with the `<i>`/`</i>` tags replaced by `*` and the
        `<b>`/`</b>` tags replaced by `**`.
    """
    # TODO Map these replacements somehow from translationparser.py instead of
    #  hard-coding here (and there).
    aliases = (
        ("<i>", "*"),
        ("</i>", "*"),
        ("<b>", "**"),
        ("</b>", "**"),
    )
    result = text
    for html_tag, md_mark in aliases:
        result = result.replace(html_tag, md_mark)
    return result