# Source code for timApp.document.translation.translator

"""
This module contains most notably the TranslationService-interface that
different machine translators must implement in order to be integrated into
TIM's machine translation feature.

Other notable things include a database model for the API-keys of machine
translator services and a processor/wrapper by which the different
translators can be used to translate text from one language to another.
"""

__authors__ = [
    "Noora Jokela",
    "Riku Lehkonen",
    "Vili Moisala",
    "Juho Tarkkanen",
    "Sami Viitanen",
]
__license__ = "MIT"
__date__ = "25.4.2022"

from dataclasses import dataclass

import pypandoc

from timApp.timdb.sqa import db
from timApp.user.usergroup import UserGroup
from timApp.document.docparagraph import DocParagraph
from timApp.document.translation.language import Language
from timApp.document.translation.translationparser import (
    NoTranslate,
    TranslateApproval,
    TranslationParser,
    Table,
    Translate,
)
from timApp.util import logger
from timApp.util.flask.requesthelper import RouteException


# Alias for one list of TranslateApproval objects. `TranslationService.translate`
# takes a list of these blocks as its `texts` parameter.
TranslateBlock = list[TranslateApproval]
"""Typedef to represent logically connected parts of non- and translatable text.
"""


@dataclass
class Usage:
    """Contains information about the usage of a translator service.

    Instances are returned by ``TranslationService.usage`` and logged by
    ``TranslateProcessor.translate`` in the form
    ``character_count/character_limit``.
    """

    # Presumably the number of characters consumed so far on the service.
    character_count: int
    # Presumably the maximum number of characters the service allows.
    character_limit: int


@dataclass
class LanguagePairing:
    """Maps standardized codes of (source) Languages to lists of (target)
    Language objects.
    """

    # Keys are source language codes, values are the target languages
    # available for that source.
    value: dict[str, list[Language]]

    def __getitem__(self, item: str) -> list[Language]:
        """
        Implement the indexing operator [] on LanguagePairing.

        :param item: The key (source language code) to index with.
        :return: The list of target languages stored under `item`.
        :raises KeyError: If `item` is not a key of the wrapped mapping.
        """
        return self.value[item]


class TranslationService(db.Model):
    """Represents the information and methods that must be available from all
    possible machine translators.

    This class acts as the interface of the machine translation feature:
    every method below raises NotImplementedError and is meant to be
    overridden by a concrete translator.
    """

    __tablename__ = "translationservice"

    id = db.Column(db.Integer, primary_key=True)
    """Translation service identifier."""

    service_name = db.Column(db.Text, unique=True, nullable=False)
    """Human-readable name of the machine translator. Also used as an
    identifier."""

    def translate(
        self,
        texts: list[TranslateBlock],
        source_lang: Language,
        target_lang: Language,
        *,
        tag_handling: str = "",
    ) -> list[str]:
        """
        Translate texts from source to target language.

        The implementor of this method should return the (translated) text in
        the same order as found in the input `texts`-parameter originally.

        :param texts: The texts marked for translation or not. A convention
         would be to pass as much of the translatable text as possible in this
         parameter in order to minimize the amount of separate
         translation-calls.
        :param source_lang: Language to translate from.
        :param target_lang: Language to translate into.
        :param tag_handling: Tag representing a way to separate or otherwise
         control translated text with the translation service. A HACKY way to
         handle special case with translating (html) tables.
        :return: List of strings found inside the items of `texts`-parameter,
         in the same order and translated.
        :raises NotImplementedError: Always, unless overridden.
        """
        raise NotImplementedError

    def usage(self) -> Usage:
        """
        Get the service's usage status.

        :return: The current usage of this TranslationService (for example
         status of an API-key).
        :raises NotImplementedError: Always, unless overridden.
        """
        raise NotImplementedError

    def languages(self) -> LanguagePairing:
        """
        Get the language-combinations for translations supported with the
        service.

        :return: The supported mapping of languages to translate to and from
         with this TranslationService.
        :raises NotImplementedError: Always, unless overridden.
        """
        raise NotImplementedError

    def supports(self, source_lang: Language, target_lang: Language) -> bool:
        """
        Check if the service supports a language-combination.

        :param source_lang: Language to translate from.
        :param target_lang: Language to translate into.
        :return: True, if the service can translate from `source_lang` to
         `target_lang`.
        :raises NotImplementedError: Always, unless overridden.
        """
        raise NotImplementedError

    def supports_tag_handling(self, tag_type: str) -> bool:
        """
        Check if the service supports tag handling in translations. For example
        using XML-tags, some services offer controlling parts of the text, that
        should be kept as-is and not be affected by the machine translation:
        "My name is Dr. <protect>Oak</protect>."

        NOTE this is related to the kinda HACKY way of handling Markdown-tables
        in DeepL-translation.

        :param tag_type: Type of the tag. Some services for example support
         "xml" or "html".
        :return: True, if the tag type is supported.
        :raises NotImplementedError: Always, unless overridden.
        """
        raise NotImplementedError

    def get_languages(self, source_langs: bool) -> list[Language]:
        """
        Return languages supported by the TranslationService.

        :param source_langs: Whether source languages must be returned.
        :return: The list of supported source or target languages.
        :raises NotImplementedError: Always, unless overridden.
        """
        raise NotImplementedError

    # Polymorphism allows querying multiple objects by their class e.g.
    # `TranslationService.query`.
    __mapper_args__ = {"polymorphic_on": service_name}


class TranslationServiceKey(db.Model):
    """Represents an API-key (or any string value) that is needed for using a
    machine translator and that one or more users are in possession of.
    """

    __tablename__ = "translationservicekey"

    id = db.Column(db.Integer, primary_key=True)
    """Key identifier."""

    # TODO Come up with a better name?
    api_key = db.Column(db.Text, nullable=False)
    """The key needed for using related service."""

    group_id = db.Column(db.Integer, db.ForeignKey("usergroup.id"), nullable=False)
    group: UserGroup = db.relationship("UserGroup", uselist=False)
    """The group that can use this key."""

    service_id = db.Column(
        db.Integer,
        db.ForeignKey("translationservice.id"),
        nullable=False,
    )
    service: TranslationService = db.relationship("TranslationService", uselist=False)
    """The service that this key is used in."""

    @staticmethod
    def get_by_user_group(
        user_group: UserGroup | None,
    ) -> "TranslationServiceKey | None":
        """
        Query a key based on a group that could have access to it.

        :param user_group: The group that wants to use a key. May be None, in
         which case no key can match.
        :return: The first matching TranslationServiceKey instance, or None if
         no key is found (or no group was given).
        """
        if user_group is None:
            return None
        # `Query.get()` looks a row up by primary key and cannot take a filter
        # expression, so filter on the foreign key column instead. The column
        # holds the group's id, hence the comparison against `user_group.id`
        # rather than the UserGroup object itself.
        return TranslationServiceKey.query.filter(
            TranslationServiceKey.group_id == user_group.id
        ).first()

    def to_json(self) -> dict:
        """
        Create a JSON representation of data related to the
        TranslationServiceKey instance.

        :return: A dict with the related service's name under "translator"
         and the key itself under "APIkey".
        """
        return {
            "translator": self.service.service_name,
            "APIkey": self.api_key,
        }


# PyCharm would otherwise want this class to implement the superclass methods.
# noinspection PyAbstractClass
class RegisteredTranslationService(TranslationService):
    """A translation service whose use is constrained by user group.

    TranslateProcessor calls `register` on services of this type before
    using them, whenever a user group is available.
    """

    def register(self, user_group: UserGroup) -> None:
        """
        Set some state to the service object based on user group.

        :param user_group: The somehow related user group.
        :return: None.
        :raises NotImplementedError: Always, unless overridden.
        """
        raise NotImplementedError


@dataclass
class TranslationTarget:
    """Type that can be passed around in translations.

    Wraps either a plain string or a DocParagraph so that both can be handed
    to the translation code through one interface.
    """

    value: str | DocParagraph

    def get_text(self) -> str:
        """
        Get the text to translate from the wrapped value.

        :return: The string itself for a `str` value, or the `md` attribute
         for a DocParagraph value.
        :raises TypeError: If the wrapped value is neither a `str` nor a
         DocParagraph.
        """
        if isinstance(self.value, str):
            return self.value
        if isinstance(self.value, DocParagraph):
            return self.value.md
        # TypeError is a subclass of Exception, so callers that catch the
        # previously raised plain Exception keep working.
        raise TypeError("Translation target had unexpected type")


class TranslateProcessor:
    """Wrapper that uses a TranslationService to translate Markdown texts
    from one language to another.
    """

    def __init__(
        self,
        translator_code: str,
        s_lang: str,
        t_lang: str,
        user_group: UserGroup | None,
    ):
        """
        Based on a name, get the correct TranslationService from database and
        perform needed initializations on it.

        :param translator_code: Name that identifies the
         TranslationService being used.
        :param s_lang: Source language of translatable text.
        :param t_lang: Target language to translate text into.
        :param user_group: Identification of user, that can be allowed to use
         some TranslationServices (for example DeepL requires an API-key that
         the user sets to their account).
        :raises RouteException: If the translator does not support
         translating from the source language to the target language.
        """

        # NOTE `.one()` raises unless exactly one service matches the name.
        translator = (
            TranslationService.query.with_polymorphic("*")
            .filter(TranslationService.service_name == translator_code)
            .one()
        )

        if user_group is not None and isinstance(
            translator, RegisteredTranslationService
        ):
            translator.register(user_group)

        source_lang_ = Language.query_by_code(s_lang)
        target_lang_ = Language.query_by_code(t_lang)

        if not translator.supports(source_lang_, target_lang_):
            raise RouteException(
                description=f"The language pair from {source_lang_} to {target_lang_} is not supported with {translator.service_name}"
            )

        self.translator = translator
        self.parser = TranslationParser()
        self.source_lang = source_lang_
        self.target_lang = target_lang_

    def _translate_table(self, table: Table) -> NoTranslate:
        """
        Translate a single Markdown table, if the translator allows it.

        Special (HACKY) case, where md-tables are translated as html (if
        supported).

        :param table: The table element picked out by the parser.
        :return: A NoTranslate element holding the translated table, or the
         original table text if the translator cannot handle HTML tags. It is
         marked NoTranslate either way, so that it does not get translated
         (again) when passed on to mass-translation.
        """
        if not self.translator.supports_tag_handling("html"):
            # The table cannot be translated and is handled as is.
            # TODO Actually implement table_collect at translationparser.py
            #  so that non-html-handling translators can be used as well
            return NoTranslate(table.text)

        # Turn the markdown into html.
        table_html: str = pypandoc.convert_text(table.text, to="html", format="md")
        # Translate as HTML. NOTE Requires translator to support tag handling
        # in HTML.
        # TODO All document's tables could potentially be send to translator
        #  at once instead of one by one as done here.
        table_html_tr = self.translator.translate(
            [[Translate(table_html)]],
            self.source_lang,
            self.target_lang,
            tag_handling="html",
        )
        # Turn the html back into md.
        table_md_tr = pypandoc.convert_text(table_html_tr[0], to="md", format="html")
        # TODO Adding this newline is kinda HACKY and not thought out.
        return NoTranslate("\n" + table_md_tr)

    def _translate_raw_texts(self, mds: list[str]) -> list[str]:
        """
        Most primitive of the translate-methods to translate texts between
        languages.

        :param mds: The texts to translate.
        :return: The translated texts in same order as input.
        """
        # Turn the text into lists of objects that describe whether they
        # can be translated or not.
        # TODO The flattening (calling `chain.from_iterable`) could
        #  probably be done in parser
        blocks: list[list[TranslateApproval]] = [
            self.parser.get_translate_approvals(md) for md in mds
        ]

        # Pick the tables out for special translation; the rest is handled
        # normally below. Elements are replaced in place, the length of a
        # block never changes, so enumerating while assigning is safe.
        for block in blocks:
            for i, elem in enumerate(block):
                if isinstance(elem, Table):
                    block[i] = self._translate_table(elem)

        # Pass object-lists with translatable text to the machine
        # translator object.
        # If supported, the translator protects and removes the protection
        # from the text (for example adding XML-ignore-tags in DeepL's
        # case).
        # TODO what are the paragraphs separated by? "\n\n"? Seems like
        #  this would need more handling in regard to TIM's block
        #  separation and id's etc.
        # TODO Do some MD-elements (from parser) not include newline
        #  postfix and should this newline-addition then be placed into
        #  parser-module?
        return self.translator.translate(blocks, self.source_lang, self.target_lang)

    def _translate_paragraphs(self, targets: list[TranslationTarget]) -> list[str]:
        """
        Translate pieces of Markdown roughly the size of a generic
        paragraph.

        :param targets: The list of objects whose Markdown-text-value to
         parse.
        :return: List containing the translated pieces of markdown in same
         order as input.
        :raises RouteException: If translating the texts fails for any
         reason; the original exception is chained as the cause.
        """

        mds = []

        for target in targets:
            md = target.get_text()
            if isinstance(target.value, DocParagraph) and target.value.is_plugin():
                # Add the attributes to the content so that parser can
                # identify the code block as a plugin.
                # NOTE that the parser should only use the attributes for
                # identification and deletes them from the translated
                # result ie. this is a special case!

                # Form the Pandoc abstract syntax tree -representation of a
                # code-block's Attr and glue the parts returned as is back
                # together into a string of Markdown.
                # Missing or empty attrs yield an empty id, no classes and
                # no key-value pairs.
                attrs = target.value.attrs or {}
                taskid = attrs.get("taskId", "")
                maybe_classes = attrs.get("classes")
                classes: list[str] = (
                    maybe_classes if isinstance(maybe_classes, list) else []
                )
                # NOTE(review): only "taskId" is excluded here, so a
                # "classes" entry also ends up in the key-value pairs --
                # confirm this is what the parser expects.
                kv_pairs = [(k, v) for k, v in attrs.items() if k != "taskId"]
                attr_str = "".join(
                    part.text
                    for part in self.parser.attr_collect([taskid, classes, kv_pairs])[0]
                )
                md = md.replace("```\n", f"``` {attr_str}\n", 1)

            mds.append(md)

        try:
            return self._translate_raw_texts(mds)
        except Exception as e:
            # Chain the cause so the original traceback is not lost.
            raise RouteException("Automatic translation failed: " + str(e)) from e

    def translate(self, pars: list[TranslationTarget]) -> list[str]:
        """
        Translate a list of text-containing items using the
        TranslationService-instance and languages set at initialization.

        :param pars: TIM-paragraphs containing Markdown to translate.
        :return: The translatable text contained in input paragraphs
         translated according to the processor-state (languages and the
         translator).
        """
        translated_texts = self._translate_paragraphs(pars)

        for i, part in enumerate(translated_texts):
            logger.log_debug(
                f"==== Part {i} ({len(part)} characters): ================================"
                f"{part}"
                "================================================"
            )

        usage = self.translator.usage()
        logger.log_debug(
            "Current usage: "
            + str(usage.character_count)
            + "/"
            + str(usage.character_limit)
        )

        return translated_texts


def replace_md_aliases(text: str) -> str:
    """
    Replace the aliases that are used in place of Markdown-syntax-characters.

    On some machine translators (tested with DeepL) the Markdown syntax
    characters break easier compared to their HTML-style counterparts. This is
    baked into the translation-parser, but must be converted back to
    Markdown-style in order to follow TIM's preferences.

    :param text: Text to replace the HTML-tags of.
    :return: Text with the HTML-tags replaced.
    """
    # TODO Map these replacements somehow from translationparser.py instead of
    #  hard-coding here (and there).
    # The tags do not overlap each other, so the order of replacement does
    # not affect the result.
    aliases = (
        ("<i>", "*"),
        ("</i>", "*"),
        ("<b>", "**"),
        ("</b>", "**"),
    )
    for html_tag, md_syntax in aliases:
        text = text.replace(html_tag, md_syntax)
    return text
# TIM documentation

# Source code for timApp.document.translation.translator