"""
Contains implementation of the TranslationService-interface for the DeepL
machine translator: https://www.deepl.com/translator.
Covers both the DeepL API Free and the DeepL API Pro versions.
"""
__authors__ = [
"Noora Jokela",
"Riku Lehkonen",
"Vili Moisala",
"Juho Tarkkanen",
"Sami Viitanen",
]
__license__ = "MIT"
__date__ = "25.4.2022"
import langcodes
from requests import post, Response
from requests.exceptions import JSONDecodeError
from timApp.document.translation.language import Language
from timApp.document.translation.translationparser import TranslateApproval, NoTranslate
from timApp.document.translation.translator import (
RegisteredTranslationService,
TranslationServiceKey,
TranslateBlock,
Usage,
LanguagePairing,
replace_md_aliases,
)
from timApp.timdb.sqa import db
from timApp.user.usergroup import UserGroup
from timApp.util import logger
from timApp.util.flask.requesthelper import NotExist, RouteException
from timApp.util.flask.cache import cache
from tim_common.vendor.requests_futures import FuturesSession, Future
# Timeout passed to cache.memoize for the language queries in this module;
# 3600 * 24 seconds is 24 hours.
LANGUAGES_CACHE_TIMEOUT = 3600 * 24 # seconds
class DeeplTranslationService(RegisteredTranslationService):
    """
    Translation service using the DeepL API Free.

    The instance must be bound to a user group's API key with ``register``
    before any method that contacts the API is called, because the
    authorization headers are only created there.
    """

    def __init__(self, values: dict):
        """
        Constructor for setting all the needed DeepL-specific fields.

        :param values: Contains values for the fields. The keys
            ``"ignore_tag"`` and ``"service_url"`` are required.
        :raises KeyError: If either of the required keys is missing.
        """
        self.ignore_tag = values["ignore_tag"]
        self.service_url = values["service_url"]

    # TODO Would be better as nullable=False, but that prevents creating
    #  non-DeeplTranslationService -subclasses of TranslationService.
    service_url = db.Column(db.Text)
    """The url base for the API calls."""

    # TODO Would be better as nullable=False, but that prevents creating
    #  non-DeeplTranslationService -subclasses of TranslationService.
    ignore_tag = db.Column(db.Text)
    """The XML-tag name to use for ignoring pieces of text when XML-handling is
    used. Should be chosen to be some uncommon string not found in many texts.
    """

    headers: dict[str, str]
    """Request-headers needed for authentication with the API-key."""

    source_Language_code: str
    """The source language's code (helps handling regional variants that DeepL
    doesn't differentiate).
    """

    def register(self, user_group: UserGroup) -> None:
        """
        Set headers to use the user group's API-key ready for translation
        calls.

        :param user_group: The user group whose API key will be used.
        :raises NotExist: If no API key is found.
        :raises RouteException: If more than one key is found from user.
        """
        # One user group should match one service per one key.
        api_keys = TranslationServiceKey.query.filter(
            TranslationServiceKey.service_id == self.id,
            TranslationServiceKey.group_id == user_group.id,
        ).all()
        if not api_keys:
            raise NotExist(
                "Please add a DeepL API key that corresponds the chosen plan into your account"
            )
        if len(api_keys) > 1:
            # TODO Does revealing this info compromise security in any way?
            raise RouteException(
                "A user should not have more than one (1) API-key per service."
            )
        self.headers = {"Authorization": f"DeepL-Auth-Key {api_keys[0].api_key}"}

    # TODO Change the dicts to DeepLTranslateParams and DeeplResponse or smth
    def _post(self, url_slug: str, data: dict | None = None) -> dict:
        """
        Perform an authorized POST-request to the DeepL-API.

        :param url_slug: The last part of URL-path for the API function without
            the starting '/' slash.
        :param data: Data to be transmitted along the request.
        :return: The JSON-response returned by the API.
        :raises RouteException: If DeepL API returned an error.
        """
        resp = post(f"{self.service_url}/{url_slug}", data=data, headers=self.headers)
        return self._handle_post_response(resp)

    def _handle_post_response(self, resp: Response) -> dict:
        """
        Handle converting successful response into JSON or raise an exception
        with a fitting message.

        :param resp: The DeepL API -response to handle.
        :return: The JSON-response returned by the API.
        :raises RouteException: If DeepL API returned an error status.
        :raises Exception: If a successful response did not contain valid
            JSON.
        """
        if resp.ok:
            try:
                return resp.json()
            except JSONDecodeError as e:
                # Chain the cause so the decoding traceback is not lost.
                raise Exception(f"DeepL API returned malformed JSON: {e}") from e

        status_code = resp.status_code
        # Messages for the status codes that get a dedicated explanation.
        known_messages = {
            400: "The request to the DeepL API was bad. Please check your parameters.",
            403: "Authorization failed. Please check your DeepL API key for typos.",
            404: "The requested translator could not be found. Please try again later.",
            413: "The request size exceeds the API's limit. Please try again with a smaller document.",
            414: "The request URL is too long. Please contact TIM support.",
            429: "Too many requests were sent. Please wait and resend the request later.",
            456: "You have exceeded your character quota. Please try again when your quota has reset.",
            503: "Translator currently unavailable. Please try again later.",
            529: "Too many requests were sent. Please wait and resend the request later.",
        }
        message = known_messages.get(status_code)
        if message is None:
            if 500 <= status_code < 600:
                # Any other server-side error shares one generic message.
                message = (
                    "An internal error occurred on the DeepL server. Please try again."
                )
            else:
                # TODO Do not show this to user. Confirm, that wuff is sent.
                message = f"'{resp.url}' responded with: {status_code}"
        raise RouteException(
            description="The request failed. Error message: " + message
        )

    def _translate(
        self,
        session: FuturesSession,
        text: list[str],
        source_lang: str | None,
        target_lang: str,
        *,
        split_sentences: str | None = None,
        preserve_formatting: str | None = None,
        tag_handling: str | None = None,
        non_splitting_tags: list[str] | None = None,
        splitting_tags: list[str] | None = None,
        ignore_tags: list[str] | None = None,
    ) -> Future:
        """
        Supports most of the parameters of a DeepL API translate call.
        See https://www.deepl.com/docs-api/translating-text/request/ for valid
        parameter values and more information.

        With tag handling for example to handle the tag "<x>" the parameter
        value should be "x".

        :param session: Object to use in constructing the single DeepL API
            translate-call.
        :param text: Text to translate that can contain XML.
        :param source_lang: Language of the text. The codes "en-gb" and
            "en-us" (in any letter case) are sent as plain "en".
        :param target_lang: Language to translate the text into.
        :param split_sentences: Is text split before translation.
        :param preserve_formatting: Is formatting preserved during translation.
        :param tag_handling: Are tags intelligently handled. XML and HTML are
            currently supported.
        :param non_splitting_tags: Tags that never split sentences. None is
            treated as an empty list.
        :param splitting_tags: Tags that always split sentences. None is
            treated as an empty list.
        :param ignore_tags: Tags to ignore when translating. None is treated
            as an empty list.
        :return: A Future-object of the DeepL API translate-call.
        """
        src_lang = source_lang
        if source_lang is not None and source_lang.lower() in ("en-gb", "en-us"):
            src_lang = "en"

        logger.log_debug(f"Amount of separate translatable texts: {len(text)}/50")

        data = {
            "text": text,
            "source_lang": src_lang,
            "target_lang": target_lang,
            "split_sentences": split_sentences,
            "preserve_formatting": preserve_formatting,
            "tag_handling": tag_handling,
            # The tag lists are transmitted as comma-separated strings.
            "non_splitting_tags": ",".join(non_splitting_tags or []),
            "splitting_tags": ",".join(splitting_tags or []),
            "ignore_tags": ",".join(ignore_tags or []),
        }
        return session.post(
            f"{self.service_url}/translate", data=data, headers=self.headers
        )

    @cache.memoize(timeout=LANGUAGES_CACHE_TIMEOUT, args_to_ignore=["self"])
    def _languages(self, *, is_source: bool) -> dict:
        """
        Get languages supported by the API.

        NOTE(review): the return annotation says dict, but the callers in this
        class iterate the result as a sequence of dicts that each have a
        "language" key -- confirm the actual response shape.

        :param is_source: Flag to query for supported source-languages.
        :return: Languages supported in translations by type (source or
            target).
        """
        return self._post(
            "languages", data={"type": "source" if is_source else "target"}
        )

    def _to_db_language(self, deepl_lang: dict, english_code: str) -> Language | None:
        """
        Turn one language entry of the DeepL API into a database Language.

        :param deepl_lang: Entry from the API's language listing. Its
            "language" value is the language code.
        :param english_code: Code to look up in place of the generic "en",
            because DeepL's source languages only include English (EN) and
            not regional variants.
        :return: The result of the database lookup, or None if the entry has
            no "language" key or the lookup raised a LookupError.
        """
        try:
            code = langcodes.get(deepl_lang["language"]).to_tag()
            if code.lower() == "en":
                code = english_code
            return Language.query_by_code(code)
        except LookupError:
            return None

    def preprocess(self, elem: TranslateApproval) -> None:
        """
        Protect the text inside element from mangling in translation by adding
        XML-tags.

        Only NoTranslate elements are changed; other elements are left as
        they are.

        :param elem: The element to add XML-protection-tags to.
        :return: None. The tag is added to the input object.
        """
        # TODO If the protection tag is found in the content text, somehow
        #  encode such tag first.
        if isinstance(elem, NoTranslate):
            elem.text = f"<{self.ignore_tag}>{elem.text}</{self.ignore_tag}>"

    def postprocess(self, text: str) -> str:
        """
        Remove unnecessary protection tags from the text and change defined
        aliases back to Markdown syntax.

        :param text: The text returned from DeepL API after translation.
        :return: Text with the needed operations performed to more closely
            match the text before passing it to DeepL API.
        """
        return (
            replace_md_aliases(text)
            .replace(f"<{self.ignore_tag}>", "")
            .replace(f"</{self.ignore_tag}>", "")
        )

    def translate(
        self,
        texts: list[TranslateBlock],
        source_lang: Language | None,
        target_lang: Language,
        tag_handling: str = "xml",
    ) -> list[str]:
        """
        Use the DeepL API to translate text between languages.

        When ``tag_handling`` is "xml" the elements of ``texts`` are modified
        in place by ``preprocess``.

        :param texts: Some set of texts to be translated.
        :param source_lang: Language of input text. None value makes DeepL
            guess it from the text.
        :param target_lang: Language for target language.
        :param tag_handling: See comment in superclass.
        :return: List of strings in target language with the non-translatable
            parts intact.
        :raises RouteException: If DeepL API returned an error.
        """
        # Send uppercase, because it is used in DeepL documentation. The
        # source language is optional, so None must pass through untouched
        # instead of being uppercased.
        source_lang_code = source_lang.lang_code.upper() if source_lang else None
        target_lang_code = target_lang.lang_code.upper()

        # Add XML-tag -protection to the elements that need it.
        if tag_handling == "xml":
            for block in texts:
                for elem in block:
                    self.preprocess(elem)

        # Combine the strings of each block for maximum-effectiveness of the
        # translation-call.
        protected_texts = ["".join(elem.text for elem in block) for block in texts]

        # Translate texts 50 at a time to match DeepL-spec:
        # "Up to 50 text parameters can be submitted in one request."
        # https://www.deepl.com/docs-api/translating-text/large-volumes/
        batch_size = 50
        # Initialize the session for parallel translate-calls.
        session = FuturesSession()
        translate_calls = [
            self._translate(
                session,
                protected_texts[start : start + batch_size],
                source_lang_code,
                target_lang_code,
                # "1" (for example) keeps original document's empty newlines.
                split_sentences="1",
                # NOTE preserve_formatting=1 might remove punctuation even
                #  though DeepL should not make guesses of the content.
                preserve_formatting="0",
                tag_handling=tag_handling,
                ignore_tags=[self.ignore_tag],
            )
            for start in range(0, len(protected_texts), batch_size)
        ]

        # Wait for the parallel calls to finish and get their results in
        # order.
        translation_resps = []
        for call in translate_calls:
            # TODO Handle exceptions raised in the error handling.
            resp_json = self._handle_post_response(call.result())
            translation_resps += resp_json["translations"]

        # The protection tags were only added with XML-handling, so they are
        # only cleaned away in that case.
        if tag_handling == "xml":
            return [self.postprocess(resp["text"]) for resp in translation_resps]
        return [resp["text"] for resp in translation_resps]

    def usage(self) -> Usage:
        """
        Fetch current API usage of the registered key from DeepL.

        :return: Usage returned from DeepL.
        """
        resp_json = self._post("usage")
        return Usage(
            character_count=int(resp_json["character_count"]),
            character_limit=int(resp_json["character_limit"]),
        )

    def get_languages(self, source_langs: bool) -> list[Language]:
        """
        Fetches the source or target languages from DeepL.

        A generic English entry is looked up as "en-GB". For source languages
        the English entry is additionally looked up as "en-US" and that
        result is appended to the end of the list.

        As a side effect ``source_Language_code`` is left as "en-US" when
        source languages were fetched and as "en-GB" otherwise.

        :param source_langs: Whether source languages must be fetched
        :return: The list of source or target languages from DeepL that were
            found in the database.
        """
        # The attribute is still assigned, because other methods of this
        # class read it.
        self.source_Language_code = "en-GB"
        langs = self._languages(is_source=source_langs)
        return_langs = list(
            filter(None, (self._to_db_language(lang, "en-GB") for lang in langs))
        )

        if source_langs:
            self.source_Language_code = "en-US"
            # Start from None so that nothing is appended when the API did
            # not list English at all.
            en: Language | None = None
            for lang in langs:
                if lang.get("language", "").lower() == "en":
                    en = self._to_db_language(lang, "en-US")
            if en is not None:
                return_langs.append(en)

        return return_langs

    @cache.memoize(timeout=LANGUAGES_CACHE_TIMEOUT, args_to_ignore=["self"])
    def languages(self) -> LanguagePairing:
        """
        Asks the DeepL API for the list of supported languages and turns the
        returned language codes to Languages found in the database.

        Reads ``source_Language_code``, so that attribute must have been set
        (for example by ``supports`` or ``get_languages``) before the call.

        NOTE(review): the result is memoized with ``self`` ignored, so the
        code used for English presumably stays at whatever
        ``source_Language_code`` was during the first uncached call until the
        cache times out -- confirm this is intended.

        :return: Dictionary of source langs to lists of target langs, that are
            supported by the API and also found in database.
        """
        # Query API for supported source and target languages and transform
        # them into the return type.
        resp_json_src = self._languages(is_source=True)
        resp_json_target = self._languages(is_source=False)

        db_langs_src: list[Language] = list(
            filter(
                None,
                (
                    self._to_db_language(lang, self.source_Language_code)
                    for lang in resp_json_src
                ),
            )
        )
        db_langs_target: list[Language] = list(
            filter(
                None,
                (
                    self._to_db_language(lang, self.source_Language_code)
                    for lang in resp_json_target
                ),
            )
        )

        # Every source language is paired with the same list of targets.
        langs_map = {lang.lang_code: db_langs_target for lang in db_langs_src}
        return LanguagePairing(langs_map)

    def supports(self, source_lang: Language, target_lang: Language) -> bool:
        """
        Check that the source language can be translated into target language
        by the translation API.

        Stores the source language's code into ``source_Language_code``.

        :param source_lang: Language to check the translation capability from.
        :param target_lang: Language to check the translation capability into.
        :return: True, if the pairing is supported.
        :raises RouteException: If the source language is not among the
            supported source languages.
        """
        self.source_Language_code = source_lang.lang_code

        try:
            supported_languages: list[Language] = self.languages()[
                self.source_Language_code
            ]
        except KeyError as e:
            raise RouteException(
                f"The language code {e} was not found in supported source languages."
            ) from e

        # The target language is found by the primary key.
        # TODO is this too much? Can't strings be just as good?
        #  Maybe better would be to handle Languages by their database id's?
        return any(x.lang_code == target_lang.lang_code for x in supported_languages)

    def supports_tag_handling(self, tag_type: str) -> bool:
        """
        Check if DeeplTranslationService supports a tag-handling.

        :param tag_type: The tag-type to check handling for.
        :return: True if the tag-type is supported.
        """
        return tag_type in ("xml", "html")

    # TODO Make the value an enum like with Verification?
    __mapper_args__ = {"polymorphic_identity": "DeepL Free"}
class DeeplProTranslationService(DeeplTranslationService):
    """
    Translation service using the DeepL API Pro.

    Inherits everything from DeeplTranslationService and only overrides the
    polymorphic identity, which is "DeepL Pro".
    """

    # TODO Make the value an enum like with Verification?
    __mapper_args__ = dict(polymorphic_identity="DeepL Pro")