Source code for timApp.markdown.markdownconverter

"""Provides functions for converting markdown-formatted text to HTML."""
from __future__ import annotations

from dataclasses import dataclass
from datetime import date, datetime, timedelta
from typing import TYPE_CHECKING, Iterable, Any
from urllib.parse import quote_plus

from jinja2 import TemplateSyntaxError
from lxml import html, etree
from sqlalchemy.orm import load_only, lazyload

from timApp.document.usercontext import UserContext
from timApp.document.viewcontext import ViewContext, default_view_ctx
from timApp.document.yamlblock import YamlBlock
from timApp.markdown.autocounters import (
    AutoCounters,
    TimSandboxedEnvironment,
    HEADING_TAGS,
    add_h_values,
    check_autonumber_error,
)
from timApp.markdown.dumboclient import call_dumbo
from timApp.util.utils import get_error_html, title_to_id
from timApp.util.utils import widen_fields
from tim_common.html_sanitize import sanitize_html, presanitize_html_body

if TYPE_CHECKING:
    from timApp.document.docparagraph import DocParagraph
    from timApp.document.docsettings import DocSettings
    from timApp.document.document import Document


def has_macros(text: str, env: TimSandboxedEnvironment):
    """Tell whether ``text`` contains any template start delimiter of ``env``.

    :param text: The text to inspect.
    :param env: Environment whose variable, comment and block start strings
        are searched for.
    :return: True if at least one of the three start strings occurs in the text.
    """
    if env.variable_start_string in text:
        return True
    if env.comment_start_string in text:
        return True
    return env.block_start_string in text
# ------------------------ Jinja filters -------------------------------------------------------------------
# See https://tim.jyu.fi/view/tim/ohjeita/satunnaistus#timfiltterit
# To create a new filter,
#  1. make a class or function
#  2. and map it in tim_filters (which create_environment installs into the environment)
def genfields(flds, attrs="", stemfield="stem"):
    """
    Generate TIM field markup from a name list like ['se1', 'd1', 'd2=demo2'].

    See use cases in: /tim/timApp/tests/server/test_genfields.py

    The field list is first passed through ``widen_fields``. Each resulting
    entry is rendered as ``{#<id> <stemfield>: '<text>'<attrs>#}``, where
    ``<text>`` is the alias after ``=`` if one is given, otherwise the part of
    the id before the first ``:``.

    :param flds: list of fields, possibly with aliases to show in the stem
    :param attrs: optional attribute string appended (after ", ") to every field
    :param stemfield: attribute used to show the field text, e.g. stem, header or inputstem
    :return: the fields in TIM format, concatenated into one string
    """
    suffix = ", " + attrs if attrs else attrs
    rendered = []
    for field in widen_fields(flds):
        pieces = field.split("=")
        field_id = pieces[0].strip()
        label = pieces[1].strip() if len(pieces) > 1 else field_id.split(":")[0]
        rendered.append(f"{{#{field_id} {stemfield}: '{label}'{suffix}#}}")
    return "".join(rendered)
def gfrange(s, i1, i2, attrs="", stemfield="stem"):
    """
    Expand one field template over an index range and render it with genfields.

    Only the part of ``s`` before the first ``;`` is expanded; anything after
    that ``;`` is appended unchanged to the expanded list. An index
    placeholder ``{0}`` is inserted automatically where the template has no
    ``{`` of its own:

    - ``a:b``   => ``a{0}:b``
    - ``a=b``   => ``a{0}=b{0}``
    - ``a:b=c`` => ``a{0}:b=c{0}``

    :param s: field template, optionally followed by ``;`` and further fields
    :param i1: first index
    :param i2: last index (the range runs downwards when i1 > i2)
    :param attrs: attribute string passed on to genfields
    :param stemfield: stem field name passed on to genfields
    :return: generated fields, or "" when the template has no name part
    """
    template, _, remainder = s.partition(";")

    colon_at = template.find(":")
    equals_at = template.find("=")

    # The name part ends at whichever of ":" / "=" comes first, or at the end.
    split_at = len(template)
    if colon_at >= 0:
        split_at = colon_at
    if equals_at >= 0 and (equals_at < colon_at or colon_at < 0):
        split_at = equals_at
    if split_at < 1:
        return ""

    name_part, tail_part = template[:split_at], template[split_at:]
    name_index = "" if "{" in name_part else "{0}"
    tail_index = "" if "{" in tail_part else "{0}"

    pattern = name_part + name_index + tail_part
    if equals_at >= 0:
        # Only an alias (after "=") gets its own index suffix.
        pattern += tail_index
    pattern += ";"

    step = -1 if i1 > i2 else 1
    expanded = srange(pattern, i1, i2, step)
    return genfields(expanded + remainder, attrs, stemfield)
def srange(s, i1, i2, step=1, *argv):
    """
    Jinja2 filter for generating indexed names.

    :param s: format string for one item; ``{0}`` is the index and ``{1}``,
        ``{2}``, ... are the derived values described under ``argv``
    :param i1: start index
    :param i2: end index, inclusive
    :param step: increment between indices; 0 is treated as 1
    :param argv: flat sequence of (add, mul) pairs; each pair yields the extra
        format argument ``mul * index + add`` (``mul`` defaults to 1 when the
        last pair has no second value)
    :return: like "d1 d2 d3 " for the call srange('d{0} ', 1, 3)
    """
    if step == 0:
        step = 1
    # range() excludes its stop value, so move the stop one past i2 in the
    # direction of travel to make i2 inclusive.
    stop = i2 + (-1 if step < 0 else 1)

    pairs = [
        (argv[j], argv[j + 1] if j + 1 < len(argv) else 1)
        for j in range(0, len(argv), 2)
    ]
    return "".join(
        s.format(i, *(mul * i + add for add, mul in pairs))
        for i in range(i1, stop, step)
    )
# noinspection PyPep8Naming
def Pz(i):
    """
    Render a number as a signed term for use inside an expression.

    Zero gives "", a positive number gives e.g. " + 1" and a negative number
    gives e.g. " - 1".

    :param i: number to convert
    :return: the number as a string suitable for appending to an expression
    """
    if i < 0:
        return " - " + str(-i)
    if i > 0:
        return " + " + str(i)
    return ""
@dataclass
class Belongs:
    """Answers "does the logged-in user belong to group X?" with per-instance caching."""

    user_ctx: UserContext

    def __post_init__(self):
        # Plain attribute (not a dataclass field): group name -> cached answer.
        self.cache = {}

    def belongs_to_group(self, groupname: str):
        """Return True if the logged user of ``user_ctx`` has a group named ``groupname``.

        The answer is computed once per group name and then served from ``cache``.

        :param groupname: Name of the group to look for.
        :return: Whether any of the user's groups has that name.
        """
        cached = self.cache.get(groupname)
        if cached is None:
            cached = any(
                group.name == groupname
                for group in self.user_ctx.logged_user.groups
            )
            self.cache[groupname] = cached
        return cached
def week_to_date(week_nr, daynr=1, year=None, frmt=None):
    """
    Return the date of a given weekday in a given ISO week.

    see: timApp/tests/unit/test_datefilters.py

    :param week_nr: ISO week number (int or numeric string); a value <= 0
        means the current week
    :param daynr: ISO day of week, 1 = Monday ... 7 = Sunday
    :param year: year of the week; when falsy, the current ISO year is used
        for the current week and the current calendar year otherwise
    :param frmt: extended format string (see fmt_date); None returns the date itself
    :return: date object, or formatted string when ``frmt`` is given
    """
    # Convert before comparing so that numeric strings coming from templates
    # do not fail on "str <= int".
    week_nr = int(week_nr)
    daynr = int(daynr)

    today = date.today()
    if week_nr <= 0:
        iso_today = today.isocalendar()
        week_nr = iso_today[1]
        # Around New Year the ISO year of the current week can differ from the
        # calendar year, so the two must be taken from the same source.
        default_year = iso_today[0]
    else:
        default_year = today.year
    year = int(year) if year else default_year

    d = date.fromisocalendar(year, week_nr, daynr)
    if frmt is None:
        return d
    return fmt_date(d, frmt)
def month_to_week(month, daynr=1, year=None):
    """
    Return the ISO week number of a given day of a month.

    see: timApp/tests/unit/test_datefilters.py

    :param month: month number starting from 1 (int or numeric string); a
        value <= 0 means the current month
    :param daynr: day number within the month
    :param year: year to use; the current year when falsy
    :return: ISO week number of that date
    """
    daynr = int(daynr)
    # Convert before comparing so that numeric strings do not fail on "str <= int".
    month = int(month)

    today = date.today()
    if month <= 0:
        month = today.month
    year = int(year) if year else today.year

    return date(year, month, daynr).isocalendar()[1]
def now(frmt=0):
    """
    Current time, either shifted by a number of days or formatted.

    Used in Jinja macros like tomorrow: %%1 | now%%
    Or this week: %% "%w" | now %%

    :param frmt: an int is a day offset added to the current time; anything
        else is converted to str and used as a format for fmt_date
    :return: datetime (current time + offset) for an int, otherwise the
        formatted current time
    """
    current = datetime.now()
    if not isinstance(frmt, int):
        return fmt_date(current, str(frmt))
    return current + timedelta(days=frmt)
def fmt_date(d, frmt=""):
    """
    Format a date with strftime, extended by %d1 and %m1.

    ``%d1`` and ``%m1`` are replaced by the day and month without zero
    padding before the rest is handed to ``strftime``. An empty format gives
    "<day>.<month>".

    see: timApp/tests/unit/test_datefilters.py

    :param d: date (or datetime) to format
    :param frmt: Python strftime format, optionally with %d1 / %m1
    :return: the formatted string
    """
    day_text = str(d.day)
    month_text = str(d.month)
    if frmt == "":
        return day_text + "." + month_text
    expanded = frmt.replace("%d1", day_text).replace("%m1", month_text)
    return d.strftime(expanded)
def week_to_text(
    week_nr, year=None, frmt=" %d1.%m1|", days="ma|ti|ke|to|pe|", first_day=1
):
    """
    Convert a week to calendar header format.

    Every ``|``-terminated piece of ``days`` is followed by the formatted date
    of the corresponding weekday, starting from ``first_day``.

    see: timApp/tests/unit/test_datefilters.py

    :param week_nr: ISO week to convert (int or numeric string); a value <= 0
        means the current week
    :param year: year of the week; when falsy, the current ISO year is used
        for the current week and the current calendar year otherwise
    :param frmt: extended Python date format (see fmt_date)
    :param days: pipe separated list of day names
    :param first_day: weekday number to start from (1 = Monday)
    :return: string suitable for a calendar header
    """
    # Convert before comparing so that numeric strings do not fail on "str <= int".
    week_nr = int(week_nr)
    today = date.today()
    if week_nr <= 0:
        iso_today = today.isocalendar()
        week_nr = iso_today[1]
        # The current ISO week must be paired with its ISO year, which differs
        # from the calendar year on some days around New Year.
        default_year = iso_today[0]
    else:
        default_year = today.year
    year = int(year) if year else default_year

    s = ""
    beg = 0
    daynr = first_day
    first_empty = False
    while True:
        end = days.find("|", beg)
        if end == 0:
            # The list starts with a separator (only possible on the first pass).
            first_empty = True
        if end < 0:
            # No more separators: copy the remaining text verbatim and stop.
            s += days[beg:]
            break
        ds = fmt_date(week_to_date(week_nr, daynr, year), frmt)
        s += days[beg:end] + ds
        beg = end + 1
        daynr += 1
        if daynr > 7:
            if first_empty:  # starts with separator, we need the last
                end = days.find("|", beg)
                if end < 0:
                    end = len(days)
                s += days[beg:end]
            break
    return s
def postinc(v, delta=1):
    """Add ``delta`` to ``v[0]`` in place and return the value it had before.

    :param v: mutable sequence whose first item is the counter
    :param delta: amount to add
    :return: the first item as it was before the increment
    """
    previous = v[0]
    v[0] += delta
    return previous
def preinc(v, delta=1):
    """Add ``delta`` to ``v[0]`` in place and return the new value.

    :param v: mutable sequence whose first item is the counter
    :param delta: amount to add
    :return: the first item after the increment
    """
    v[0] += delta
    updated = v[0]
    return updated
# ------------------------ Jinja filters end ---------------------------------------------------------------
def expand_macros(
    text: str,
    macros,
    settings: DocSettings | None,
    env: TimSandboxedEnvironment,
    ignore_errors: bool = False,
):
    """Expand character macros, global macros and Jinja macros in ``text``.

    Steps, in order:

    1. Character macros from ``settings`` are applied as plain string replacements.
    2. If the environment has counters, their character macros are applied.
    3. If the text then contains none of the environment's start delimiters
       (see ``has_macros``), it is returned as is.
    4. Global macros from ``settings`` are substituted literally for
       ``%%name%%``, and the value of the global macro ``ADDFOREVERY`` (if
       non-empty) is prepended to the text on its own line.
    5. A block starting with the comment start string followed by ``LOCAL``
       is parsed as YAML and its values are merged over ``macros``.
    6. The text is rendered as a template with the resulting macros.

    :param text: The text to expand.
    :param macros: Mapping of macro names to values used when rendering.
    :param settings: Document settings supplying char and global macros, or None.
    :param env: The template environment to render with.
    :param ignore_errors: If True, the text (as modified up to the point of
        failure) is returned on error instead of an error message.
    :return: The expanded text, or the HTML produced by ``get_error_html`` if
        expansion fails and ``ignore_errors`` is False.
    """
    # return text  # comment out when want to take time if this slows things
    charmacros = settings.get_charmacros() if settings else None
    if charmacros:
        for cm_key, cm_value in charmacros.items():
            text = text.replace(cm_key, cm_value)
    if env.counters:
        text = env.counters.do_char_macros(text)
    # Nothing left that the template engine would react to: skip rendering.
    if not has_macros(text, env):
        return text
    try:
        globalmacros = settings.get_globalmacros() if settings else None
        if globalmacros:
            for gmacro in globalmacros:
                # Global macros always use the literal "%%" delimiter here,
                # independent of the delimiter configured in env.
                macrotext = "%%" + gmacro + "%%"
                pos = text.find(macrotext)
                if pos >= 0:
                    gm = str(globalmacros.get(gmacro, ""))
                    text = text.replace(macrotext, gm)
            gm = str(globalmacros.get("ADDFOREVERY", ""))
            if gm:
                text = gm + "\n" + text
        # Look for a local macro block: <comment start>LOCAL ... <comment end>
        startstr = env.comment_start_string + "LOCAL"
        beg = text.find(startstr)
        if beg >= 0:
            endstr = env.comment_end_string
            end = text.find(endstr, beg)
            if end >= 0:
                local_macros_yaml = text[beg + len(startstr) : end]
                local_macros = YamlBlock.from_markdown(local_macros_yaml).values
                # Local values override the macros passed in; the caller's
                # mapping itself is not modified.
                macros = {**macros, **local_macros}
        # TODO: should local macros be used in counters???
        if env.counters:
            env.counters.start_of_block()
        conv = env.from_string(text).render(macros)
        # NOTE(review): the nesting of the is_plugin reset under this condition
        # was reconstructed from whitespace-collapsed source - confirm.
        if env.counters and env.counters.need_update_labels:
            conv = env.counters.update_labels(conv)
            env.counters.is_plugin = False
        return conv
    except TemplateSyntaxError as e:
        if not ignore_errors:
            # Prefer the more specific autonumber error message when there is one.
            err = check_autonumber_error(e.message)
            if err is not None:
                return get_error_html(err)
            return get_error_html(f"Syntax error in macro template: {e}")
        return text
    except Exception as e:
        if not ignore_errors:
            # traceback.print_exc()
            return get_error_html(f"Error in expanding macros: {e}")
        return text
def belongs_placeholder(_s):
    """Stand-in for the ``belongs`` filter that only reports an error.

    This is the ``belongs`` entry of ``tim_filters``.

    :param _s: The filter input; ignored.
    :return: Error HTML stating that ``nocache=true`` is required.
    """
    message = "The belongs filter requires nocache=true attribute."
    return get_error_html(message)
def fmt(x, f: str):
    """Format ``x`` with the format specification ``f`` using built-in ``format``.

    :param x: The value to format.
    :param f: A format specification such as ".2f".
    :return: The formatted string.
    """
    formatted = format(x, f)
    return formatted
def get_document_id(doc_path: Any) -> int:
    """Look up the id of the document at ``doc_path``.

    :param doc_path: Path of the document; anything but a str yields 0.
    :return: The id of the document found by ``DocEntry.find_by_path``, or 0
        if the path is not a string or no document is found.
    """
    from timApp.document.docentry import DocEntry

    if isinstance(doc_path, str):
        entry = DocEntry.find_by_path(
            doc_path,
            docentry_load_opts=[
                load_only(DocEntry.id),
                lazyload(DocEntry._block),
            ],
        )
        if entry:
            return entry.id
    return 0
def get_document_path(doc_id: Any) -> str:
    """Look up the path of the document with the given id.

    :param doc_id: Document id as an int or a numeric string.
    :return: The path of the document found by ``DocEntry.find_by_id``, or ""
        if the id is not an int/str, is not numeric, or no document is found.
    """
    if not isinstance(doc_id, (int, str)):
        return ""
    try:
        doc_id_num = int(doc_id)
    except ValueError:
        # A non-numeric string is just another invalid id, not an error.
        return ""

    # Imported inside the function, as in the original code; input validation
    # is done first so invalid ids never trigger the import or a query.
    from timApp.document.docentry import DocEntry

    doc = DocEntry.find_by_id(
        doc_id_num,
        docentry_load_opts=[load_only(DocEntry.name), lazyload(DocEntry._block)],
    )
    return doc.path if doc else ""
def url_quote(s: Any) -> str:
    """URL-quote a string with ``urllib.parse.quote_plus``.

    :param s: The value to quote.
    :return: The quoted string, or "" if ``s`` is not a str.
    """
    if not isinstance(s, str):
        return ""
    return quote_plus(s)
# Filter name (as written in templates) -> implementation.
# create_environment installs these into every environment it builds.
tim_filters = {
    "Pz": Pz,
    "gfields": genfields,
    "gfrange": gfrange,
    "srange": srange,
    "now": now,
    "w2date": week_to_date,
    "m2w": month_to_week,
    "w2text": week_to_text,
    "fmtdate": fmt_date,
    "preinc": preinc,
    "postinc": postinc,
    # Placeholder; create_environment replaces it when a user context is given.
    "belongs": belongs_placeholder,
    "fmt": fmt,
    "docid": get_document_id,
    "docpath": get_document_path,
    "urlquote": url_quote,
}
def create_environment(
    macro_delimiter: str,
    user_ctx: UserContext | None,
    view_ctx: ViewContext,
    macros: dict | None,
    doc: Document | None = None,
) -> TimSandboxedEnvironment:
    """Build a sandboxed template environment with the TIM filters installed.

    :param macro_delimiter: Delimiter passed to ``TimSandboxedEnvironment``.
    :param user_ctx: If given, the ``belongs`` filter is bound to this user;
        otherwise the placeholder from ``tim_filters`` stays in place.
    :param view_ctx: Supplies the ``isview`` filter.
    :param macros: If non-empty, auto counters are created from these macros
        and attached to the environment.
    :param doc: Document passed to ``AutoCounters``.
    :return: The configured environment.
    """
    environment = TimSandboxedEnvironment(macro_delimiter)
    environment.filters.update(tim_filters)
    environment.filters["isview"] = view_ctx.isview

    if macros:
        environment.set_counters(AutoCounters(macros, doc))

    # used in print.py
    if user_ctx:
        group_checker = Belongs(user_ctx)
        environment.filters["belongs"] = group_checker.belongs_to_group

    return environment
def md_to_html(
    text: str, sanitize: bool = True, macros: dict[str, object] | None = None
) -> str:
    """Converts the specified markdown text to HTML.

    :param macros: The macros to use. None is treated as "no macros".
    :param sanitize: Whether the HTML should be sanitized. Default is True.
    :param text: The text to be converted.
    :return: A HTML string.
    """
    env = create_environment(
        "%%", user_ctx=None, view_ctx=default_view_ctx, macros=macros
    )
    # The template is rendered with the macros as its context, and rendering
    # cannot build a context from None. Without this, any text containing the
    # delimiter would turn into an "Error in expanding macros" box when the
    # caller relies on the default macros=None.
    text = expand_macros(
        text,
        macros if macros is not None else {},
        settings=None,
        env=env,
    )
    raw = call_dumbo([text])
    if sanitize:
        return sanitize_html(str(raw[0]))
    return raw[0]
def _as_verbatim_block(md: str) -> str:
    """Fence ``md`` as a code block unless it starts with a fence or a ``#``."""
    if md.startswith(("```", "#")):
        return md
    return "```\n" + md + "\n```"


def par_list_to_html_list(
    pars: list[DocParagraph],
    settings: DocSettings,
    view_ctx: ViewContext,
    auto_macros: Iterable[dict] | None = None,
):
    """Converts the specified list of DocParagraphs to an HTML list.

    :param view_ctx: The view context used to obtain the macro info.
    :return: A list of HTML strings.
    :param settings: The document settings.
    :param auto_macros: Currently a list(dict) containing the heading information ('h': dict(int,int) of heading counts
           and 'headings': dict(str,int) of so-far used headings and their counts).
    :param pars: The list of DocParagraphs to be converted.
    """
    macroinfo = settings.get_macroinfo(view_ctx)
    # User-specific macros (such as %%username%% and %%realname%%) cannot be replaced here because the result will go
    # to global cache. We will replace them later (in post_process_pars).
    macroinfo.preserve_user_macros = True

    dumbo_opts = settings.get_dumbo_options()
    # A paragraph with its own Dumbo options is sent as a dict carrying the
    # markdown under "content"; all others are sent as plain strings.
    texts = [
        p.get_expanded_markdown(macroinfo)
        if not p.has_dumbo_options()
        else {
            "content": p.get_expanded_markdown(macroinfo),
            **p.get_dumbo_options(base_opts=dumbo_opts).dict(),
        }
        for p in pars
    ]

    texplain = settings.is_texplain()
    textplain = settings.is_textplain()
    if texplain or textplain:
        # add pre-markers to tex paragraphs
        for i, item in enumerate(texts):
            if isinstance(item, dict):
                # Wrap the markdown inside the dict; calling str methods on
                # the dict itself would raise AttributeError.
                texts[i] = {**item, "content": _as_verbatim_block(item["content"])}
            else:
                texts[i] = _as_verbatim_block(item)

    raw = call_dumbo(texts, options=dumbo_opts)

    # Edit html after dumbo
    raw = edit_html_with_own_syntax(raw)

    if auto_macros:
        processed = []
        for pre_html, m, attrs in zip(raw, auto_macros, (p.get_attrs() for p in pars)):
            if "nonumber" in attrs.get("classes", []):
                # Paragraphs with the "nonumber" class keep their headings untouched.
                final_html = pre_html
            else:
                final_html = insert_heading_numbers(
                    pre_html,
                    m,
                    settings.auto_number_headings() > 0,
                    settings.heading_format(),
                    initial_heading_counts=settings.auto_number_start(),
                )
            processed.append(final_html)
        raw = processed

    return raw
# Does changes to html after Dumbo and returns edited html
def edit_html_with_own_syntax(raw: list) -> list:
    """Apply TIM's own post-Dumbo syntax (slide fragments) to every HTML string.

    The list is modified in place and also returned.

    :param raw: HTML strings to process.
    :return: The same list object with each item converted by make_slide_fragments.
    """
    for position, html_text in enumerate(raw):
        raw[position] = make_slide_fragments(html_text)
    return raw
# Adds the necessary html to make slide fragments work with reveal.js
def make_slide_fragments(html_text: str) -> str:
    """Turn the fragment markers in ``html_text`` into reveal.js fragment markup.

    Text after the escaped area start marker ``&lt;§`` up to the first
    ``§&gt;`` becomes a ``<div class="fragment">``; pieces delimited by ``§§``
    are handed to change_classes_to_fragment. The text before the first area
    start marker is left untouched when at least one area marker exists.

    :param html_text: HTML to process.
    :return: The HTML with fragment markup applied.
    """
    # TODO: Make algorithm work with more than 2 levels of fragments
    # TODO: Make different styles of fragments available, possible syntax could be §§{shrink} or something
    # TODO: Refactor to make this more reusable
    # TODO: Make sure that this doesn't break latex conversion

    area_parts = html_text.split("&lt;§")
    if len(area_parts) < 2:
        # No fragment areas at all: only look for §§ fragment pieces.
        return check_and_edit_html_if_surrounded_with(
            html_text, "§§", change_classes_to_fragment
        )

    converted = [area_parts[0]]
    for part in area_parts[1:]:
        if part.find("§&gt;") != -1:
            # Start and end both found: open a fragment div here and close it
            # at the first end marker.
            part = '</p><div class="fragment"><p>' + part
            part = part.replace("§&gt;", "</p></div><p>", 1)
        # With or without an area, inner §§ fragments are converted as well.
        converted.append(
            check_and_edit_html_if_surrounded_with(
                part, "§§", change_classes_to_fragment
            )
        )
    return "".join(converted)
# Checks if html element's content is surrounded with given string and edits it accordingly
def check_and_edit_html_if_surrounded_with(
    html_content: str, string_delimeter: str, editing_function
) -> str:
    """Split the HTML at a delimiter and let ``editing_function`` rebuild it.

    :param html_content: HTML to inspect.
    :param string_delimeter: Delimiter to split at.
    :param editing_function: Called with the list of split pieces when the
        delimiter occurs at least once; its result is returned.
    :return: The edited HTML, or ``html_content`` unchanged when the delimiter
        does not occur.
    """
    pieces = html_content.split(string_delimeter)
    if len(pieces) >= 2:
        return editing_function(pieces)
    return html_content
def change_classes_to_fragment(html_list: list) -> str:
    """Mark every odd-indexed piece of ``html_list`` as a fragment and join the pieces.

    ``html_list`` is HTML split at the fragment delimiter, so items 1, 3, 5, ...
    hold fragment content and the item just before each one may end with the
    tag that should receive the "fragment" class. The list is modified in place.

    :param html_list: Pieces of HTML split at the fragment delimiter.
    :return: The pieces joined back into one HTML string.
    """
    for content_at in range(1, len(html_list), 2):
        tag_at = content_at - 1
        html_list[tag_at], html_list[content_at] = change_class(
            html_list[tag_at], html_list[content_at], "fragment"
        )
    return "".join(html_list)
def change_class(
    text_containing_html_tag: str, text_content: str, new_class: str
) -> list:
    """Attach ``new_class`` to the element that directly encloses ``text_content``.

    If ``text_containing_html_tag`` ends with an opening tag, the class is
    added to that tag (prepended to an existing ``class`` attribute, or as a
    new attribute). In every other case - the text is empty, does not end
    with a tag, or ends with a closing, self-closing or ``<!`` tag - the
    content is wrapped in a ``<span>`` carrying the class instead.

    :param text_containing_html_tag: HTML that precedes the content.
    :param text_content: The content to be marked.
    :param new_class: The class name to add.
    :return: Two-item list ``[preceding html, content]`` after the change.
    """
    tag_end = text_containing_html_tag.rfind(">")
    tag_start = (
        text_containing_html_tag.rfind("<", 0, tag_end) if tag_end >= 0 else -1
    )
    # Explicit ">= 0" checks matter: for an empty prefix rfind gives -1, which
    # would otherwise equal len(...) - 1 and be mistaken for "ends with a tag".
    ends_with_tag = (
        tag_start >= 0 and tag_end == len(text_containing_html_tag) - 1
    )
    # The tag text without its closing ">", e.g. '<p class="a"'.
    html_tag = text_containing_html_tag[tag_start:tag_end] if ends_with_tag else ""
    is_opening_tag = (
        ends_with_tag
        and not html_tag.startswith(("</", "<!"))
        and not html_tag.rstrip().endswith("/")
    )

    if not is_opening_tag:
        wrapped = '<span class="' + new_class + '">' + text_content + "</span>"
        return [text_containing_html_tag, wrapped]

    if "class=" in html_tag:
        # Insert right after the opening quote of the last class attribute.
        insert_at = tag_start + html_tag.rfind("class=") + len('class="')
        text_containing_html_tag = (
            text_containing_html_tag[:insert_at]
            + new_class
            + " "
            + text_containing_html_tag[insert_at:]
        )
    else:
        # No class attribute yet: add one just before the closing ">".
        text_containing_html_tag = (
            text_containing_html_tag[:tag_end]
            + ' class="'
            + new_class
            + '"'
            + text_containing_html_tag[tag_end:]
        )
    return [text_containing_html_tag, text_content]
def insert_heading_numbers(
    html_str: str,
    heading_info,
    auto_number_headings: int | bool = True,
    heading_format: dict | None = None,
    initial_heading_counts: dict[int, int] | None = None,
):
    """Applies the given heading_format to the HTML if it is a heading, based on the given heading_info. Additionally
    corrects the id attribute of the heading in case it has been used earlier.

    :param heading_info: A dict containing the heading information ('h': dict(int,int) of heading counts
           and 'headings': dict(str,int) of so-far used headings and their counts).
    :param html_str: The HTML string to be processed.
    :param auto_number_headings: Whether the headings should be formatted at all.
    :param heading_format: A dict(int,str) of the heading formats to be used.
           NOTE(review): format_heading indexes this directly, so it presumably must be
           provided whenever auto_number_headings is truthy - confirm.
    :param initial_heading_counts: Initial heading counter value for each level; None means no initial values.
    :return: The HTML with the formatted headings, as serialized by ``etree.tostring``
           (NOTE(review): lxml returns bytes by default - confirm callers expect that).
    """
    if initial_heading_counts is None:
        # format_heading looks levels up with .get(), which None does not have.
        initial_heading_counts = {}

    tree = html.fragment_fromstring(presanitize_html_body(html_str), create_parent=True)
    counts = heading_info["h"]
    used = heading_info["headings"]
    for e in tree.iterchildren():
        is_heading = e.tag in HEADING_TAGS
        if not is_heading:
            continue
        curr_id = title_to_id(e.text)
        hcount = used.get(curr_id, 0)
        if hcount > 0:
            # The same heading id was used before: make this one unique by
            # appending the number of earlier uses.
            try:
                e.attrib["id"] += "-" + str(hcount)
            except KeyError:
                e.set("id", f"{curr_id}-{hcount}")
        if auto_number_headings:
            e.text = format_heading(
                e.text or "",
                int(e.tag[1]),  # "h3" -> 3
                counts,
                heading_format,
                initial_counts=initial_heading_counts,
            )
    final_html = etree.tostring(tree)
    return final_html
def format_heading(
    text,
    level,
    counts,
    heading_format,
    heading_ref_format: dict | None = None,
    jump_name: str | None = None,
    counters: AutoCounters | None = None,
    initial_counts: dict[int, int] | None = None,
):
    """Advance the heading counters for ``level`` and format the heading text.

    ``counts[level]`` is incremented and the counters of all deeper levels (up
    to 6) are reset to their initial values. ``counts`` is modified in place.

    :param text: The heading text.
    :param level: The heading level of this heading.
    :param counts: Mapping of heading level to current count; updated in place.
    :param heading_format: Mapping of heading level to a format string; it is
        formatted with ``text`` and the values added by ``add_h_values``.
    :param heading_ref_format: Optional mapping of heading level to a format
        string for references to the heading.
    :param jump_name: Name under which the heading is registered as a counter.
    :param counters: Counters to register the heading in; registration only
        happens when ``heading_ref_format``, ``jump_name`` and ``counters``
        are all given.
    :param initial_counts: Values the deeper levels are reset to; levels not
        present (or all levels, when None) are reset to 0.
    :return: The formatted heading, or "[ERROR] " + text if formatting fails
        with KeyError, ValueError or IndexError.
    """
    if initial_counts is None:
        # The declared default must behave like "no initial values".
        initial_counts = {}

    counts[level] += 1
    for i in range(level + 1, 7):
        counts[i] = initial_counts.get(i, 0)
    values = {"text": text}
    add_h_values(counts, values)
    try:
        formatted = heading_format[level].format(**values)
        if heading_ref_format and jump_name and counters:
            formatted_ref = heading_ref_format[level].format(**values)
            counters.add_counter("chap", jump_name, formatted_ref, formatted)
    except (KeyError, ValueError, IndexError):
        formatted = "[ERROR] " + text
    return formatted