Source code for timApp.printing.pandoc_imagefilepathsfilter

#!/usr/bin/env python3

"""
Pandoc filter that converts image sources to LaTeX graphics source paths, taking
each image's location into account according to the following rules:

- If an image has an absolute path that points to the TIM machine, e.g. "http://<TIM-domain>/imagepath"
  or "<tim-domain>/imagepath", then....
- If an image has a relative path, e.g. "/images/1239854102", then....
- If an image points to a resource that resides at another host, simply convert the image
  to a simple link at the output. This is due to possible copyright infringements, as the images
  would otherwise be wrongfully copied to the output document.

TODO: BETTER DOCUMENTATION

"""
import imghdr
import os
import re
import tempfile
import urllib.request
from subprocess import check_output, STDOUT

from pandocfilters import toJSONFilter, RawInline, Image, Link, Str

from timApp.defaultconfig import FILES_PATH
from timApp.document.randutils import hashfunc

# Root directory of the TIM application.
APP_ROOT = "/service/timApp"

# Directory built from APP_ROOT, the configured FILES_PATH and "blocks".
IMAGE_ROOT = os.path.join(APP_ROOT, FILES_PATH, "blocks")

# protocol + hostname, read from the TIM_HOST environment variable
# (None when the variable is not set)
CURRENT_HOST_MACHINE = os.environ.get("TIM_HOST", None)

# Patterns of external hosts whose images may be downloaded; each entry is
# matched against the image URL with re.match. Filled from init_whitelist()
# when the module is run as a script.
ALLOWED_EXTERNAL_HOSTS = []

# File from which the whitelist patterns are read, one per line.
PRINTING_WHITELIST_FILE = os.path.join(APP_ROOT, ".printing_whitelist.config")

# Mapping from URL path prefixes ("url") to the directories ("dir") that
# handle_images substitutes for them; the first matching prefix is used.
urlmaps = [
    {"url": "/csstatic/", "dir": "/service/timApp/modules/cs/static/"},
    {"url": "/csgenerated/", "dir": "/service/timApp/modules/cs/generated/"},
    {"url": "/static/", "dir": "/service/timApp/static/"},
    {"url": "/images/", "dir": "/tim_files/blocks/images/"},
    {"url": "/files/", "dir": "/tim_files/blocks/files/"},
]


def init_whitelist():
    """Init whitelist for trusted image source domains.

    Reads ``PRINTING_WHITELIST_FILE`` (creating an empty file, and its parent
    directory, when it does not exist yet) and returns its entries.

    :return: List of the non-empty lines of the whitelist file with
             surrounding whitespace stripped. An empty list is returned when
             the file cannot be created or read.
    """
    if not os.path.exists(PRINTING_WHITELIST_FILE):
        # Both steps are best-effort and independent of each other: a failure
        # here simply results in an empty whitelist below.
        try:
            os.makedirs(os.path.dirname(PRINTING_WHITELIST_FILE), exist_ok=True)
        except OSError:
            pass

        try:
            with open(PRINTING_WHITELIST_FILE, "a"):
                pass
        except OSError:
            pass

    try:
        with open(PRINTING_WHITELIST_FILE) as f:
            entries = [line.strip() for line in f]
    except OSError:
        return []

    # Blank lines must be dropped: the entries are used as regular
    # expressions with re.match, and an empty pattern matches every URL,
    # which would silently disable the whitelist.
    return [entry for entry in entries if entry]


# Get the OS temp directory
TEMP_DIR_PATH = tempfile.gettempdir()
# Root directory under which downloaded external images are stored
DOWNLOADED_IMAGES_ROOT = os.path.join(TEMP_DIR_PATH, "tim-img-dls")

# Document id used as the download sub-directory name; set by handle_images
# from the "texdocid" entry of the pandoc metadata on first use.
texdocid = None


def convert_svg_to_pdf(image_path):
    """Convert an SVG image to a PDF file placed next to it.

    The SVG is wrapped in a temporary HTML page which headless Chrome prints
    to PDF. An existing PDF that is newer than the SVG is reused as is.

    :param image_path: Path of the SVG file.
    :return: Path of the PDF file (same location and base name as the SVG).
    :raises subprocess.CalledProcessError: If Chrome exits with an error.
    :raises OSError: If the temporary HTML file cannot be written or removed,
                     or Chrome cannot be started.
    """
    path = os.path.dirname(image_path)
    # Only swap the extension: str.replace(".svg", ...) would also rewrite
    # any ".svg" occurring earlier in the path (e.g. in a directory name).
    base, _ = os.path.splitext(image_path)
    temp = base + ".tmp.html"
    pdf = base + ".pdf"

    # Reuse a previously generated PDF that is still up to date.
    if os.path.isfile(pdf) and os.path.getmtime(pdf) > os.path.getmtime(image_path):
        return pdf

    # Make html to avoid headers and footers from PDF-image
    html = """<html>
  <head>
    <style>
body {
  margin: 0;
}
    </style>
    <script>
function init() {
  const element = document.getElementById('targetsvg');
  const positionInfo = element.getBoundingClientRect();
  const height = positionInfo.height;
  const width = positionInfo.width;
  const style = document.createElement('style');
  style.innerHTML = `@page {margin: 0; size: ${width}px ${height+1}px}`;
  document.head.appendChild(style);
}
window.onload = init;
    </script>
  </head>
  <body>
    <img id="targetsvg" src="SVGIMAGE">
  </body>
</html>
"""
    # string.format does not work because {} is needed for js
    html = html.replace("SVGIMAGE", image_path)
    with open(temp, "w") as f:
        f.write(html)

    cmd = [
        "/opt/google/chrome/chrome",
        "--no-sandbox",
        "--headless",
        "--disable-gpu",
        f"--print-to-pdf={pdf}",
        temp,
    ]
    try:
        # An empty dirname (bare file name) means "current directory";
        # subprocess would fail trying to chdir("") so pass None instead.
        check_output(cmd, stderr=STDOUT, cwd=path or None)
    finally:
        # Never leave the temporary HTML behind, even when Chrome fails.
        os.remove(temp)
    return pdf


def _map_url_path_to_file(path):
    """Map a URL path to a local file path using the ``urlmaps`` prefix table.

    :param path: Path component of the image URL.
    :return: The mapped file path, or "" when no prefix matches or the
             mapped path would leave the mapped directory.
    """
    for urlmap in urlmaps:
        prefix = urlmap.get("url")
        if not path.startswith(prefix):
            continue
        root = urlmap.get("dir")
        # Swap only the leading prefix; str.replace would also rewrite later
        # occurrences of the prefix inside the path.
        candidate = root + path[len(prefix):]
        # The URL comes from document content: refuse paths whose ".."
        # segments would escape the directory the prefix is mapped to.
        normalized = os.path.normpath(candidate)
        if not normalized.startswith(os.path.normpath(root) + os.sep):
            return ""
        return candidate
    return ""


def _download_external_image(url, url_path, meta):
    """Download (or reuse an earlier download of) an external image.

    :param url: Full URL of the image.
    :param url_path: Path component of ``url``; used to pick the file extension.
    :param meta: Pandoc document metadata; its "texdocid" entry names the
                 download sub-directory.
    :return: Local path of the image in UNIX form, or None when the image
             could not be downloaded or converted.
    """
    global texdocid  # check if we already have path for doc id
    if not texdocid:
        # if we do not have, get the path from meta data
        m = meta.get("texdocid", None)
        if m:
            texdocid = str(m.get("c", "xx"))

    # Fall back to the same placeholder used for a missing "c" value so that
    # absent metadata does not crash os.path.join with None.
    images_root = os.path.join(DOWNLOADED_IMAGES_ROOT, texdocid or "xx")
    # create folder for image dls, if it does not exist already
    os.makedirs(images_root, exist_ok=True)

    # download img to the folder and give the file a unique name (hash the url)
    img_uid = hashfunc(url)
    try:
        # Take the extension from the URL path only, so that a query string
        # or fragment does not end up in the file name.
        _, ext = os.path.splitext(url_path)
        img_dl_path = os.path.join(images_root, str(img_uid) + ext)

        if not os.path.exists(img_dl_path):
            urllib.request.urlretrieve(url, img_dl_path)

        if not ext:
            # The extension is detected from the file contents and exposed
            # through a symlink. This is done for cached downloads as well,
            # so repeated runs return the same path.
            # NOTE(review): imghdr was removed from the standard library in
            # Python 3.13 — confirm the interpreter version used in production.
            img_type = imghdr.what(img_dl_path)
            if img_type:
                img_dl_path_ext = f"{img_dl_path}.{img_type}"
                if not os.path.lexists(img_dl_path_ext):
                    os.symlink(img_dl_path, img_dl_path_ext)
                img_dl_path = img_dl_path_ext

        img_dl_path = img_dl_path.replace("\\", "/")  # Ensure UNIX form for pandoc
        if img_dl_path.endswith(".svg"):
            img_dl_path = convert_svg_to_pdf(img_dl_path)
        return img_dl_path
    except Exception:
        # Deliberately best-effort: on any download or conversion failure the
        # caller shows the image as a link instead. Unlike a bare ``except``
        # this lets KeyboardInterrupt and SystemExit propagate.
        return None


def handle_images(key, value, fmt, meta):
    """Pandoc filter action that rewrites image sources for LaTeX output.

    For ``Image`` elements in ``latex`` output:

    - A URL without a host, or with the host of ``CURRENT_HOST_MACHINE``,
      whose path starts with a prefix listed in ``urlmaps`` is replaced by
      the mapped local file when that file exists.
    - Otherwise, a URL matching a pattern in ``ALLOWED_EXTERNAL_HOSTS`` is
      downloaded and the local copy is used; when that fails the image is
      replaced by a link wrapped in ``\\externalimagelink{}``.
    - Any other image is returned with the path computed so far, which is
      empty for foreign hosts and unmapped paths.
      NOTE(review): the module docstring says such images should become
      links — confirm which behavior is intended.

    SVG files are converted to PDF before being referenced.

    :param key: Type name of the pandoc element.
    :param value: Contents of the element; for images ``(attrs, alt, target)``.
    :param fmt: Output format pandoc is producing.
    :param meta: Pandoc document metadata.
    :return: Replacement element(s), or None to leave the element unchanged.
    """
    if key != "Image" or fmt != "latex":
        return None

    # Imported here so the function also works when this module is imported
    # rather than run as a script.
    from urllib.parse import urlparse

    (attrs, alt_text_inlines, target) = value
    (url, title) = target

    parsed_url = urlparse(url)
    # TIM_HOST may be unset; then no host counts as "ours" and every URL
    # carrying a host is treated as external.
    curhost = urlparse((CURRENT_HOST_MACHINE or "") + "/kukku").hostname or ""
    host = parsed_url.hostname or ""
    path = parsed_url.path or ""

    image_path = _map_url_path_to_file(path)
    if host != "" and host != curhost:
        image_path = ""

    if image_path != "" and os.path.exists(image_path):
        image_path = image_path.replace("\\", "/")
        if image_path.endswith(".svg"):
            image_path = convert_svg_to_pdf(image_path)
        return Image(attrs, alt_text_inlines, [image_path, title])

    # Download images from allowed external urls to be attached to the document.
    # Empty patterns are skipped because they would match every URL.
    if any(h and re.match(h, url) for h in ALLOWED_EXTERNAL_HOSTS):
        img_dl_path = _download_external_image(url, path, meta)
        if img_dl_path is not None:
            return Image(attrs, alt_text_inlines, [img_dl_path, title])

        # Could not download the image, so transform the element to appear
        # as a link to the image resource in the LaTeX-output.
        return [
            RawInline("latex", r"\externalimagelink{"),
            Link(attrs, [Str(url)], [url, title]),
            RawInline("latex", "}"),
        ]

    # Makes sure the paths are in the UNIX form, as that is what LaTeX uses for paths even on Windows
    image_path = image_path.replace("\\", "/")

    return Image(attrs, alt_text_inlines, [image_path, title])


# Script entry point: pandoc runs this file as a JSON filter.
if __name__ == "__main__":

    # Needs to import different package based on python version, as the urlparse method
    # was moved from urlparse module to urllib.parse between python2.7 -> python3.
    # The import makes urlparse a module-level name, which handle_images relies on.
    try:
        from urllib.parse import urlparse
    except ImportError:
        from urlparse import urlparse

    # Load the trusted host patterns before any image is processed.
    ALLOWED_EXTERNAL_HOSTS = init_whitelist()

    # Apply handle_images to the document through pandocfilters.
    toJSONFilter(handle_images)
TIM documentation

Source code for timApp.printing.pandoc_imagefilepathsfilter