Source code for timApp.printing.pandoc_imagefilepathsfilter

#!/usr/bin/env python3

"""
Pandoc filter that converts image sources to LaTeX graphics source paths, taking
each image's location into account according to the following rules:

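- If an image has an absolute path that points to the TIM machine, e.g. "http://<TIM-domain>/imagepath"
  or "<tim-domain>/imagepath", then its path is mapped to a local file through the URL prefix table (urlmaps).
- If an image has a relative path, e.g. "/images/1239854102", then it is mapped to a local file
  through the same URL prefix table.
- If an image points to a resource that resides at another host, the image is downloaded only when its
  URL matches an entry in the printing whitelist; otherwise the image is simply converted to a plain link
  in the output. This is due to possible copyright infringements, as the images would otherwise be
  wrongfully copied to the output document.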

TODO: BETTER DOCUMENTATION

"""
import imghdr
import os
import re
import tempfile
import urllib.request
from subprocess import check_output, STDOUT

from pandocfilters import toJSONFilter, RawInline, Image, Link, Str

from timApp.defaultconfig import FILES_PATH
from timApp.document.randutils import hashfunc

# Root directory of the TIM application; used below to build other paths.
APP_ROOT = "/service/timApp"

# Block storage directory below APP_ROOT. NOTE(review): the only reference
# visible in this file is inside the disabled code kept in handle_images.
IMAGE_ROOT = os.path.join(APP_ROOT, FILES_PATH, "blocks")

# Protocol + hostname of this TIM instance, read from the TIM_HOST environment
# variable. None when the variable is not set.
CURRENT_HOST_MACHINE = os.environ.get("TIM_HOST", None)

# Patterns for external image URLs that may be downloaded. Replaced with the
# result of init_whitelist() when the module is run as a script; handle_images
# passes each entry to re.match() together with the image URL.
ALLOWED_EXTERNAL_HOSTS = []

# File that init_whitelist() reads the patterns from (one entry per line).
PRINTING_WHITELIST_FILE = os.path.join(APP_ROOT, ".printing_whitelist.config")

# Maps a URL path prefix ("url") to the local directory ("dir") that holds the
# corresponding files. handle_images uses the first entry whose prefix matches.
urlmaps = [
    {"url": "/csstatic/", "dir": "/service/timApp/modules/cs/static/"},
    {"url": "/csgenerated/", "dir": "/service/timApp/modules/cs/generated/"},
    {"url": "/static/", "dir": "/service/timApp/static/"},
    {"url": "/images/", "dir": "/tim_files/blocks/images/"},
    {"url": "/files/", "dir": "/tim_files/blocks/files/"},
]


def init_whitelist():
    """Load the whitelist of trusted image source patterns.

    The whitelist is read from ``PRINTING_WHITELIST_FILE``, one entry per
    line. If the file does not exist, an empty one is created on a
    best-effort basis so that it can be filled in later.

    Blank lines are skipped: every entry is later handed to ``re.match`` as a
    pattern, and an empty pattern matches *every* URL, which would silently
    whitelist all external hosts.

    :return: List of non-empty entries with surrounding whitespace removed.
             Empty if the file is missing or unreadable.
    """
    if not os.path.exists(PRINTING_WHITELIST_FILE):
        # Both steps are best-effort; a read-only installation must not
        # prevent printing, it merely ends up with an empty whitelist.
        try:
            os.makedirs(os.path.dirname(PRINTING_WHITELIST_FILE), exist_ok=True)
        except OSError:
            pass
        try:
            with open(PRINTING_WHITELIST_FILE, "a"):
                pass
        except OSError:
            pass

    try:
        with open(PRINTING_WHITELIST_FILE) as f:
            entries = [line.strip() for line in f]
    except OSError:
        return []

    return [entry for entry in entries if entry]
# Downloaded external images are cached in a folder below the operating
# system's temporary directory.
TEMP_DIR_PATH = tempfile.gettempdir()
DOWNLOADED_IMAGES_ROOT = os.path.join(TEMP_DIR_PATH, "tim-img-dls")

# Id of the document being printed. Starts out unset; handle_images fills it
# in from the pandoc metadata key "texdocid" and uses it as the name of the
# per-document download folder.
texdocid = None
# Wrapper page used to print an SVG with headless Chrome. The script sizes the
# PDF page to the rendered image so the output has no margins, headers or
# footers. "SVGIMAGE" is substituted with plain str.replace because
# str.format would trip over the braces needed by the CSS and JavaScript.
_SVG_WRAPPER_HTML = """<html>
<head>
<style>
body {
    margin: 0;
}
</style>
<script>
    function init() {
        const element = document.getElementById('targetsvg');
        const positionInfo = element.getBoundingClientRect();
        const height = positionInfo.height;
        const width = positionInfo.width;
        const style = document.createElement('style');
        style.innerHTML = `@page {margin: 0; size: ${width}px ${height+1}px}`;
        document.head.appendChild(style);
    }
    window.onload = init;
</script>
</head>
<body>
<img id="targetsvg" src="SVGIMAGE">
</body>
</html>
"""


def convert_svg_to_pdf(image_path):
    """Convert an SVG image to a PDF file stored next to it.

    The PDF gets the same name as the SVG with the extension replaced by
    ``.pdf``. An existing PDF that is newer than the SVG is reused without
    running the conversion again. Otherwise a temporary HTML page embedding
    the SVG is written next to the image and printed to PDF with headless
    Chrome.

    :param image_path: Path of the SVG file.
    :return: Path of the PDF file.
    :raises subprocess.CalledProcessError: If Chrome exits with a non-zero status.
    :raises OSError: If the SVG or the Chrome executable is missing, or the
                     temporary page cannot be written.
    """
    # Only the extension is swapped; a ".svg" occurring elsewhere in the path
    # (e.g. in a directory name) must stay untouched.
    base, _ = os.path.splitext(image_path)
    temp = base + ".tmp.html"
    pdf = base + ".pdf"

    if os.path.isfile(pdf) and os.path.getmtime(pdf) > os.path.getmtime(image_path):
        return pdf

    html = _SVG_WRAPPER_HTML.replace("SVGIMAGE", image_path)
    # Close the file before Chrome is started so the page is fully on disk.
    with open(temp, "w") as f:
        f.write(html)

    cmd = [
        "/opt/google/chrome/chrome",
        "--no-sandbox",
        "--headless",
        "--disable-gpu",
        f"--print-to-pdf={pdf}",
        temp,
    ]
    try:
        # A bare file name has an empty dirname; None keeps the current
        # working directory instead of failing on cwd="".
        check_output(cmd, stderr=STDOUT, cwd=os.path.dirname(image_path) or None)
    finally:
        # Remove the temporary page even when the conversion fails.
        os.remove(temp)
    return pdf
def _local_file_for_url_path(url_path):
    """Map a URL path to a local file using the first matching ``urlmaps`` entry.

    Only the leading prefix is replaced. The result is normalized and must
    still lie below the mapped directory, so ``..`` segments in a URL cannot
    reach files outside of it.

    :param url_path: Path component of the image URL.
    :return: Local file path, or "" when no entry matches or the path would
             escape the mapped directory.
    """
    for urlmap in urlmaps:
        prefix = urlmap.get("url")
        if not url_path.startswith(prefix):
            continue
        root = os.path.normpath(urlmap.get("dir"))
        # Plain concatenation on purpose: os.path.join would discard the
        # directory if the remainder started with a slash.
        candidate = os.path.normpath(urlmap.get("dir") + url_path[len(prefix):])
        if candidate.startswith(root + os.sep):
            return candidate
        return ""
    return ""


def _download_external_image(url, meta):
    """Download an external image into the per-document cache folder.

    The file name is the hash of the URL plus the extension found in the URL,
    so an image that has already been downloaded is reused. If the URL has no
    extension, the image type is detected from the file content and a symlink
    carrying that extension is created (or reused).

    :param url: URL of the image.
    :param meta: Pandoc document metadata; "texdocid" names the cache folder.
    :return: Local path of the image in UNIX form (SVG is converted to PDF).
    """
    global texdocid
    if not texdocid:
        doc_meta = meta.get("texdocid", None)
        if doc_meta:
            texdocid = str(doc_meta.get("c", "xx"))

    # Without a document id fall back to the same default folder name that is
    # used when the metadata entry has no content, instead of failing on None.
    images_root = os.path.join(DOWNLOADED_IMAGES_ROOT, texdocid or "xx")
    os.makedirs(images_root, exist_ok=True)

    _, ext = os.path.splitext(url)
    img_dl_path = os.path.join(images_root, str(hashfunc(url)) + ext)
    if not os.path.exists(img_dl_path):
        urllib.request.urlretrieve(url, img_dl_path)

    if not ext:
        img_type = imghdr.what(img_dl_path)
        if img_type:
            img_dl_path_ext = f"{img_dl_path}.{img_type}"
            # The link may be left over from an earlier run.
            if not os.path.lexists(img_dl_path_ext):
                os.symlink(img_dl_path, img_dl_path_ext)
            img_dl_path = img_dl_path_ext

    # Ensure UNIX form for pandoc
    img_dl_path = img_dl_path.replace("\\", "/")
    if img_dl_path.endswith(".svg"):
        img_dl_path = convert_svg_to_pdf(img_dl_path)
    return img_dl_path


def handle_images(key, value, fmt, meta):
    """Pandoc filter action that rewrites image sources for LaTeX output.

    Only ``Image`` elements are touched, and only when the output format is
    ``latex``:

    - An image whose URL has no host, or the host of this TIM instance
      (``CURRENT_HOST_MACHINE``), is mapped to a local file through
      ``urlmaps``. If that file exists, the image points to it.
    - Otherwise, if the URL matches a pattern in ``ALLOWED_EXTERNAL_HOSTS``,
      the image is downloaded and the image points to the downloaded copy.
    - In every other case, including a failed download, the image is replaced
      by a link to the original URL wrapped in ``\\externalimagelink{...}``.

    SVG files are converted to PDF before they are referenced.

    :param key: Type name of the pandoc element.
    :param value: Content of the element; for an image
                  ``(attrs, alt_text_inlines, (url, title))``.
    :param fmt: Output format pandoc is producing.
    :param meta: Document metadata.
    :return: Replacement element, a list of replacement inlines, or None to
             leave the element unchanged.
    """
    if key != "Image" or fmt != "latex":
        return None

    # Imported here so the function also works when this module is imported
    # rather than executed as a script.
    from urllib.parse import urlparse

    attrs, alt_text_inlines, target = value
    url, title = target

    parsed_url = urlparse(url)
    host = parsed_url.hostname or ""
    # TIM_HOST may be unset; then no host counts as the current machine.
    curhost = urlparse(CURRENT_HOST_MACHINE or "").hostname or ""

    image_path = ""
    if host == "" or host == curhost:
        image_path = _local_file_for_url_path(parsed_url.path or "")

    if image_path != "" and os.path.exists(image_path):
        # LaTeX uses UNIX style paths even on Windows.
        image_path = image_path.replace("\\", "/")
        if image_path.endswith(".svg"):
            image_path = convert_svg_to_pdf(image_path)
        return Image(attrs, alt_text_inlines, [image_path, title])

    # Download images from allowed external urls to be attached to the
    # document. An empty pattern would match every URL, so it is skipped.
    allow = any(
        pattern and re.match(pattern, url) for pattern in ALLOWED_EXTERNAL_HOSTS
    )
    if allow:
        try:
            img_dl_path = _download_external_image(url, meta)
        except Exception:
            # Deliberately best-effort: if the image cannot be fetched or
            # converted, it is shown as a link to the image URL instead.
            pass
        else:
            return Image(attrs, alt_text_inlines, [img_dl_path, title])

    # For other external images, transform the element to appear as a link
    # to the image resource in the LaTeX-output.
    return [
        RawInline("latex", r"\externalimagelink{"),
        Link(attrs, [Str(url)], [url, title]),
        RawInline("latex", "}"),
    ]
if __name__ == "__main__":
    # Load the trusted image sources before any element is filtered.
    ALLOWED_EXTERNAL_HOSTS = init_whitelist()

    # urlparse lives in the urlparse module on Python 2.7 and in urllib.parse
    # on Python 3; import whichever exists so that it is available as a
    # module-level name.
    try:
        from urllib.parse import urlparse
    except ImportError:
        from urlparse import urlparse

    toJSONFilter(handle_images)