# Source code for akvo.rsr.views.py_reports.docx_utils

# -*- coding: utf-8 -*-

"""Akvo RSR is covered by the GNU Affero General Public License.

See more details in the license.txt file located at the root folder of the
Akvo RSR module. For additional details on the GNU license please
see < http://www.gnu.org/licenses/agpl.html >.
"""

import base64
import binascii
import io
import re
import requests

from docx.document import Document
from docx.enum.section import WD_SECTION, WD_ORIENT
from docx.enum.text import WD_BREAK
from docx.image.exceptions import (
    UnrecognizedImageError, UnexpectedEndOfFileError, InvalidImageStreamError)
from docx.image.image import Image
from docx.table import _Cell
from docx.text.paragraph import Paragraph
from docx.opc.constants import RELATIONSHIP_TYPE
from docx.oxml.shared import OxmlElement, qn
from markdown import markdown
from lxml.html import fromstring


def load_image(src):
    """
    Load an image from a base64 data URI or from a URL.

    Sources starting with ``data:`` are decoded inline, anything else is
    fetched as an external resource. The raw bytes (possibly ``None``) are
    handed to ``make_image`` and its result is returned.
    """
    if src.startswith("data:"):
        raw_bytes = load_inline_image(src)
    else:
        raw_bytes = load_external_image(src)
    return make_image(raw_bytes)
def load_external_image(src, timeout=30):
    """
    Download an image from a URL.

    :param src: URL of the image.
    :param timeout: seconds to wait for the server (connect / read) before
        giving up; forwarded to ``requests.get``. Without a timeout an
        unresponsive host would block the caller indefinitely.
    :return: the response body as bytes, or ``None`` when the request fails
        (including on timeout).
    """
    image_data = None
    try:
        response = requests.get(src, stream=True, timeout=timeout)
        # Reading ``content`` consumes the streamed body; failures while
        # reading are raised as RequestException subclasses as well.
        image_data = response.content
    except (requests.RequestException, IOError):
        # Best effort: callers treat ``None`` as "no image available".
        pass
    return image_data
def load_inline_image(src):
    """
    Decode the payload of a base64 data URI.

    Returns the decoded bytes, or ``None`` when ``src`` has no ``;base64,``
    marker or the payload is not valid base64.
    """
    _header, marker, payload = src.partition(";base64,")
    if not marker:
        return None
    try:
        return base64.b64decode(payload, validate=True)
    except (binascii.Error, ValueError):
        return None
def make_image(data):
    """
    Wrap image bytes in an in-memory stream, falling back to IMG_BLANK.

    The bytes are probed with ``Image.from_blob``; when ``data`` is empty or
    the probe raises one of the known image errors, the stream is built from
    the inline ``IMG_BLANK`` image instead.
    """
    stream = io.BytesIO(data) if data else None
    if stream is not None:
        try:
            # Only used as a validity check; the parsed image is discarded.
            Image.from_blob(stream.getbuffer())
        except (UnrecognizedImageError, UnexpectedEndOfFileError, InvalidImageStreamError):
            stream = None
    if stream is None:
        fallback_bytes = load_inline_image(IMG_BLANK)
        stream = io.BytesIO(fallback_bytes)
    return stream
def set_repeat_table_header(row):
    """
    Mark a table row as a header row that repeats on every page.

    Appends a ``w:tblHeader`` element with ``w:val="true"`` to the row's
    properties and returns the same row.
    """
    row_properties = row._tr.get_or_add_trPr()
    header_flag = OxmlElement('w:tblHeader')
    header_flag.set(qn('w:val'), "true")
    row_properties.append(header_flag)
    return row
def change_orientation(document):
    """
    Start a new page section with the opposite orientation of the last one.

    The new section swaps the page width and height of the previous section
    and flips portrait to landscape (or vice versa). Returns the new section.
    """
    previous = document.sections[-1]
    swapped_width = previous.page_height
    swapped_height = previous.page_width

    section = document.add_section(WD_SECTION.NEW_PAGE)
    if previous.orientation == WD_ORIENT.PORTRAIT:
        section.orientation = WD_ORIENT.LANDSCAPE
    else:
        section.orientation = WD_ORIENT.PORTRAIT
    section.page_width = swapped_width
    section.page_height = swapped_height
    return section
def markdown_to_docx(container, text):
    """
    Render markdown text into a docx container.

    Does nothing when ``text`` is empty; otherwise the markdown is converted
    to HTML and fed to an ``HTMLDocxBuilder`` bound to ``container``.
    """
    if text:
        HTMLDocxBuilder(container).feed(markdown(text))
class HTMLDocxBuilder(object):
    """
    Walks an HTML tree and writes its content into a docx container using
    the handler registered for each tag.
    """

    def __init__(self, container):
        self.root_container = container

    def feed(self, html):
        """ Parse the HTML string and write it into the root container. """
        root = fromstring(html)
        self.traverse(self.root_container, root)

    def traverse(self, container, element):
        """
        Depth-first walk: handle the element's text, recurse into its
        children, then let the parent's handler deal with the tail text.
        """
        new_container = container
        # Comments and processing instructions expose a non-string ``tag``;
        # their text is not document content, so only their tail is handled.
        if isinstance(element.tag, str):
            handler = get_tag_handler(element.tag)
            if handler:
                new_container = handler.handle_text(container, element)
            for child in element:
                self.traverse(new_container, child)

        parent = element.getparent()
        # A detached root has no parent, hence nothing that owns its tail.
        if element.tail and parent is not None:
            parent_handler = get_tag_handler(parent.tag)
            if parent_handler:
                parent_handler.handle_tail(container, element)
class ParagraphTagHandler(object):
    """
    <p> creates a paragraph element inside a docx container element.
    """

    def handle_text(self, container, element):
        """ Start a new paragraph and add the leading text of the <p>. """
        paragraph = get_new_paragraph(container)
        return self._append_paragraph(element.text, element, paragraph)

    def handle_tail(self, container, element):
        """
        Add the text that follows a child of the <p> to the current paragraph.

        ``element`` is the child whose tail is being written, so the <p>
        itself is ``element.getparent()``; that is the element whose parent
        decides the blockquote styling.
        """
        paragraph = get_current_paragraph(container)
        return self._append_paragraph(element.tail, element.getparent(), paragraph)

    def _append_paragraph(self, text, element, container):
        """
        Append ``text`` as a run; ``element`` is the <p> element, used to
        detect whether the paragraph sits directly inside a <blockquote>.
        """
        text = trim_whitespaces(text)
        if not text:
            return container
        style = None
        if element.getparent().tag == 'blockquote':
            style = 'IntenseQuote'
        container.add_run(text=text, style=style)
        return container
class HeadingTagHandler(object):
    """
    <h1>...<h6> creates heading element inside a docx container element.
    """

    def __init__(self, level):
        # Number used in the docx style name, i.e. 'Heading <level>'.
        self.level = level

    def handle_text(self, container, element):
        """ Start a new paragraph and add the leading text of the heading. """
        paragraph = get_new_paragraph(container)
        return self._append_heading(element.text, element, paragraph)

    def handle_tail(self, container, element):
        """
        Add the text that follows a child of the heading.

        The tail (not the text) of the child is appended; using the child's
        text here would repeat it and lose the trailing heading text.
        """
        paragraph = get_current_paragraph(container)
        return self._append_heading(element.tail, element, paragraph)

    def _append_heading(self, text, element, container):
        """ Apply the heading style and append ``text`` when not empty. """
        container.style = 'Heading {}'.format(self.level)
        text = trim_whitespaces(text)
        if not text:
            return container
        container.add_run(text=text)
        return container
class StrongTagHandler(object):
    """
    <strong> Creates a bold text run inside the paragraph container.
    Appends remainder of text as a additional run
    """

    def handle_text(self, container, element):
        """ Add the leading text of the <strong> as a bold run. """
        return self._append_strong(element.text, element, container)

    def handle_tail(self, container, element):
        """
        Add the text following a child of the <strong> as a bold run.

        ``element`` is the child, so the <strong> itself is
        ``element.getparent()``; it is passed on so that the check for an
        enclosing <em> looks at the right ancestor.
        """
        return self._append_strong(element.tail, element.getparent(), container)

    def _append_strong(self, text, element, container):
        """
        Append ``text`` as a bold run; ``element`` is the <strong> element,
        and the run is also italic when it sits directly inside an <em>.
        """
        text = trim_whitespaces(text)
        run = container.add_run(text=text)
        run.bold = True
        if element.getparent().tag == 'em':
            run.italic = True
        return container
class EmphasisTagHandler(object):
    """
    <em> Creates an italic text run inside the paragraph container.
    Appends remainder of text as a additional run
    """

    def handle_text(self, container, element):
        """ Add the leading text of the <em> as an italic run. """
        return self._append_emphasis(element.text, element, container)

    def handle_tail(self, container, element):
        """
        Add the text following a child of the <em> as an italic run.

        ``element`` is the child, so the <em> itself is
        ``element.getparent()``; it is passed on so that the check for an
        enclosing <strong> looks at the right ancestor.
        """
        return self._append_emphasis(element.tail, element.getparent(), container)

    def _append_emphasis(self, text, element, container):
        """
        Append ``text`` as an italic run; ``element`` is the <em> element,
        and the run is also bold when it sits directly inside a <strong>.
        """
        text = trim_whitespaces(text)
        run = container.add_run(text=text)
        run.italic = True
        if element.getparent().tag == 'strong':
            run.bold = True
        return container
class LineBreakTagHandler(object):
    """
    <br> Creates a break item inside the given container.
    """

    def handle_text(self, container, element):
        """
        Normalise the text following the <br> (collapsed whitespace, no
        leading blanks) and add a line break run to the container.
        """
        element.tail = trim_whitespaces(element.tail).lstrip()
        break_run = container.add_run()
        break_run.add_break(break_type=WD_BREAK.LINE_CLEAR_RIGHT)
        return container

    def handle_tail(self, container, element):
        """ Nothing to do for tails; returns None. """
        return None
class ListItemTagHandler(object):
    """
    <li> Create a list item element inside a docx container.
    Style it according to its parents list type.
    """

    def __init__(self):
        # docx paragraph style for each kind of list element.
        self.list_style = dict(
            ol='ListNumber',
            ul='ListBullet',
        )

    def handle_text(self, container, element):
        """ Start a new paragraph and add the leading text of the <li>. """
        paragraph = get_new_paragraph(container)
        return self._append_list_item(element, element.text, paragraph)

    def handle_tail(self, container, element):
        """
        Add the text following a child of the <li> to the current paragraph.

        ``element`` is the child, so the <li> itself is
        ``element.getparent()``. Passing the <li> keeps the style lookup on
        the enclosing <ol>/<ul>; looking at the child's parent instead would
        always fall back to the bullet style and turn numbered items into
        bullets.
        """
        paragraph = get_current_paragraph(container)
        return self._append_list_item(element.getparent(), element.tail, paragraph)

    def _append_list_item(self, element, text, container):
        """
        Style the paragraph after the list type and append ``text``.

        ``element`` is the <li> element; its parent tag selects the style,
        defaulting to 'ListBullet' for anything that is not <ol>/<ul>.
        """
        text = trim_whitespaces(text)
        # A lone blank is only markup whitespace, not content.
        text = '' if text == ' ' else text
        style = self.list_style.get(element.getparent().tag, 'ListBullet')
        container.style = style
        container.add_run(text)
        return container
class DivTagHandler(object):
    """
    Fallback handler: writes text and tail as plain runs into the container.
    """

    def handle_text(self, container, element):
        """ Add the element's leading text as a plain run. """
        return self._append_run(element.text, container)

    def handle_tail(self, container, element):
        """ Add the element's tail text as a plain run. """
        return self._append_run(element.tail, container)

    def _append_run(self, text, container):
        """ Append ``text`` unless it is empty or a single blank. """
        cleaned = trim_whitespaces(text)
        if cleaned and cleaned != ' ':
            container.add_run(text=cleaned)
        return container
# Handler used for every tag that has no dedicated entry below.
default_handler = DivTagHandler()

# Tag name -> handler instance. HTML heading <hN> maps to docx heading N + 1.
_tag_handler_map = {
    'p': ParagraphTagHandler(),
    'em': EmphasisTagHandler(),
    'br': LineBreakTagHandler(),
    'li': ListItemTagHandler(),
    'strong': StrongTagHandler(),
}
_tag_handler_map.update({
    'h{}'.format(html_level): HeadingTagHandler(html_level + 1)
    for html_level in range(1, 7)
})
def get_tag_handler(tag):
    """ Return the handler registered for ``tag``, else the default handler. """
    registered = _tag_handler_map.get(tag)
    return default_handler if registered is None else registered
def get_current_paragraph(container):
    """
    Return the paragraph that further text should be appended to.

    A Document gets a freshly added paragraph, a paragraph living in a table
    cell resolves to the last paragraph of that cell, and anything else is
    returned unchanged.
    """
    if isinstance(container, Document):
        return container.add_paragraph()
    if isinstance(container, Paragraph) and isinstance(container._parent, _Cell):
        return container._parent.paragraphs[-1]
    return container
def get_new_paragraph(container):
    """
    Return a paragraph to start a new block of text in.

    A Document always gets a new paragraph. For a paragraph inside a table
    cell, the cell's first paragraph is reused while it is the only one and
    still empty; otherwise a paragraph is added to the cell. For any other
    paragraph, a sibling is added to its parent once it already holds text.
    Other containers are returned unchanged.
    """
    if isinstance(container, Document):
        return container.add_paragraph()
    if not isinstance(container, Paragraph):
        return container

    owner = container._parent
    if isinstance(owner, _Cell):
        cell_paragraphs = owner.paragraphs
        if len(cell_paragraphs) > 1 or cell_paragraphs[0].text:
            return owner.add_paragraph()
        return cell_paragraphs[0]

    if container.text:
        return owner.add_paragraph()
    return container
def trim_whitespaces(text):
    """
    Replace every run of whitespace and line breaks by a single space.

    Covers spaces, tabs, line feeds, carriage returns, form feeds and
    vertical tabs. Non-breaking spaces are deliberately left untouched.

    :param text: the text to normalise; may be ``None``.
    :return: the normalised text, or ``''`` when ``text`` is empty or None.
        Whitespace-only input collapses to a single ``' '``.
    """
    if not text:
        return ''
    return re.sub(r'[ \t\n\r\f\v]+', ' ', text)
# Inline (data URI) image that make_image falls back to when the supplied
# data is missing or is not a recognisable image.
# NOTE(review): the value is an empty string here, for which
# load_inline_image() returns None - presumably the base64 payload was lost
# somewhere; confirm against the upstream source.
IMG_BLANK = ""