# -*- coding: utf-8 -*-
"""Akvo RSR is covered by the GNU Affero General Public License.
See more details in the license.txt file located at the root folder of the
Akvo RSR module. For additional details on the GNU license please
see < http://www.gnu.org/licenses/agpl.html >.
"""
import base64
import binascii
import io
import re
import requests
from docx.document import Document
from docx.enum.section import WD_SECTION, WD_ORIENT
from docx.enum.text import WD_BREAK
from docx.image.exceptions import (
UnrecognizedImageError, UnexpectedEndOfFileError, InvalidImageStreamError)
from docx.image.image import Image
from docx.table import _Cell
from docx.text.paragraph import Paragraph
from docx.opc.constants import RELATIONSHIP_TYPE
from docx.oxml.shared import OxmlElement, qn
from markdown import markdown
from lxml.html import fromstring
def load_image(src):
    """Load an image from a base64 data URI or from a URL.

    :param src: either a ``data:`` URI or the address of an external image
    :return: whatever ``make_image`` builds from the loaded bytes
    """
    if src.startswith("data:"):
        raw_bytes = load_inline_image(src)
    else:
        raw_bytes = load_external_image(src)
    return make_image(raw_bytes)
def load_external_image(src, timeout=10):
    """Download the raw bytes found at an image URL.

    This is best effort: any request or IO failure is swallowed and reported
    as ``None`` instead of being raised. The HTTP status is not inspected; the
    response body is returned as is.

    :param src: URL of the image
    :param timeout: seconds to wait for the server before giving up
    :return: the response body as bytes, or None when the request failed
    """
    try:
        # Without a timeout requests waits indefinitely on an unresponsive
        # host. The body is needed in full, so there is no point in streaming.
        response = requests.get(src, timeout=timeout)
    except (requests.RequestException, IOError):
        return None
    return response.content
def load_inline_image(src):
    """Decode the base64 payload of a data URI.

    :param src: string expected to contain a ``;base64,`` marker followed by
        the encoded payload
    :return: the decoded bytes, or None when the marker is missing or the
        payload is not valid base64
    """
    _, marker, payload = src.partition(";base64,")
    if not marker:
        # No ";base64," separator at all, nothing to decode
        return None
    try:
        return base64.b64decode(payload, validate=True)
    except (binascii.Error, ValueError):
        return None
def make_image(data):
    """Wrap image bytes in a BytesIO buffer, falling back to IMG_BLANK.

    :param data: raw image bytes, may be None or empty
    :return: a BytesIO holding ``data`` when ``Image.from_blob`` accepts it,
        otherwise a BytesIO built from the decoded ``IMG_BLANK`` data URI
    """
    if data:
        candidate = io.BytesIO(data)
        try:
            # Only used as a validity check, the parsed image is discarded
            Image.from_blob(candidate.getbuffer())
        except (UnrecognizedImageError, UnexpectedEndOfFileError, InvalidImageStreamError):
            pass
        else:
            return candidate
    return io.BytesIO(load_inline_image(IMG_BLANK))
def add_hyperlink(paragraph, url, text, color='0000FF', underline=True):
    """Append an external hyperlink to the end of a paragraph.

    :param paragraph: paragraph object that receives the link
    :param url: target of the link, registered as an external relationship
    :param text: text shown for the link
    :param color: value written to ``w:color``; pass None to write no colour
    :param underline: when False an explicit ``w:u`` with value ``none`` is
        written; when True no underline property is written
    :return: the created ``w:hyperlink`` element
    """
    relationship_id = paragraph.part.relate_to(
        url, RELATIONSHIP_TYPE.HYPERLINK, is_external=True)

    link = OxmlElement('w:hyperlink')
    link.set(qn('r:id'), relationship_id)

    run_properties = OxmlElement('w:rPr')
    if color is not None:
        color_element = OxmlElement('w:color')
        color_element.set(qn('w:val'), color)
        run_properties.append(color_element)
    if not underline:
        underline_element = OxmlElement('w:u')
        underline_element.set(qn('w:val'), 'none')
        run_properties.append(underline_element)

    # Join all the xml elements together and add the required text to the w:r element
    run = OxmlElement('w:r')
    run.append(run_properties)
    run.text = text
    link.append(run)

    paragraph._p.append(link)
    return link
def change_orientation(document):
    """Start a new page section with the opposite orientation.

    The new section gets the previous section's page width and height
    swapped, and LANDSCAPE if the previous one was PORTRAIT, else PORTRAIT.

    :param document: document to add the section to
    :return: the newly added section
    """
    previous = document.sections[-1]
    # Read the dimensions before adding the section, then swap them
    swapped_width = previous.page_height
    swapped_height = previous.page_width

    section = document.add_section(WD_SECTION.NEW_PAGE)
    if previous.orientation == WD_ORIENT.PORTRAIT:
        section.orientation = WD_ORIENT.LANDSCAPE
    else:
        section.orientation = WD_ORIENT.PORTRAIT
    section.page_width = swapped_width
    section.page_height = swapped_height
    return section
def markdown_to_docx(container, text):
    """Render markdown text into a docx container.

    The text is converted to HTML and fed to an ``HTMLDocxBuilder`` bound to
    ``container``. Nothing happens when ``text`` is empty or None.
    """
    if text:
        HTMLDocxBuilder(container).feed(markdown(text))
class HTMLDocxBuilder(object):
    """Walks an HTML tree and lets the tag handlers write it into a docx container.
    """

    def __init__(self, container):
        # Container that receives the content of the top-level element
        self.root_container = container

    def feed(self, html):
        """ parse the html string and write its content to the root container
        """
        root = fromstring(html)
        self.traverse(self.root_container, root)

    def traverse(self, container, element):
        """ depth-first walk: element text, then children, then the element's tail

        The text of ``element`` is handled by the handler of its own tag, the
        tail by the handler of its parent's tag, because the tail belongs to
        the parent's content.
        """
        handler = get_tag_handler(element.tag)
        new_container = container
        if handler:
            new_container = handler.handle_text(container, element)
        for child in element:
            self.traverse(new_container, child)
        if not element.tail:
            return
        parent = element.getparent()
        if parent is None:
            # A root element without a parent has no handler for its tail;
            # looking up parent.tag here would raise AttributeError.
            return
        parent_handler = get_tag_handler(parent.tag)
        if parent_handler:
            parent_handler.handle_tail(container, element)
class ParagraphTagHandler(object):
    """ <p> creates a paragraph element inside a docx container element.
    """

    def handle_text(self, container, element):
        """ write the leading text of the <p> into a new paragraph
        """
        target = get_new_paragraph(container)
        return self._append_paragraph(element.text, element, target)

    def handle_tail(self, container, element):
        """ write the tail text of a child element into the current paragraph
        """
        target = get_current_paragraph(container)
        return self._append_paragraph(element.tail, element, target)

    def _append_paragraph(self, text, element, container):
        cleaned = trim_whitespaces(text)
        if cleaned:
            # NOTE(review): when called from handle_tail, `element` is the
            # child whose tail is written, so this looks at that child's
            # parent rather than at the parent of the <p> - confirm whether
            # tail text inside a blockquote should get the quote style too.
            in_quote = element.getparent().tag == 'blockquote'
            container.add_run(text=cleaned, style='IntenseQuote' if in_quote else None)
        return container
class HeadingTagHandler(object):
    """
    <h1>...<h6> creates heading element inside a docx container element.
    """

    def __init__(self, level):
        # Number used in the 'Heading N' style name applied to the paragraph
        self.level = level

    def handle_text(self, container, element):
        """ write the leading text of the heading into a new paragraph
        """
        paragraph = get_new_paragraph(container)
        return self._append_heading(element.text, element, paragraph)

    def handle_tail(self, container, element):
        """ write the tail text of a child element into the current paragraph
        """
        paragraph = get_current_paragraph(container)
        # The tail is the text following the child; passing element.text here
        # would repeat the child's own text and drop what follows it.
        return self._append_heading(element.tail, element, paragraph)

    def _append_heading(self, text, element, container):
        container.style = 'Heading {}'.format(self.level)
        text = trim_whitespaces(text)
        if not text:
            return container
        container.add_run(text=text)
        return container
class StrongTagHandler(object):
    """
    <strong> Creates a bold text run inside the paragraph container.
    Appends remainder of text as an additional run
    """

    def handle_text(self, container, element):
        """ add the leading text of the <strong> as a bold run
        """
        return self._append_strong(element.text, element, container)

    def handle_tail(self, container, element):
        """ add the tail text of a child of the <strong> as a bold run
        """
        # `element` is a child of the <strong>; the nesting check has to be
        # made on the <strong> itself, which is that child's parent.
        return self._append_strong(element.tail, element.getparent(), container)

    def _append_strong(self, text, element, container):
        """ `element` is the <strong> element the text belongs to
        """
        text = trim_whitespaces(text)
        run = container.add_run(text=text)
        run.bold = True
        wrapper = element.getparent() if element is not None else None
        if wrapper is not None and wrapper.tag == 'em':
            # <em><strong>..</strong></em> is both bold and italic
            run.italic = True
        return container
class EmphasisTagHandler(object):
    """
    <em> Creates an italic text run inside the paragraph container.
    Appends remainder of text as an additional run
    """

    def handle_text(self, container, element):
        """ add the leading text of the <em> as an italic run
        """
        return self._append_emphasis(element.text, element, container)

    def handle_tail(self, container, element):
        """ add the tail text of a child of the <em> as an italic run
        """
        # `element` is a child of the <em>; the nesting check has to be made
        # on the <em> itself, which is that child's parent.
        return self._append_emphasis(element.tail, element.getparent(), container)

    def _append_emphasis(self, text, element, container):
        """ `element` is the <em> element the text belongs to
        """
        text = trim_whitespaces(text)
        run = container.add_run(text=text)
        run.italic = True
        wrapper = element.getparent() if element is not None else None
        if wrapper is not None and wrapper.tag == 'strong':
            # <strong><em>..</em></strong> is both italic and bold
            run.bold = True
        return container
class LineBreakTagHandler(object):
    """ <br> Creates a break item inside the given container.
    """

    def handle_text(self, container, element):
        """ add a line break run and normalise the text following the <br>
        """
        # The tail is rewritten in place: whitespace collapsed and leading
        # spaces removed, so the text after the break does not start with a gap
        element.tail = trim_whitespaces(element.tail).lstrip()
        container.add_run().add_break(break_type=WD_BREAK.LINE_CLEAR_RIGHT)
        return container

    def handle_tail(self, container, element):
        """ a <br> has no children, so there is no child tail to handle
        """
        pass
class ListItemTagHandler(object):
    """
    <li> Create a list item element inside a docx container.
    Style it according to its parents list type.
    """

    def __init__(self):
        # Tag of the enclosing list -> paragraph style applied to the item
        self.list_style = dict(
            ol='ListNumber',
            ul='ListBullet',
        )

    def handle_text(self, container, element):
        """ write the leading text of the <li> into a new paragraph
        """
        paragraph = get_new_paragraph(container)
        return self._append_list_item(element, element.text, paragraph)

    def handle_tail(self, container, element):
        """ write the tail text of a child of the <li> into the current paragraph
        """
        paragraph = get_current_paragraph(container)
        # `element` is a child of the <li>. Resolving the list type from the
        # child's parent would look at the <li> itself and always fall back to
        # the bullet style, so the <li> is handed over instead.
        return self._append_list_item(element.getparent(), element.tail, paragraph)

    def _append_list_item(self, element, text, container):
        """ `element` is the <li> element the text belongs to
        """
        text = trim_whitespaces(text)
        text = '' if text == ' ' else text
        list_element = element.getparent() if element is not None else None
        list_tag = list_element.tag if list_element is not None else None
        container.style = self.list_style.get(list_tag, 'ListBullet')
        container.add_run(text)
        return container
class DivTagHandler(object):
    """ Plain handler: adds the text as an unstyled run to the given container.
    """

    def handle_text(self, container, element):
        """ add the leading text of the element as a run
        """
        return self._append_run(element.text, container)

    def handle_tail(self, container, element):
        """ add the tail text of a child element as a run
        """
        return self._append_run(element.tail, container)

    def _append_run(self, text, container):
        cleaned = trim_whitespaces(text)
        # Empty text and a lone space produce no run
        if cleaned and cleaned != ' ':
            container.add_run(text=cleaned)
        return container
# Handler used for every tag that has no entry in the map below
default_handler = DivTagHandler()

# HTML tag name -> handler instance. Each <hN> maps to heading level N + 1.
_tag_handler_map = {
    'p': ParagraphTagHandler(),
    'em': EmphasisTagHandler(),
    'br': LineBreakTagHandler(),
    'li': ListItemTagHandler(),
    'strong': StrongTagHandler(),
    'h1': HeadingTagHandler(2),
    'h2': HeadingTagHandler(3),
    'h3': HeadingTagHandler(4),
    'h4': HeadingTagHandler(5),
    'h5': HeadingTagHandler(6),
    'h6': HeadingTagHandler(7),
}
def get_tag_handler(tag):
    """ return the handler registered for the tag, or the default handler
    """
    try:
        return _tag_handler_map[tag]
    except KeyError:
        return default_handler
def get_current_paragraph(container):
    """ return the paragraph that further text should be appended to

    A Document gets a freshly added paragraph, a paragraph living in a table
    cell resolves to the last paragraph of that cell, anything else is
    returned unchanged.
    """
    if isinstance(container, Document):
        return container.add_paragraph()
    if isinstance(container, Paragraph) and isinstance(container._parent, _Cell):
        return container._parent.paragraphs[-1]
    return container
def get_new_paragraph(container):
    """ return a paragraph to start a new block of text in

    A Document always gets a freshly added paragraph. For a paragraph, an
    empty one is reused and a new one is added to its parent otherwise; inside
    a table cell the decision is based on the cell's paragraphs. Any other
    container is returned unchanged.
    """
    if isinstance(container, Document):
        return container.add_paragraph()
    if not isinstance(container, Paragraph):
        return container

    parent = container._parent
    if isinstance(parent, _Cell):
        cell_paragraphs = parent.paragraphs
        # Reuse the first paragraph only while it is the sole, still empty one
        if len(cell_paragraphs) > 1 or cell_paragraphs[0].text:
            return parent.add_paragraph()
        return cell_paragraphs[0]

    if container.text:
        return parent.add_paragraph()
    return container
def trim_whitespaces(text):
    """ turns line breaks into spaces and collapses runs of spaces into one

    Returns an empty string for None or empty input.
    """
    if not text:
        return ''
    return re.sub(' +', ' ', text.replace('\n', ' '))
# Data URI that make_image() decodes with load_inline_image() as the fallback
# picture when the requested image is missing or not recognised.
# NOTE(review): the literal is empty here, so load_inline_image() finds no
# ";base64," marker and returns None for it - confirm whether a base64 payload
# was lost and needs to be restored.
IMG_BLANK = ""