From a9500adcee70d7dc3f4da3a4185a6e34f6ba0618 Mon Sep 17 00:00:00 2001 From: Derek Hsu Date: Sat, 28 Jul 2012 15:44:40 +0800 Subject: [PATCH 01/16] modified paragraph, search and replace, relationship... Add multi-line text support to paragraph. Different lines are split into different s. Modified paragraph to support more complicated style like color and font size. Improved advSearch and advReplace algorithms. Support extracting relation and contenttype from .docx file. Add findElementByText function --- docx.py | 1878 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 993 insertions(+), 885 deletions(-) diff --git a/docx.py b/docx.py index 0f41b09..d58b5f8 100755 --- a/docx.py +++ b/docx.py @@ -1,885 +1,993 @@ -#!/usr/bin/env python2.6 -# -*- coding: utf-8 -*- -''' -Open and modify Microsoft Word 2007 docx files (called 'OpenXML' and 'Office OpenXML' by Microsoft) - -Part of Python's docx module - http://github.com/mikemaccana/python-docx -See LICENSE for licensing information. -''' - -import logging -from lxml import etree -try: - from PIL import Image -except ImportError: - import Image -import zipfile -import shutil -import re -import time -import os -from os.path import join - -log = logging.getLogger(__name__) - -# Record template directory's location which is just 'template' for a docx -# developer or 'site-packages/docx-template' if you have installed docx -template_dir = join(os.path.dirname(__file__),'docx-template') # installed -if not os.path.isdir(template_dir): - template_dir = join(os.path.dirname(__file__),'template') # dev - -# All Word prefixes / namespace matches used in document.xml & core.xml. -# LXML doesn't actually use prefixes (just the real namespace) , but these -# make it easier to copy Word output more easily. 
-nsprefixes = { - # Text Content - 'mv':'urn:schemas-microsoft-com:mac:vml', - 'mo':'http://schemas.microsoft.com/office/mac/office/2008/main', - 've':'http://schemas.openxmlformats.org/markup-compatibility/2006', - 'o':'urn:schemas-microsoft-com:office:office', - 'r':'http://schemas.openxmlformats.org/officeDocument/2006/relationships', - 'm':'http://schemas.openxmlformats.org/officeDocument/2006/math', - 'v':'urn:schemas-microsoft-com:vml', - 'w':'http://schemas.openxmlformats.org/wordprocessingml/2006/main', - 'w10':'urn:schemas-microsoft-com:office:word', - 'wne':'http://schemas.microsoft.com/office/word/2006/wordml', - # Drawing - 'wp':'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing', - 'a':'http://schemas.openxmlformats.org/drawingml/2006/main', - 'pic':'http://schemas.openxmlformats.org/drawingml/2006/picture', - # Properties (core and extended) - 'cp':"http://schemas.openxmlformats.org/package/2006/metadata/core-properties", - 'dc':"http://purl.org/dc/elements/1.1/", - 'dcterms':"http://purl.org/dc/terms/", - 'dcmitype':"http://purl.org/dc/dcmitype/", - 'xsi':"http://www.w3.org/2001/XMLSchema-instance", - 'ep':'http://schemas.openxmlformats.org/officeDocument/2006/extended-properties', - # Content Types (we're just making up our own namespaces here to save time) - 'ct':'http://schemas.openxmlformats.org/package/2006/content-types', - # Package Relationships (we're just making up our own namespaces here to save time) - 'pr':'http://schemas.openxmlformats.org/package/2006/relationships' - } - -def opendocx(file): - '''Open a docx file, return a document XML tree''' - mydoc = zipfile.ZipFile(file) - xmlcontent = mydoc.read('word/document.xml') - document = etree.fromstring(xmlcontent) - return document - -def newdocument(): - document = makeelement('document') - document.append(makeelement('body')) - return document - -def makeelement(tagname,tagtext=None,nsprefix='w',attributes=None,attrnsprefix=None): - '''Create an element & return 
it''' - # Deal with list of nsprefix by making namespacemap - namespacemap = None - if isinstance(nsprefix, list): - namespacemap = {} - for prefix in nsprefix: - namespacemap[prefix] = nsprefixes[prefix] - nsprefix = nsprefix[0] # FIXME: rest of code below expects a single prefix - if nsprefix: - namespace = '{'+nsprefixes[nsprefix]+'}' - else: - # For when namespace = None - namespace = '' - newelement = etree.Element(namespace+tagname, nsmap=namespacemap) - # Add attributes with namespaces - if attributes: - # If they haven't bothered setting attribute namespace, use an empty string - # (equivalent of no namespace) - if not attrnsprefix: - # Quick hack: it seems every element that has a 'w' nsprefix for its tag uses the same prefix for it's attributes - if nsprefix == 'w': - attributenamespace = namespace - else: - attributenamespace = '' - else: - attributenamespace = '{'+nsprefixes[attrnsprefix]+'}' - - for tagattribute in attributes: - newelement.set(attributenamespace+tagattribute, attributes[tagattribute]) - if tagtext: - newelement.text = tagtext - return newelement - -def pagebreak(type='page', orient='portrait'): - '''Insert a break, default 'page'. - See http://openxmldeveloper.org/forums/thread/4075.aspx - Return our page break element.''' - # Need to enumerate different types of page breaks. - validtypes = ['page', 'section'] - if type not in validtypes: - raise ValueError('Page break style "%s" not implemented. Valid styles: %s.' 
% (type, validtypes)) - pagebreak = makeelement('p') - if type == 'page': - run = makeelement('r') - br = makeelement('br',attributes={'type':type}) - run.append(br) - pagebreak.append(run) - elif type == 'section': - pPr = makeelement('pPr') - sectPr = makeelement('sectPr') - if orient == 'portrait': - pgSz = makeelement('pgSz',attributes={'w':'12240','h':'15840'}) - elif orient == 'landscape': - pgSz = makeelement('pgSz',attributes={'h':'12240','w':'15840', 'orient':'landscape'}) - sectPr.append(pgSz) - pPr.append(sectPr) - pagebreak.append(pPr) - return pagebreak - -def paragraph(paratext,style='BodyText',breakbefore=False,jc='left'): - '''Make a new paragraph element, containing a run, and some text. - Return the paragraph element. - - @param string jc: Paragraph alignment, possible values: - left, center, right, both (justified), ... - see http://www.schemacentral.com/sc/ooxml/t-w_ST_Jc.html - for a full list - - If paratext is a list, spawn multiple run/text elements. - Support text styles (paratext must then be a list of lists in the form - /