Contact
CoCalc Logo Icon
StoreFeaturesDocsShareSupport News AboutSign UpSign In
| Download
Views: 39560
1
#!/usr/bin/python
2
###############################################################################
3
#
4
# CoCalc: Collaborative Calculation in the Cloud
5
#
6
# Copyright (C) 2016, Sagemath Inc.
7
#
8
# This program is free software: you can redistribute it and/or modify
9
# it under the terms of the GNU General Public License as published by
10
# the Free Software Foundation, either version 3 of the License, or
11
# (at your option) any later version.
12
#
13
# This program is distributed in the hope that it will be useful,
14
# but WITHOUT ANY WARRANTY; without even the implied warranty of
15
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
# GNU General Public License for more details.
17
#
18
# You should have received a copy of the GNU General Public License
19
# along with this program. If not, see <http://www.gnu.org/licenses/>.
20
#
21
###############################################################################
22
23
24
25
# -*- coding: utf-8 -*-
26
"""
27
Open and modify Microsoft Word 2007 docx files (called 'OpenXML' and
28
'Office OpenXML' by Microsoft)
29
30
Part of Python's docx module - http://github.com/mikemaccana/python-docx
31
See LICENSE for licensing information.
32
"""
33
34
import logging
35
from lxml import etree
36
try:
37
from PIL import Image
38
except ImportError:
39
import Image
40
import zipfile
41
import shutil
42
import re
43
import time
44
import os
45
from os.path import join
46
47
log = logging.getLogger(__name__)
48
49
# Locate the template tree that ships with this module. An installed copy of
# docx keeps it in 'docx-template' next to this file; a development checkout
# keeps it in a plain 'template' directory, which is used whenever the
# installed location does not exist.
template_dir = join(os.path.dirname(__file__), 'docx-template')
if not os.path.isdir(template_dir):
    template_dir = join(os.path.dirname(__file__), 'template')
54
55
# Prefix -> namespace URI for every namespace used in document.xml, core.xml
# and the other package parts built by this module. lxml itself works with
# the full namespace URI rather than the prefix; the prefixes are kept so
# that markup copied from Word output is easy to reproduce.
nsprefixes = {
    'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main',
    'o': 'urn:schemas-microsoft-com:office:office',
    've': 'http://schemas.openxmlformats.org/markup-compatibility/2006',

    # Text content
    'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
    'w10': 'urn:schemas-microsoft-com:office:word',
    'wne': 'http://schemas.microsoft.com/office/word/2006/wordml',

    # Drawing
    'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
    'm': 'http://schemas.openxmlformats.org/officeDocument/2006/math',
    'mv': 'urn:schemas-microsoft-com:mac:vml',
    'pic': 'http://schemas.openxmlformats.org/drawingml/2006/picture',
    'v': 'urn:schemas-microsoft-com:vml',
    'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',

    # Properties (core and extended)
    'cp': 'http://schemas.openxmlformats.org/package/2006/metadata/core-properties',
    'dc': 'http://purl.org/dc/elements/1.1/',
    'ep': 'http://schemas.openxmlformats.org/officeDocument/2006/extended-properties',
    'xsi': 'http://www.w3.org/2001/XMLSchema-instance',

    # Content types
    'ct': 'http://schemas.openxmlformats.org/package/2006/content-types',

    # Package relationships
    'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
    'pr': 'http://schemas.openxmlformats.org/package/2006/relationships',

    # Dublin Core document properties
    'dcmitype': 'http://purl.org/dc/dcmitype/',
    'dcterms': 'http://purl.org/dc/terms/'}
86
87
88
def opendocx(file):
    '''Open a docx file, return a document XML tree.

    @param file: Path of the .docx file (or a file-like object), handed
                 straight to zipfile.ZipFile.

    @return lxml.etree element: Root element parsed from the archive member
                                'word/document.xml'.
    '''
    # The archive is only needed long enough to read a single member; the
    # ``with`` block closes it again so the file handle is not leaked.
    with zipfile.ZipFile(file) as mydoc:
        xmlcontent = mydoc.read('word/document.xml')
    return etree.fromstring(xmlcontent)
94
95
96
def newdocument():
    '''Create an empty document: a 'document' element holding one 'body'.

    Return the 'document' element.
    '''
    root = makeelement('document')
    body = makeelement('body')
    root.append(body)
    return root
100
101
102
def makeelement(tagname, tagtext=None, nsprefix='w', attributes=None, attrnsprefix=None):
    '''Create an element, optionally with text and attributes, & return it.

    @param string tagname: Tag name without namespace
    @param string tagtext: Text of the element; a falsy value leaves the
                           text unset
    @param mixed nsprefix: Key of nsprefixes naming the tag namespace, or a
                           list of such keys. With a list, every entry is
                           declared in the element's namespace map and the
                           first entry is used for the tag. A falsy value
                           means no namespace.
    @param dict attributes: Attribute names mapped to their values
    @param string attrnsprefix: Key of nsprefixes naming the attribute
                                namespace. When omitted, attributes share
                                the tag namespace if nsprefix is 'w' and get
                                no namespace otherwise.

    @return lxml.etree element: The new element
    '''
    nsmap = None
    if isinstance(nsprefix, list):
        nsmap = dict((prefix, nsprefixes[prefix]) for prefix in nsprefix)
        # FIXME: everything below works with a single prefix only
        nsprefix = nsprefix[0]

    namespace = '{%s}' % nsprefixes[nsprefix] if nsprefix else ''
    element = etree.Element(namespace + tagname, nsmap=nsmap)

    if attributes:
        if attrnsprefix:
            attrnamespace = '{%s}' % nsprefixes[attrnsprefix]
        elif nsprefix == 'w':
            # Quick hack: elements in the 'w' namespace appear to use that
            # same namespace for their attributes
            attrnamespace = namespace
        else:
            attrnamespace = ''
        for name, value in attributes.items():
            element.set(attrnamespace + name, value)

    if tagtext:
        element.text = tagtext
    return element
136
137
138
def pagebreak(type='page', orient='portrait'):
    '''Insert a break, default 'page'.

    See http://openxmldeveloper.org/forums/thread/4075.aspx

    @param string type: 'page' for a plain page break or 'section' for a
                        section break carrying its own page size
    @param string orient: 'portrait' or 'landscape'; only used when type is
                          'section'

    @return lxml.etree element: The paragraph element holding the break

    @raise ValueError: If type is not supported, or if a section break is
                       requested with an unsupported orient
    '''
    # Need to enumerate different types of page breaks.
    validtypes = ['page', 'section']
    if type not in validtypes:
        tmpl = 'Page break style "%s" not implemented. Valid styles: %s.'
        raise ValueError(tmpl % (type, validtypes))

    # 'pgSz' attributes per orientation (presumably US Letter in twentieths
    # of a point -- TODO confirm).
    pagesizes = {
        'portrait': {'w': '12240', 'h': '15840'},
        'landscape': {'h': '12240', 'w': '15840', 'orient': 'landscape'},
    }
    # Reject an unknown orientation before building anything; it used to
    # surface as an UnboundLocalError further down.
    if type == 'section' and orient not in pagesizes:
        tmpl = 'Page orientation "%s" not implemented. Valid orientations: %s.'
        raise ValueError(tmpl % (orient, sorted(pagesizes)))

    breakelement = makeelement('p')
    if type == 'page':
        run = makeelement('r')
        run.append(makeelement('br', attributes={'type': type}))
        breakelement.append(run)
    else:
        # Section break: paragraph properties wrap the section properties,
        # which in turn carry the page size
        pPr = makeelement('pPr')
        sectPr = makeelement('sectPr')
        sectPr.append(makeelement('pgSz', attributes=pagesizes[orient]))
        pPr.append(sectPr)
        breakelement.append(pPr)
    return breakelement
165
166
167
def paragraph(paratext, style='BodyText', breakbefore=False, jc='left'):
    '''Make a new paragraph element holding one run per piece of text.
    Return the paragraph element.

    @param mixed paratext: Either a single string, or a list whose items are
                           strings or (text, style) pairs. The style of a
                           pair is a string made of the characters 'b'
                           (bold), 'u' (single underline) and 'i' (italic).
    @param string style: Paragraph style name
    @param bool breakbefore: When true, every run gets a
                             lastRenderedPageBreak element before its text
    @param string jc: Paragraph alignment, possible values:
                      left, center, right, both (justified), ...
                      see http://www.schemacentral.com/sc/ooxml/t-w_ST_Jc.html
                      for a full list

    example
    paratext =\
        [ ('some bold text', 'b')
        , ('some normal text', '')
        , ('some italic underlined text', 'iu')
        ]
    '''
    para = makeelement('p')

    # Normalise the input into (text element, style flags) pairs
    if isinstance(paratext, list):
        pieces = []
        for pt in paratext:
            if isinstance(pt, (list, tuple)):
                pieces.append((makeelement('t', tagtext=pt[0]), pt[1]))
            else:
                pieces.append((makeelement('t', tagtext=pt), ''))
    else:
        pieces = [(makeelement('t', tagtext=paratext), '')]

    # Paragraph properties: style first, then alignment
    paraprops = makeelement('pPr')
    paraprops.append(makeelement('pStyle', attributes={'val': style}))
    paraprops.append(makeelement('jc', attributes={'val': jc}))
    para.append(paraprops)

    # Style flag -> (run property tag, attributes), in emission order
    runstyles = (('b', 'b', None),
                 ('u', 'u', {'val': 'single'}),
                 ('i', 'i', None))

    for textelement, flags in pieces:
        run = makeelement('r')
        runprops = makeelement('rPr')
        for flag, tag, attrs in runstyles:
            if flags.find(flag) > -1:
                runprops.append(makeelement(tag, attributes=attrs))
        run.append(runprops)
        # lastRenderedPageBreak lets assistive technologies such as document
        # narrators know that a page break occurred.
        if breakbefore:
            run.append(makeelement('lastRenderedPageBreak'))
        run.append(textelement)
        para.append(run)
    return para
231
232
233
def contenttypes():
    '''Build the content types part: a 'Types' element with one 'Override'
    per known package part and one 'Default' per known file extension.

    Return the 'Types' element.
    '''
    types = etree.fromstring(
        '<Types xmlns="http://schemas.openxmlformats.org/package/2006/'
        'content-types"></Types>')

    officedoc = 'application/vnd.openxmlformats-officedocument.'
    wordml = officedoc + 'wordprocessingml.'
    parts = {
        '/word/theme/theme1.xml': officedoc + 'theme+xml',
        '/word/fontTable.xml': wordml + 'fontTable+xml',
        '/docProps/core.xml': 'application/vnd.openxmlformats-package.'
                              'core-properties+xml',
        '/docProps/app.xml': officedoc + 'extended-properties+xml',
        '/word/document.xml': wordml + 'document.main+xml',
        '/word/settings.xml': wordml + 'settings+xml',
        '/word/numbering.xml': wordml + 'numbering+xml',
        '/word/styles.xml': wordml + 'styles+xml',
        '/word/webSettings.xml': wordml + 'webSettings+xml'}
    for partname, contenttype in parts.items():
        override = makeelement('Override', nsprefix=None,
                               attributes={'PartName': partname,
                                           'ContentType': contenttype})
        types.append(override)

    # Add support for filetypes
    filetypes = {
        'gif': 'image/gif',
        'jpeg': 'image/jpeg',
        'jpg': 'image/jpeg',
        'png': 'image/png',
        'rels': 'application/vnd.openxmlformats-package.relationships+xml',
        'xml': 'application/xml'}
    for extension, contenttype in filetypes.items():
        default = makeelement('Default', nsprefix=None,
                              attributes={'Extension': extension,
                                          'ContentType': contenttype})
        types.append(default)
    return types
272
273
274
def heading(headingtext, headinglevel, lang='en'):
    '''Make a new heading, return the heading element.

    @param string headingtext: Text of the heading
    @param int headinglevel: Level, appended to the style name
    @param string lang: Selects the style name prefix: 'en' gives
                        'Heading', 'it' gives 'Titolo'
    '''
    lmap = {'en': 'Heading', 'it': 'Titolo'}
    stylename = lmap[lang] + str(headinglevel)

    # Paragraph properties carrying the heading style
    props = makeelement('pPr')
    props.append(makeelement('pStyle', attributes={'val': stylename}))

    # A single run with the heading text
    run = makeelement('r')
    run.append(makeelement('t', tagtext=headingtext))

    # Properties first, then the run
    element = makeelement('p')
    element.append(props)
    element.append(run)
    return element
290
291
292
def _tablecell(items, column, colw, cwunit, align, shaded=False):
    '''Build one 'tc' (table cell) element for table().

    items is a single string / etree element, or a list or tuple of them.
    Strings are wrapped in a paragraph aligned with align; etree elements
    are appended unchanged. The cell width comes from colw[column] when colw
    is given and is 'auto' otherwise. shaded adds the 'shd' fill used for
    heading cells.
    '''
    cell = makeelement('tc')
    cellprops = makeelement('tcPr')
    if colw:
        wattr = {'w': str(colw[column]), 'type': cwunit}
    else:
        wattr = {'w': '0', 'type': 'auto'}
    cellprops.append(makeelement('tcW', attributes=wattr))
    if shaded:
        cellprops.append(makeelement('shd', attributes={'val': 'clear',
                                                        'color': 'auto',
                                                        'fill': 'FFFFFF',
                                                        'themeFill': 'text2',
                                                        'themeFillTint': '99'}))
    cell.append(cellprops)
    # Paragraph (Content)
    if not isinstance(items, (list, tuple)):
        items = [items]
    for item in items:
        if isinstance(item, etree._Element):
            cell.append(item)
        else:
            cell.append(paragraph(item, jc=align))
    return cell


def table(contents, heading=True, colw=None, cwunit='dxa', tblw=0, twunit='auto', borders=None, celstyle=None):
    """
    Return a table element based on specified parameters

    @param list contents: A list of lists describing contents. Every item in
                          the list can be a string or a valid XML element
                          itself. It can also be a list. In that case all the
                          listed elements will be merged into the cell.
                          The first row determines the number of columns,
                          so contents must not be empty.
    @param bool heading:  Tells whether first line should be treated as
                          heading or not
    @param list colw:     list of integer column widths specified in cwunit.
    @param str  cwunit:   Unit used for column width:
                            'pct'  : fiftieths of a percent
                            'dxa'  : twentieths of a point
                            'nil'  : no width
                            'auto' : automagically determined
    @param int  tblw:     Table width
    @param str  twunit:   Unit used for table width. Same possible values as
                          cwunit.
    @param dict borders:  Dictionary defining table border. Supported keys
                          are: 'top', 'left', 'bottom', 'right',
                          'insideH', 'insideV', 'all'.
                          When specified, the 'all' key has precedence over
                          others. Each key must define a dict of border
                          attributes:
                            color : The color of the border, in hex or
                                    'auto'
                            space : The space, measured in points
                            sz    : The size of the border, in eighths of
                                    a point
                            val   : The style of the border, see
                http://www.schemacentral.com/sc/ooxml/t-w_ST_Border.htm
                          None (the default) means no borders.
    @param list celstyle: Specify the style for each column, list of dicts.
                          supported keys:
                          'align' : specify the alignment, see paragraph
                                    documentation.
                          Columns without an entry are left aligned.
    @return lxml.etree:   Generated XML etree element
    """
    # A fresh dict per call instead of a shared mutable default argument
    if borders is None:
        borders = {}
    # Border attribute values are converted to text; 'unicode' only exists
    # on Python 2
    try:
        text_type = unicode
    except NameError:
        text_type = str

    tbl = makeelement('tbl')
    columns = len(contents[0])

    # Table properties
    tableprops = makeelement('tblPr')
    tableprops.append(makeelement('tblStyle', attributes={'val': ''}))
    tableprops.append(makeelement('tblW', attributes={'w': str(tblw),
                                                      'type': str(twunit)}))
    if borders:
        tableborders = makeelement('tblBorders')
        for side in ['top', 'left', 'bottom', 'right', 'insideH', 'insideV']:
            if side in borders or 'all' in borders:
                key = 'all' if 'all' in borders else side
                attrs = dict((name, text_type(value))
                             for name, value in borders[key].items())
                tableborders.append(makeelement(side, attributes=attrs))
        tableprops.append(tableborders)
    tableprops.append(makeelement('tblLook', attributes={'val': '0400'}))
    tbl.append(tableprops)

    # Table Grid
    tablegrid = makeelement('tblGrid')
    for i in range(columns):
        gridwidth = str(colw[i]) if colw else '2390'
        tablegrid.append(makeelement('gridCol', attributes={'w': gridwidth}))
    tbl.append(tablegrid)

    # Heading Row. The loop uses its own variable so that the 'heading'
    # argument is still intact when the content rows are selected below.
    if heading:
        row = makeelement('tr')
        rowprops = makeelement('trPr')
        rowprops.append(makeelement('cnfStyle',
                                    attributes={'val': '000000100000'}))
        row.append(rowprops)
        for i, headcell in enumerate(contents[0]):
            row.append(_tablecell(headcell, i, colw, cwunit, 'center',
                                  shaded=True))
        tbl.append(row)
        contentrows = contents[1:]
    else:
        contentrows = contents

    # Contents Rows
    for contentrow in contentrows:
        row = makeelement('tr')
        for i, content in enumerate(contentrow):
            align = 'left'
            if celstyle and i < len(celstyle) and 'align' in celstyle[i]:
                align = celstyle[i]['align']
            row.append(_tablecell(content, i, colw, cwunit, align))
        tbl.append(row)
    return tbl
424
425
426
def picture(relationshiplist, picname, picdescription, pixelwidth=None, pixelheight=None, nochangeaspect=True, nochangearrowheads=True):
    '''Take a relationshiplist, picture file name, and return a paragraph
    containing the image and an updated relationshiplist.

    The picture file is copied into the 'word/media' directory of the
    template tree and a new image relationship is appended to
    relationshiplist (the list is modified in place and also returned).

    @param list relationshiplist: Relationships as [type, target] pairs
    @param string picname: Path of the picture file. Only its base name is
                           used inside the document.
    @param string picdescription: Description stored on the picture
    @param int pixelwidth: Width in pixels
    @param int pixelheight: Height in pixels. Unless both width and height
                            are given, both are read from the picture file.
    @param bool nochangeaspect: Value of the 'noChangeAspect' picture lock
    @param bool nochangearrowheads: Value of the 'noChangeArrowheads'
                                    picture lock

    @return tuple: (relationshiplist, paragraph element)
    '''
    # http://openxmldeveloper.org/articles/462.aspx

    # Copy the file into the media dir. The source may live in another
    # directory, so keep its full path for reading and use only the base
    # name for the copy and for the relationship target.
    media_dir = join(template_dir, 'word', 'media')
    if not os.path.isdir(media_dir):
        os.mkdir(media_dir)
    picpath = picname
    picname = os.path.basename(picpath)
    shutil.copyfile(picpath, join(media_dir, picname))

    # Check if the user has specified a size
    if not pixelwidth or not pixelheight:
        # If not, get info from the picture itself
        pixelwidth, pixelheight = Image.open(picpath).size[0:2]

    # OpenXML measures on-screen objects in English Metric Units
    # 1cm = 360000 EMUs
    emuperpixel = 12667
    width = str(pixelwidth * emuperpixel)
    height = str(pixelheight * emuperpixel)

    # Set relationship ID to the first available
    picid = '2'
    picrelid = 'rId'+str(len(relationshiplist)+1)
    relationshiplist.append([
        'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image',
        'media/'+picname])

    # There are 3 main elements inside a picture
    # 1. The Blipfill - specifies how the image fills the picture area
    #    (stretch, tile, etc.)
    blipfill = makeelement('blipFill', nsprefix='pic')
    blipfill.append(makeelement('blip', nsprefix='a', attrnsprefix='r',
                                attributes={'embed': picrelid}))
    stretch = makeelement('stretch', nsprefix='a')
    stretch.append(makeelement('fillRect', nsprefix='a'))
    blipfill.append(makeelement('srcRect', nsprefix='a'))
    blipfill.append(stretch)

    # 2. The non visual picture properties
    nvpicpr = makeelement('nvPicPr', nsprefix='pic')
    cnvpr = makeelement('cNvPr', nsprefix='pic',
                        attributes={'id': '0', 'name': 'Picture 1',
                                    'descr': picname})
    nvpicpr.append(cnvpr)
    cnvpicpr = makeelement('cNvPicPr', nsprefix='pic')
    cnvpicpr.append(makeelement(
        'picLocks', nsprefix='a',
        attributes={'noChangeAspect': str(int(nochangeaspect)),
                    'noChangeArrowheads': str(int(nochangearrowheads))}))
    nvpicpr.append(cnvpicpr)

    # 3. The Shape properties
    sppr = makeelement('spPr', nsprefix='pic', attributes={'bwMode': 'auto'})
    xfrm = makeelement('xfrm', nsprefix='a')
    xfrm.append(makeelement('off', nsprefix='a',
                            attributes={'x': '0', 'y': '0'}))
    xfrm.append(makeelement('ext', nsprefix='a',
                            attributes={'cx': width, 'cy': height}))
    prstgeom = makeelement('prstGeom', nsprefix='a',
                           attributes={'prst': 'rect'})
    prstgeom.append(makeelement('avLst', nsprefix='a'))
    sppr.append(xfrm)
    sppr.append(prstgeom)

    # Add our 3 parts to the picture element
    pic = makeelement('pic', nsprefix='pic')
    pic.append(nvpicpr)
    pic.append(blipfill)
    pic.append(sppr)

    # Now make the supporting elements
    # The following sequence is just: make element, then add its children
    graphicdata = makeelement(
        'graphicData', nsprefix='a',
        attributes={'uri': 'http://schemas.openxmlforma'
                           'ts.org/drawingml/2006/picture'})
    graphicdata.append(pic)
    graphic = makeelement('graphic', nsprefix='a')
    graphic.append(graphicdata)

    framelocks = makeelement('graphicFrameLocks', nsprefix='a',
                             attributes={'noChangeAspect': '1'})
    framepr = makeelement('cNvGraphicFramePr', nsprefix='wp')
    framepr.append(framelocks)
    docpr = makeelement('docPr', nsprefix='wp',
                        attributes={'id': picid, 'name': 'Picture 1',
                                    'descr': picdescription})
    effectextent = makeelement('effectExtent', nsprefix='wp',
                               attributes={'l': '25400', 't': '0', 'r': '0',
                                           'b': '0'})
    extent = makeelement('extent', nsprefix='wp',
                         attributes={'cx': width, 'cy': height})
    inline = makeelement('inline', attributes={'distT': "0", 'distB': "0",
                                               'distL': "0", 'distR': "0"},
                         nsprefix='wp')
    inline.append(extent)
    inline.append(effectextent)
    inline.append(docpr)
    inline.append(framepr)
    inline.append(graphic)
    drawing = makeelement('drawing')
    drawing.append(inline)
    run = makeelement('r')
    run.append(drawing)
    picparagraph = makeelement('p')
    picparagraph.append(run)
    return relationshiplist, picparagraph
529
530
531
def search(document, search):
    '''Search a document for a regex, return success / fail result.

    Only the text of individual 't' (text) elements is examined, so a
    pattern spanning several text elements is not found.

    @param instance document: The document to search
    @param str search: The regular expression to look for

    @return bool: True if any text element matches, False otherwise
    '''
    pattern = re.compile(search)
    texttag = '{%s}t' % nsprefixes['w']
    for element in document.iter():
        if element.tag != texttag or not element.text:
            continue
        if pattern.search(element.text):
            return True
    return False
541
542
543
def replace(document, search, replace):
    '''Replace all occurrences of a regex with a different string, return
    the updated document.

    The substitution is applied to each 't' (text) element on its own and
    modifies the document in place; the same document object is returned.

    @param instance document: The document to modify
    @param str search: The regular expression to look for
    @param str replace: The replacement text
    '''
    pattern = re.compile(search)
    texttag = '{%s}t' % nsprefixes['w']
    for element in document.iter():
        if element.tag != texttag or not element.text:
            continue
        if pattern.search(element.text):
            element.text = pattern.sub(replace, element.text)
    return document
553
554
555
def clean(document):
    """ Perform misc cleaning operations on documents.

    Removes 't' (text) elements, and afterwards 'r' (run) elements, that
    have neither text nor children. The document is modified in place.

    Returns cleaned document.
    """
    # Text elements go first, so that runs left empty by their removal are
    # caught by the second pass.
    for name in ('t', 'r'):
        wanted = '{%s}%s' % (nsprefixes['w'], name)
        # Collect before removing, so the tree is not changed while it is
        # being iterated
        empties = [element for element in document.iter()
                   if element.tag == wanted
                   and not element.text and not len(element)]
        for element in empties:
            element.getparent().remove(element)
    return document
573
574
575
def findTypeParent(element, tag):
    """ Finds first parent of element of the given type

    @param object element: etree element
    @param string tag: the tag of the parent to search for

    @return object element: the found parent or None when not found
    """
    p = element.getparent()
    # Compare against None explicitly: the walk ends when getparent() runs
    # out of ancestors, and an lxml element without children is itself
    # falsy, so a plain truth test would stop too early.
    while p is not None:
        if p.tag == tag:
            return p
        p = p.getparent()

    # Not found
    return None
592
593
594
def AdvSearch(document, search, bs=3):
    '''Return set of all regex matches

    This is an advanced version of python-docx.search() that takes into
    account blocks of <bs> elements at a time.

    What it does:
    It walks all non-empty text blocks of the document.
    Since the text to search could be spread across multiple text blocks,
    the last <bs> blocks seen are kept, and every group of consecutive
    blocks among them is joined and searched, shortest groups first.
    For each block visited, the match of the first matching group (if any)
    is recorded. The document is not modified.

    Examples:
    original text blocks : [ 'Hel', 'lo,', ' world!' ]
    search               : 'Hello,'
    output blocks        : [ 'Hello,' ]

    original text blocks : [ 'Hel', 'lo', ' __', 'name', '__!' ]
    search               : '(__[a-z]+__)'
    output blocks        : [ '__name__' ]

    @param instance document: The original document
    @param str      search:   The text to search for (regexp)
    @param int      bs:       See above

    @return set               All occurrences of search string
    '''
    pattern = re.compile(search)
    texttag = '{%s}t' % nsprefixes['w']
    matches = []

    # The most recent text elements seen, at most bs of them
    window = []

    for element in document.iter():
        if element.tag != texttag or not element.text:
            continue

        window.append(element)
        if len(window) > bs:
            # Too long: drop the oldest element
            window.pop(0)

        # Try every run of consecutive elements in the window, from the
        # shortest runs to the longest, and stop at the first match.
        for length in range(1, len(window) + 1):
            for start in range(len(window) - length + 1):
                joined = ''.join(
                    el.text for el in window[start:start + length])
                hit = pattern.search(joined)
                if hit:
                    matches.append(hit.group())
                    break
            else:
                # No run of this length matched, try longer runs
                continue
            break
    return set(matches)
668
669
670
def advReplace(document, search, replace, bs=3):
    """
    Replace all occurrences of string with a different string, return updated
    document

    This is a modified version of python-docx.replace() that takes into
    account blocks of <bs> elements at a time. The replacement can be a
    string, an xml etree element, or a list of etree elements.

    What it does:
    It searches the entire document body for text blocks.
    Then scans those text blocks for the search expression.
    Since the text to search could be spread across multiple text blocks,
    we need to adopt some sort of algorithm to handle this situation.
    The smallest matching group of blocks (up to bs) is then adopted.
    If the matching group has more than one block, the whole replaced text
    is put in the block where the match starts and the other blocks of the
    group are cleared.

    Examples:
    original text blocks : [ 'Hel', 'lo,', ' world!' ]
    search / replace: 'Hello,' / 'Hi!'
    output blocks : [ 'Hi!', '', ' world!' ]

    original text blocks : [ 'Hel', 'lo,', ' world!' ]
    search / replace: 'Hello, world' / 'Hi!'
    output blocks : [ 'Hi!!', '', '' ]

    original text blocks : [ 'Hel', 'lo,', ' world!' ]
    search / replace: 'Hel' / 'Hal'
    output blocks : [ 'Hal', 'lo,', ' world!' ]

    @param instance  document: The original document
    @param str       search: The text to search for (regexp)
    @param mixed     replace: The replacement text or lxml.etree element to
                         append, or a list of etree elements
    @param int       bs: See above

    @return instance The document with replacement applied (the same object
                     that was passed in, modified in place)

    """
    # Enables debug output
    DEBUG = False

    newdocument = document

    # Compile the search regexp
    searchre = re.compile(search)

    # Will match against searchels. Searchels is a list that contains the
    # last n text elements found in the document. 1 <= n <= bs
    searchels = []

    for element in newdocument.iter():
        if element.tag == '{%s}t' % nsprefixes['w']:  # t (text) elements
            if element.text:
                # Add this element to searchels
                searchels.append(element)
                if len(searchels) > bs:
                    # If searchels is too long, remove the first element
                    searchels.pop(0)

                # Search all combinations of searchels, starting from
                # smaller up to bigger ones
                # l = search length
                # s = search start
                # e = element IDs to merge
                found = False
                for l in range(1, len(searchels)+1):
                    if found:
                        break
                    for s in range(len(searchels)):
                        if found:
                            break
                        if s+l <= len(searchels):
                            e = range(s, s+l)
                            txtsearch = ''
                            for k in e:
                                txtsearch += searchels[k].text

                            # Search for the text in the whole txtsearch
                            match = searchre.search(txtsearch)
                            if match:
                                found = True

                                # I've found something :)
                                if DEBUG:
                                    log.debug("Found element!")
                                    log.debug("Search regexp: %s", searchre.pattern)
                                    log.debug("Requested replacement: %s", replace)
                                    log.debug("Matched text: %s", txtsearch)
                                    log.debug("Matched text (splitted): %s",
                                              [el.text for el in searchels])
                                    log.debug("Matched at position: %s", match.start())
                                    log.debug("matched in elements: %s", e)
                                    if isinstance(replace, etree._Element):
                                        log.debug("Will replace with XML CODE")
                                    elif isinstance(replace, (list, tuple)):
                                        log.debug("Will replace with LIST OF ELEMENTS")
                                    else:
                                        log.debug("Will replace with: %s",
                                                  re.sub(search, replace, txtsearch))

                                curlen = 0
                                replaced = False
                                for i in e:
                                    curlen += len(searchels[i].text)
                                    if curlen > match.start() and not replaced:
                                        # The match occurred in THIS element.
                                        # Put in the whole replaced text
                                        if isinstance(replace, etree._Element):
                                            # Convert to a list and process it later
                                            replace = [replace]
                                        if isinstance(replace, (list, tuple)):
                                            # I'm replacing with a list of etree elements:
                                            # clear the matched text in the tag and insert
                                            # the elements after the parent paragraph
                                            # (because t elements cannot have children)
                                            p = findTypeParent(searchels[i], '{%s}p' % nsprefixes['w'])
                                            searchels[i].text = re.sub(search, '', txtsearch)
                                            insindex = p.getparent().index(p) + 1
                                            for r in replace:
                                                p.getparent().insert(insindex, r)
                                                insindex += 1
                                        else:
                                            # Replacing with pure text
                                            searchels[i].text = re.sub(search, replace, txtsearch)
                                        replaced = True
                                        log.debug("Replacing in element #: %s", i)
                                    else:
                                        # Clears the other text elements
                                        searchels[i].text = ''
    return newdocument
802
803
804
def getdocumenttext(document):
    '''Return the raw text of a document, as a list of paragraphs.

    Each entry is the concatenated text of one 'p' (paragraph) element,
    with every 'tab' element rendered as a tab character. Paragraphs
    without any text are left out.
    '''
    wordns = '{' + nsprefixes['w'] + '}'
    paratag = wordns + 'p'
    texttag = wordns + 't'
    tabtag = wordns + 'tab'

    # Compile a list of all paragraph (p) elements
    paralist = [element for element in document.iter()
                if element.tag == paratag]

    # Since a single sentence might be spread over multiple text elements,
    # gather all text (t) descendants of each paragraph into one string.
    paratextlist = []
    for para in paralist:
        chunks = []
        for element in para.iter():
            if element.tag == texttag:
                if element.text:
                    chunks.append(element.text)
            elif element.tag == tabtag:
                chunks.append('\t')
        paratext = u''.join(chunks)
        # Only keep paragraphs that actually carry text
        if paratext:
            paratextlist.append(paratext)
    return paratextlist
829
830
831
def coreproperties(title, subject, creator, keywords, lastmodifiedby=None):
    '''Create core properties (common document properties referred to in the
    'Dublin Core' specification). See appproperties() for other stuff.

    @param string title: Document title
    @param string subject: Document subject
    @param string creator: Document creator
    @param list keywords: Keyword strings, stored joined with commas
    @param string lastmodifiedby: Last modifier; defaults to creator

    @return lxml.etree element: The 'coreProperties' element
    '''
    coreprops = makeelement('coreProperties', nsprefix='cp')
    coreprops.append(makeelement('title', tagtext=title, nsprefix='dc'))
    coreprops.append(makeelement('subject', tagtext=subject, nsprefix='dc'))
    coreprops.append(makeelement('creator', tagtext=creator, nsprefix='dc'))
    coreprops.append(makeelement('keywords', tagtext=','.join(keywords), nsprefix='cp'))
    if not lastmodifiedby:
        lastmodifiedby = creator
    coreprops.append(makeelement('lastModifiedBy', tagtext=lastmodifiedby, nsprefix='cp'))
    coreprops.append(makeelement('revision', tagtext='1', nsprefix='cp'))
    coreprops.append(makeelement('category', tagtext='Examples', nsprefix='cp'))
    coreprops.append(makeelement('description', tagtext='Examples', nsprefix='dc'))
    # The trailing 'Z' marks the timestamp as UTC, so it has to be formatted
    # from UTC time rather than from the local time strftime uses by default.
    currenttime = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())
    # Document creation and modify times
    # Prob here: we have an attribute whose name uses one namespace, and that
    # attribute's value uses another namespace.
    # We're creating the element from a string as a workaround...
    template = ('<dcterms:%(tag)s xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"'
                ' xmlns:dcterms="http://purl.org/dc/terms/"'
                ' xsi:type="dcterms:W3CDTF">%(time)s</dcterms:%(tag)s>')
    for doctime in ['created', 'modified']:
        coreprops.append(etree.fromstring(
            template % {'tag': doctime, 'time': currenttime}))
    return coreprops
854
855
856
def appproperties():
    """
    Create app-specific properties. See coreproperties() for more common
    document properties.

    @return lxml.etree element: The 'Properties' element, holding one child
                                per fixed property below

    """
    # The literal is bytes because it carries an XML encoding declaration,
    # which lxml refuses in unicode (Python 3 str) input.
    appprops = etree.fromstring(
        b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?><Properties x'
        b'mlns="http://schemas.openxmlformats.org/officeDocument/2006/extended'
        b'-properties" xmlns:vt="http://schemas.openxmlformats.org/officeDocum'
        b'ent/2006/docPropsVTypes"></Properties>')
    props =\
        {'Template': 'Normal.dotm',
         'TotalTime': '6',
         'Pages': '1',
         'Words': '83',
         'Characters': '475',
         'Application': 'Microsoft Word 12.0.0',
         'DocSecurity': '0',
         'Lines': '12',
         'Paragraphs': '8',
         'ScaleCrop': 'false',
         'LinksUpToDate': 'false',
         'CharactersWithSpaces': '583',
         'SharedDoc': 'false',
         'HyperlinksChanged': 'false',
         'AppVersion': '12.0000'}
    for prop in props:
        appprops.append(makeelement(prop, tagtext=props[prop], nsprefix=None))
    return appprops
887
888
889
def websettings():
    '''Build the webSettings element with its two fixed child options.'''
    root = makeelement('webSettings')
    # Children are appended in this exact order.
    for tag in ('allowPNG', 'doNotSaveAsSingleFile'):
        root.append(makeelement(tag))
    return root
895
896
897
def relationshiplist():
    '''Return the default relationships as a list of [type URI, target] pairs.

    A fresh list of two-item lists is built on every call, so callers may
    modify the result freely.
    '''
    base = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/'
    # (relationship kind, target path) in the order they are returned.
    parts = (
        ('numbering', 'numbering.xml'),
        ('styles', 'styles.xml'),
        ('settings', 'settings.xml'),
        ('webSettings', 'webSettings.xml'),
        ('fontTable', 'fontTable.xml'),
        ('theme', 'theme/theme1.xml'),
    )
    return [[base + kind, target] for kind, target in parts]
912
913
914
def wordrelationships(relationshiplist):
    '''Build the Relationships element for the given relationship entries.

    Each entry of ``relationshiplist`` is indexed as ``entry[0]`` (the Type
    URI) and ``entry[1]`` (the Target); one Relationship child is appended
    per entry, in order.
    '''
    # FIXME: using string hack instead of making element
    root = etree.fromstring(
        '<Relationships xmlns="http://schemas.openxmlformats.org/package/2006'
        '/relationships"></Relationships>')
    # Relationship IDs (rId) start at 1.
    for number, entry in enumerate(relationshiplist, 1):
        attrs = {'Id': 'rId' + str(number),
                 'Type': entry[0],
                 'Target': entry[1]}
        root.append(makeelement('Relationship', nsprefix=None, attributes=attrs))
    return root
933
934
935
def savedocx(document, coreprops, appprops, contenttypes, websettings, wordrelationships, output):
    '''Save a modified document

    Serializes the six element trees into their fixed archive paths inside
    a new zip written to ``output``, then adds every file found under the
    template directory (except ignored nuisance files).

    The working directory is switched to the template directory while the
    template is walked; it is restored, and the archive closed, even when
    an error occurs part-way through.
    '''
    assert os.path.isdir(template_dir)
    # Open the archive before changing directory so that a relative
    # ``output`` path is resolved against the caller's working directory.
    docxfile = zipfile.ZipFile(output, mode='w', compression=zipfile.ZIP_DEFLATED)
    prev_dir = os.path.abspath('.')  # save previous working dir
    try:
        # Move to the template data path
        os.chdir(template_dir)

        # Serialize our trees into our zip file
        treesandfiles = (
            (document, 'word/document.xml'),
            (coreprops, 'docProps/core.xml'),
            (appprops, 'docProps/app.xml'),
            (contenttypes, '[Content_Types].xml'),
            (websettings, 'word/webSettings.xml'),
            (wordrelationships, 'word/_rels/document.xml.rels'),
        )
        for tree, archivepath in treesandfiles:
            log.info('Saving: %s', archivepath)
            treestring = etree.tostring(tree, pretty_print=True)
            docxfile.writestr(archivepath, treestring)

        # Add & compress support files
        files_to_ignore = ['.DS_Store']  # nuisance from some os's
        for dirpath, dirnames, filenames in os.walk('.'):
            for filename in filenames:
                if filename in files_to_ignore:
                    continue
                templatefile = join(dirpath, filename)
                # Drop the leading './' that walking '.' puts on every path.
                archivename = templatefile[2:]
                log.info('Saving: %s', archivename)
                docxfile.write(templatefile, archivename)
        log.info('Saved new file to: %r', output)
    finally:
        try:
            docxfile.close()
        finally:
            os.chdir(prev_dir)  # restore previous working dir
970
971
972
import sys
973
def main():
    """Extract the text of a .docx file into a UTF-8 encoded text file.

    Command line: ``<script> input.docx [output.txt]``. When no output name
    is given, the input name with its extension replaced by ``.txt`` is
    used. With no arguments at all a usage message is printed. An existing
    output file is never overwritten: a warning is printed and the process
    exits with status 0.
    """
    # Function-scope import: io.open gives an encoding-aware text file on
    # both Python 2 and Python 3.
    import io

    # Check the argument count explicitly instead of catching IndexError,
    # so an IndexError raised while opening the document is not misreported
    # as a usage error.
    if len(sys.argv) < 2:
        print('Please supply an input and output file. For example:')
        print(''' example-extracttext.py 'My Office 2007 document.docx' 'outputfile.txt' ''')
        sys.exit()

    if len(sys.argv) == 2:
        newfilename = os.path.splitext(sys.argv[1])[0] + '.txt'
    else:
        newfilename = sys.argv[2]
    if os.path.exists(newfilename):
        print("WARNING: %s already exists; doing nothing."%newfilename)
        sys.exit(0)

    document = opendocx(sys.argv[1])

    # Fetch all the text out of the document before creating the output
    # file, so a failed extraction does not leave an empty file behind.
    paratextlist = getdocumenttext(document)

    # Write the document's text with two newlines under each paragraph.
    # The file object does the UTF-8 encoding and is closed by the with
    # block, which also flushes the data to disk.
    text = u'\n\n'.join(paratextlist)
    with io.open(newfilename, 'w', encoding='utf-8') as newfile:
        newfile.write(text)
1000
1001
# Run the command-line entry point only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
1003
1004