CoCalc -- docx2txt.py

Path: cocalc/src / smc_pyutil / smc_pyutil / docx2txt.py
Views: ³⁹⁵⁶⁰
1
#!/usr/bin/python
2
###############################################################################
3
#
4
#    CoCalc: Collaborative Calculation in the Cloud
5
#
6
#    Copyright (C) 2016, Sagemath Inc.
7
#
8
#    This program is free software: you can redistribute it and/or modify
9
#    it under the terms of the GNU General Public License as published by
10
#    the Free Software Foundation, either version 3 of the License, or
11
#    (at your option) any later version.
12
#
13
#    This program is distributed in the hope that it will be useful,
14
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
15
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16
#    GNU General Public License for more details.
17
#
18
#    You should have received a copy of the GNU General Public License
19
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
20
#
21
###############################################################################
22

23

24

25
# -*- coding: utf-8 -*-
26
"""
27
Open and modify Microsoft Word 2007 docx files (called 'OpenXML' and
28
'Office OpenXML' by Microsoft)
29

30
Part of Python's docx module - http://github.com/mikemaccana/python-docx
31
See LICENSE for licensing information.
32
"""
33

34
import logging
35
from lxml import etree
36
try:
37
    from PIL import Image
38
except ImportError:
39
    import Image
40
import zipfile
41
import shutil
42
import re
43
import time
44
import os
45
from os.path import join
46

47
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Locate the docx template directory that sits next to this file. An
# installed copy ships it as 'docx-template'; a development checkout calls
# it 'template'. The installed name wins when that directory exists,
# otherwise the dev name is used (without checking that it exists).
template_dir = os.path.dirname(__file__)
if os.path.isdir(join(template_dir, 'docx-template')):
    template_dir = join(template_dir, 'docx-template')  # installed
else:
    template_dir = join(template_dir, 'template')  # dev
54

55
# Namespace prefixes used in document.xml & core.xml, mapped to their URIs.
# lxml works with the real namespace URI rather than the prefix; the
# prefixes are kept here so markup copied from Word output is easy to match.
nsprefixes = {
    'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main',
    'o': 'urn:schemas-microsoft-com:office:office',
    've': 'http://schemas.openxmlformats.org/markup-compatibility/2006',

    # Text content
    'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
    'w10': 'urn:schemas-microsoft-com:office:word',
    'wne': 'http://schemas.microsoft.com/office/word/2006/wordml',

    # Drawing
    'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
    'm': 'http://schemas.openxmlformats.org/officeDocument/2006/math',
    'mv': 'urn:schemas-microsoft-com:mac:vml',
    'pic': 'http://schemas.openxmlformats.org/drawingml/2006/picture',
    'v': 'urn:schemas-microsoft-com:vml',
    'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',

    # Properties (core and extended)
    'cp': 'http://schemas.openxmlformats.org/package/2006/metadata/core-properties',
    'dc': 'http://purl.org/dc/elements/1.1/',
    'ep': 'http://schemas.openxmlformats.org/officeDocument/2006/extended-properties',
    'xsi': 'http://www.w3.org/2001/XMLSchema-instance',

    # Content types
    'ct': 'http://schemas.openxmlformats.org/package/2006/content-types',

    # Package relationships
    'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
    'pr': 'http://schemas.openxmlformats.org/package/2006/relationships',

    # Dublin Core document properties
    'dcmitype': 'http://purl.org/dc/dcmitype/',
    'dcterms': 'http://purl.org/dc/terms/',
}
86

87

88
def opendocx(file):
    """Open a docx file and return its main document part as an XML tree.

    @param mixed file: Whatever zipfile.ZipFile accepts as its first
                       argument (typically the path of the .docx archive).

    @return lxml.etree element: Root element parsed from the archive
                                member 'word/document.xml'.
    """
    # Use the archive as a context manager so it is closed as soon as the
    # part has been read, even when reading raises.
    with zipfile.ZipFile(file) as docx_archive:
        xmlcontent = docx_archive.read('word/document.xml')
    return etree.fromstring(xmlcontent)
94

95

96
def newdocument():
    """Return a new 'document' element containing a single empty 'body'."""
    root = makeelement('document')
    body = makeelement('body')
    root.append(body)
    return root
100

101

102
def makeelement(tagname, tagtext=None, nsprefix='w', attributes=None, attrnsprefix=None):
    """Create an element and return it.

    @param str   tagname:      Local tag name (without namespace).
    @param str   tagtext:      Text for the element; skipped when falsy.
    @param mixed nsprefix:     Key of nsprefixes giving the tag namespace,
                               None/'' for no namespace, or a list of such
                               keys. With a list, every listed prefix is put
                               in the element's nsmap and the first entry
                               names the tag namespace.
    @param dict  attributes:   Attribute name -> value.
    @param str   attrnsprefix: Key of nsprefixes giving the attribute
                               namespace. When omitted, attributes share the
                               tag namespace if nsprefix is 'w' and are left
                               un-namespaced otherwise.

    @return lxml.etree element: The new element.
    """
    nsmap = None
    if isinstance(nsprefix, list):
        nsmap = dict((prefix, nsprefixes[prefix]) for prefix in nsprefix)
        # FIXME: everything below expects a single prefix
        nsprefix = nsprefix[0]

    # Namespace in lxml's '{uri}' notation, or '' when no prefix applies
    tag_ns = '{%s}' % nsprefixes[nsprefix] if nsprefix else ''
    element = etree.Element(tag_ns + tagname, nsmap=nsmap)

    if attributes:
        if attrnsprefix:
            attr_ns = '{%s}' % nsprefixes[attrnsprefix]
        elif nsprefix == 'w':
            # Quick hack: elements in the 'w' namespace appear to use the
            # same namespace for their attributes
            attr_ns = tag_ns
        else:
            attr_ns = ''
        for name, value in attributes.items():
            element.set(attr_ns + name, value)

    if tagtext:
        element.text = tagtext
    return element
136

137

138
def pagebreak(type='page', orient='portrait'):
    """Build a break paragraph, by default a page break.

    See http://openxmldeveloper.org/forums/thread/4075.aspx

    @param str type:   'page' for a plain page break, or 'section' for a
                       section break carrying a page size.
    @param str orient: 'portrait' or 'landscape'. Only consulted for
                       section breaks.

    @return lxml.etree element: The 'p' element holding the break.

    @raise ValueError: For an unsupported type, or for an unsupported
                       orient on a section break.
    """
    # Need to enumerate different types of page breaks.
    validtypes = ['page', 'section']
    if type not in validtypes:
        tmpl = 'Page break style "%s" not implemented. Valid styles: %s.'
        raise ValueError(tmpl % (type, validtypes))

    # Validate the orientation before building anything: an unknown value
    # used to leave the page size unset and fail with UnboundLocalError.
    validorients = ['portrait', 'landscape']
    if type == 'section' and orient not in validorients:
        tmpl = 'Page orientation "%s" not implemented. Valid orientations: %s.'
        raise ValueError(tmpl % (orient, validorients))

    breakpara = makeelement('p')
    if type == 'page':
        run = makeelement('r')
        run.append(makeelement('br', attributes={'type': type}))
        breakpara.append(run)
    else:
        # Section break: the page size lives in pPr/sectPr/pgSz
        if orient == 'portrait':
            sizeattrs = {'w': '12240', 'h': '15840'}
        else:
            sizeattrs = {'h': '12240', 'w': '15840', 'orient': 'landscape'}
        sectPr = makeelement('sectPr')
        sectPr.append(makeelement('pgSz', attributes=sizeattrs))
        pPr = makeelement('pPr')
        pPr.append(sectPr)
        breakpara.append(pPr)
    return breakpara
165

166

167
def paragraph(paratext, style='BodyText', breakbefore=False, jc='left'):
    """Make a new paragraph element holding one run per piece of text.

    @param mixed  paratext:    A single string, or a list whose items are
                               either plain strings or (text, style) pairs
                               (list or tuple). The style of a pair is a
                               string combining any of the characters
                               'b' (bold), 'u' (single underline) and
                               'i' (italic).
    @param string style:       Value for the paragraph's pStyle.
    @param bool   breakbefore: When true, a lastRenderedPageBreak element is
                               put in every run ahead of its text, so that
                               assistive technologies like document
                               narrators know a page break occurred.
    @param string jc:          Paragraph alignment, possible values:
                               left, center, right, both (justified), ...
                               see http://www.schemacentral.com/sc/ooxml/t-w_ST_Jc.html
                               for a full list

    @return lxml.etree element: The paragraph ('p') element.

    example
    paratext =\
        [ ('some bold text', 'b')
        , ('some normal text', '')
        , ('some italic underlined text', 'iu')
        ]
    """
    # Normalise the input into (text, style flags) pairs
    if isinstance(paratext, list):
        pieces = []
        for item in paratext:
            if isinstance(item, (list, tuple)):
                pieces.append((item[0], item[1]))
            else:
                pieces.append((item, ''))
    else:
        pieces = [(paratext, '')]

    # Paragraph properties: style first, then alignment
    props = makeelement('pPr')
    props.append(makeelement('pStyle', attributes={'val': style}))
    props.append(makeelement('jc', attributes={'val': jc}))

    para = makeelement('p')
    para.append(props)

    # (style flag, element name, element attributes), in output order
    runstyles = (('b', 'b', None),
                 ('u', 'u', {'val': 'single'}),
                 ('i', 'i', None))
    for content, flags in pieces:
        runprops = makeelement('rPr')
        for flag, tagname, attrs in runstyles:
            if flags.find(flag) != -1:
                runprops.append(makeelement(tagname, attributes=attrs))
        run = makeelement('r')
        run.append(runprops)
        if breakbefore:
            run.append(makeelement('lastRenderedPageBreak'))
        run.append(makeelement('t', tagtext=content))
        para.append(run)
    return para
231

232

233
def contenttypes():
    """Build the content-types 'Types' element.

    The element receives one 'Override' child per known package part and
    one 'Default' child per known file extension.

    @return lxml.etree element: The populated 'Types' element.
    """
    types = etree.fromstring(
        '<Types xmlns="http://schemas.openxmlformats.org/package/2006/'
        'content-types"></Types>')

    office = 'application/vnd.openxmlformats-officedocument.'
    wordml = office + 'wordprocessingml.'
    # Part name -> content type
    parts = {
        '/word/theme/theme1.xml': office + 'theme+xml',
        '/word/fontTable.xml': wordml + 'fontTable+xml',
        '/docProps/core.xml': 'application/vnd.openxmlformats-package.core-properties+xml',
        '/docProps/app.xml': office + 'extended-properties+xml',
        '/word/document.xml': wordml + 'document.main+xml',
        '/word/settings.xml': wordml + 'settings+xml',
        '/word/numbering.xml': wordml + 'numbering+xml',
        '/word/styles.xml': wordml + 'styles+xml',
        '/word/webSettings.xml': wordml + 'webSettings+xml'}
    # File extension -> content type
    filetypes = {'gif': 'image/gif',
                 'jpeg': 'image/jpeg',
                 'jpg': 'image/jpeg',
                 'png': 'image/png',
                 'rels': 'application/vnd.openxmlformats-package.relationships+xml',
                 'xml': 'application/xml'}

    # Overrides go in first, then the per-extension defaults
    for tagname, keyattr, mapping in (('Override', 'PartName', parts),
                                      ('Default', 'Extension', filetypes)):
        for key, contenttype in mapping.items():
            types.append(makeelement(tagname, nsprefix=None,
                                     attributes={keyattr: key,
                                                 'ContentType': contenttype}))
    return types
272

273

274
def heading(headingtext, headinglevel, lang='en'):
    """Make a new heading paragraph and return it.

    @param string headingtext:  Text of the heading.
    @param mixed  headinglevel: Level; str() of it is appended to the
                                language's style base name.
    @param string lang:         'en' (style 'Heading<level>') or
                                'it' (style 'Titolo<level>'). Any other
                                value raises KeyError.

    @return lxml.etree element: The heading paragraph ('p') element.
    """
    lmap = {'en': 'Heading', 'it': 'Titolo'}
    stylename = lmap[lang] + str(headinglevel)

    # Paragraph properties carrying the heading style
    props = makeelement('pPr')
    props.append(makeelement('pStyle', attributes={'val': stylename}))

    # A single run with the heading text
    run = makeelement('r')
    run.append(makeelement('t', tagtext=headingtext))

    para = makeelement('p')
    para.append(props)
    para.append(run)
    return para
290

291

292
def table(contents, heading=True, colw=None, cwunit='dxa', tblw=0, twunit='auto', borders={}, celstyle=None):
    """
    Return a table element based on specified parameters

    @param list contents: A list of lists describing contents. Every item in
                          the list can be a string or a valid XML element
                          itself. It can also be a list. In that case all the
                          listed elements will be merged into the cell.
                          Must hold at least one row: the column count is
                          taken from the first row.
    @param bool heading:  Tells whether first line should be treated as
                          heading or not
    @param list colw:     list of integer column widths specified in cwunit.
    @param str  cwunit:   Unit used for column width:
                            'pct'  : fiftieths of a percent
                            'dxa'  : twentieths of a point
                            'nil'  : no width
                            'auto' : automagically determined
    @param int  tblw:     Table width
    @param int  twunit:   Unit used for table width. Same possible values as
                          cwunit.
    @param dict borders:  Dictionary defining table border. Supported keys
                          are: 'top', 'left', 'bottom', 'right',
                          'insideH', 'insideV', 'all'.
                          When specified, the 'all' key has precedence over
                          others. Each key must define a dict of border
                          attributes:
                            color : The color of the border, in hex or
                                    'auto'
                            space : The space, measured in points
                            sz    : The size of the border, in eighths of
                                    a point
                            val   : The style of the border, see
                http://www.schemacentral.com/sc/ooxml/t-w_ST_Border.htm
                          The default dict is never modified; None is
                          treated like an empty dict.
    @param list celstyle: Specify the style for each column, list of dicts.
                          supported keys:
                          'align' : specify the alignment, see paragraph
                                    documentation.
    @return lxml.etree:   Generated XML etree element
    """
    # `unicode` only exists on Python 2; fall back to `str` elsewhere.
    try:
        text_type = unicode
    except NameError:
        text_type = str

    def cellwidth(index):
        """Return the tcW element for the column at `index`."""
        if colw:
            wattr = {'w': str(colw[index]), 'type': cwunit}
        else:
            wattr = {'w': '0', 'type': 'auto'}
        return makeelement('tcW', attributes=wattr)

    tbl = makeelement('tbl')
    columns = len(contents[0])

    # Table properties
    tableprops = makeelement('tblPr')
    tableprops.append(makeelement('tblStyle', attributes={'val': ''}))
    tableprops.append(makeelement('tblW', attributes={'w': str(tblw),
                                                      'type': str(twunit)}))
    if borders:
        tableborders = makeelement('tblBorders')
        for side in ['top', 'left', 'bottom', 'right', 'insideH', 'insideV']:
            # 'all' takes precedence over the per-side definition
            if 'all' in borders:
                key = 'all'
            elif side in borders:
                key = side
            else:
                continue
            attrs = dict((name, text_type(value))
                         for name, value in borders[key].items())
            tableborders.append(makeelement(side, attributes=attrs))
        tableprops.append(tableborders)
    tableprops.append(makeelement('tblLook', attributes={'val': '0400'}))
    tbl.append(tableprops)

    # Table Grid
    tablegrid = makeelement('tblGrid')
    for i in range(columns):
        gridwidth = str(colw[i]) if colw else '2390'
        tablegrid.append(makeelement('gridCol', attributes={'w': gridwidth}))
    tbl.append(tablegrid)

    # Heading Row
    if heading:
        row = makeelement('tr')
        rowprops = makeelement('trPr')
        rowprops.append(makeelement('cnfStyle',
                                    attributes={'val': '000000100000'}))
        row.append(rowprops)
        # The loop variable must not be called `heading`: rebinding the
        # parameter used to corrupt the content-row slice below whenever the
        # last header cell was an empty list or tuple.
        for i, headcell in enumerate(contents[0]):
            cell = makeelement('tc')
            # Cell properties
            cellprops = makeelement('tcPr')
            cellprops.append(cellwidth(i))
            cellprops.append(makeelement('shd', attributes={'val': 'clear',
                                                            'color': 'auto',
                                                            'fill': 'FFFFFF',
                                                            'themeFill': 'text2',
                                                            'themeFillTint': '99'}))
            cell.append(cellprops)
            # Paragraph (Content)
            if not isinstance(headcell, (list, tuple)):
                headcell = [headcell]
            for h in headcell:
                if isinstance(h, etree._Element):
                    cell.append(h)
                else:
                    cell.append(paragraph(h, jc='center'))
            row.append(cell)
        tbl.append(row)

    # Contents Rows (the first row is skipped when it was used as heading)
    for contentrow in contents[1 if heading else 0:]:
        row = makeelement('tr')
        for i, content in enumerate(contentrow):
            cell = makeelement('tc')
            # Properties
            cellprops = makeelement('tcPr')
            cellprops.append(cellwidth(i))
            cell.append(cellprops)
            # Paragraph (Content)
            if not isinstance(content, (list, tuple)):
                content = [content]
            for c in content:
                if isinstance(c, etree._Element):
                    cell.append(c)
                else:
                    # The column style is only looked up for text content
                    if celstyle and 'align' in celstyle[i]:
                        align = celstyle[i]['align']
                    else:
                        align = 'left'
                    cell.append(paragraph(c, jc=align))
            row.append(cell)
        tbl.append(row)
    return tbl
424

425

426
def picture(relationshiplist, picname, picdescription, pixelwidth=None, pixelheight=None, nochangeaspect=True, nochangearrowheads=True):
    """Embed a picture: copy it into the template and build its paragraph.

    See http://openxmldeveloper.org/articles/462.aspx

    Side effect: the picture file is copied into the 'word/media' directory
    below template_dir (the directory is created when missing).

    @param list   relationshiplist:   Relationship list; a
                                      [relationship type, target] entry for
                                      the image is appended to it.
    @param string picname:            Path of the picture file. Only its
                                      base name is used for the copy inside
                                      the media directory and for the
                                      relationship target.
    @param string picdescription:     Description stored on the docPr
                                      element.
    @param int    pixelwidth:         Width in pixels.
    @param int    pixelheight:        Height in pixels. Unless both width
                                      and height are given, both are read
                                      from the picture itself.
    @param bool   nochangeaspect:     Written as '0'/'1' to the picLocks
                                      noChangeAspect attribute.
    @param bool   nochangearrowheads: Written as '0'/'1' to the picLocks
                                      noChangeArrowheads attribute.

    @return tuple: (relationshiplist, paragraph element with the picture)
    """
    # Copy the file into the media dir. The stored name is the base name of
    # picname: joining the media dir with a name that still carries a
    # directory part would point outside the media dir (or back at the
    # source file itself for an absolute path).
    medianame = os.path.basename(picname)
    media_dir = join(template_dir, 'word', 'media')
    if not os.path.isdir(media_dir):
        os.mkdir(media_dir)
    shutil.copyfile(picname, join(media_dir, medianame))

    # Check if the user has specified a size
    if not pixelwidth or not pixelheight:
        # If not, get info from the picture itself
        pixelwidth, pixelheight = Image.open(picname).size[0:2]

    # OpenXML measures on-screen objects in English Metric Units
    # 1cm = 360000 EMUs
    emuperpixel = 12667
    width = str(pixelwidth * emuperpixel)
    height = str(pixelheight * emuperpixel)

    # Set relationship ID to the first available
    picid = '2'
    picrelid = 'rId'+str(len(relationshiplist)+1)
    relationshiplist.append([
        'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image',
        'media/'+medianame])

    # There are 3 main elements inside a picture
    # 1. The Blipfill - specifies how the image fills the picture area (stretch, tile, etc.)
    blipfill = makeelement('blipFill', nsprefix='pic')
    blipfill.append(makeelement('blip', nsprefix='a', attrnsprefix='r',
                    attributes={'embed': picrelid}))
    stretch = makeelement('stretch', nsprefix='a')
    stretch.append(makeelement('fillRect', nsprefix='a'))
    blipfill.append(makeelement('srcRect', nsprefix='a'))
    blipfill.append(stretch)

    # 2. The non visual picture properties
    nvpicpr = makeelement('nvPicPr', nsprefix='pic')
    cnvpr = makeelement('cNvPr', nsprefix='pic',
                        attributes={'id': '0', 'name': 'Picture 1', 'descr': picname})
    nvpicpr.append(cnvpr)
    cnvpicpr = makeelement('cNvPicPr', nsprefix='pic')
    cnvpicpr.append(makeelement('picLocks', nsprefix='a',
                    attributes={'noChangeAspect': str(int(nochangeaspect)),
                                'noChangeArrowheads': str(int(nochangearrowheads))}))
    nvpicpr.append(cnvpicpr)

    # 3. The Shape properties
    sppr = makeelement('spPr', nsprefix='pic', attributes={'bwMode': 'auto'})
    xfrm = makeelement('xfrm', nsprefix='a')
    xfrm.append(makeelement('off', nsprefix='a', attributes={'x': '0', 'y': '0'}))
    xfrm.append(makeelement('ext', nsprefix='a', attributes={'cx': width, 'cy': height}))
    prstgeom = makeelement('prstGeom', nsprefix='a', attributes={'prst': 'rect'})
    prstgeom.append(makeelement('avLst', nsprefix='a'))
    sppr.append(xfrm)
    sppr.append(prstgeom)

    # Add our 3 parts to the picture element
    pic = makeelement('pic', nsprefix='pic')
    pic.append(nvpicpr)
    pic.append(blipfill)
    pic.append(sppr)

    # Now make the supporting elements
    # The following sequence is just: make element, then add its children
    graphicdata = makeelement('graphicData', nsprefix='a',
                              attributes={'uri': 'http://schemas.openxmlforma'
                                                 'ts.org/drawingml/2006/picture'})
    graphicdata.append(pic)
    graphic = makeelement('graphic', nsprefix='a')
    graphic.append(graphicdata)

    framelocks = makeelement('graphicFrameLocks', nsprefix='a',
                             attributes={'noChangeAspect': '1'})
    framepr = makeelement('cNvGraphicFramePr', nsprefix='wp')
    framepr.append(framelocks)
    docpr = makeelement('docPr', nsprefix='wp',
                        attributes={'id': picid, 'name': 'Picture 1',
                                    'descr': picdescription})
    effectextent = makeelement('effectExtent', nsprefix='wp',
                               attributes={'l': '25400', 't': '0', 'r': '0',
                                           'b': '0'})
    extent = makeelement('extent', nsprefix='wp',
                         attributes={'cx': width, 'cy': height})
    inline = makeelement('inline', attributes={'distT': "0", 'distB': "0",
                                               'distL': "0", 'distR': "0"},
                         nsprefix='wp')
    inline.append(extent)
    inline.append(effectextent)
    inline.append(docpr)
    inline.append(framepr)
    inline.append(graphic)
    drawing = makeelement('drawing')
    drawing.append(inline)
    run = makeelement('r')
    run.append(drawing)
    picparagraph = makeelement('p')
    picparagraph.append(run)
    return relationshiplist, picparagraph
529

530

531
def search(document, search):
    """Tell whether any text ('w:t') element of the document matches a regex.

    @param instance document: The document tree to scan
    @param str      search:   The regular expression to look for

    @return bool: True when at least one non-empty text element contains a
                  match, False otherwise
    """
    pattern = re.compile(search)
    texttag = '{%s}t' % nsprefixes['w']  # t (text) elements
    for element in document.iter():
        if element.tag != texttag or not element.text:
            continue
        if pattern.search(element.text):
            return True
    return False
541

542

543
def replace(document, search, replace):
    """Replace every regex match inside the document's text ('w:t') elements.

    The document is modified in place; the same object is returned.

    @param instance document: The document tree to update
    @param str      search:   The regular expression to look for
    @param str      replace:  The replacement, as accepted by re.sub

    @return instance: The updated document
    """
    pattern = re.compile(search)
    texttag = '{%s}t' % nsprefixes['w']  # t (text) elements
    for element in document.iter():
        if element.tag != texttag or not element.text:
            continue
        # Only touch elements whose text actually matches
        if pattern.search(element.text):
            element.text = pattern.sub(replace, element.text)
    return document
553

554

555
def clean(document):
    """Perform misc cleaning operations on a document.

    Removes 'w:t' and then 'w:r' elements that have neither text nor
    children. The document is modified in place.

    @param instance document: The document tree to clean

    @return instance: The cleaned document
    """
    # Text elements go first: dropping them can leave a run without children,
    # which the second pass then removes as well.
    for localname in ('t', 'r'):
        target = '{%s}%s' % (nsprefixes['w'], localname)
        # Collect first, remove afterwards, so the tree is not changed
        # while it is being iterated.
        empties = [node for node in document.iter()
                   if node.tag == target and not node.text and not len(node)]
        for node in empties:
            node.getparent().remove(node)
    return document
573

574

575
def findTypeParent(element, tag):
    """ Finds first ancestor of element that has the given tag

    The element itself is never considered, only its ancestors, nearest
    first.

    @param object element: etree element
    @param string tag: the tag of the parent to search for

    @return object element: the found parent or None when not found
    """
    p = element.getparent()
    # getparent() yields None above the root; stop there rather than
    # dereferencing None. Compare with `is not None` because an element
    # without children is falsy.
    while p is not None:
        if p.tag == tag:
            return p
        p = p.getparent()

    # Not found
    return None
592

593

594
def AdvSearch(document, search, bs=3):
    """Return the set of regex matches found across neighbouring text blocks

    This is an advanced version of python-docx.search() that takes into
    account blocks of <bs> elements at a time.

    What it does:
    It walks all non-empty text ('w:t') elements of the document and keeps
    the last <bs> of them in a sliding window. Since the text to search for
    could be spread across multiple text blocks, every time an element
    enters the window all runs of consecutive window elements are tried,
    shortest runs first and, for equal length, leftmost first. The texts of
    a run are concatenated and searched; the first run that matches
    contributes its first match, and the remaining runs are skipped for
    that step. The document is not modified.

    Examples:
    original text blocks : [ 'Hel', 'lo,', ' world!' ]
    search : 'Hello,'
    result : set(['Hello,'])

    original text blocks : [ 'Hel', 'lo', ' __', 'name', '__!' ]
    search : '(__[a-z]+__)'
    result : set(['__name__'])

    @param instance  document: The original document
    @param str       search: The text to search for (regexp)
    @param int       bs: See above

    @return set      All occurrences of the search string that were found
    """
    # Compile the search regexp
    searchre = re.compile(search)
    texttag = '{%s}t' % nsprefixes['w']  # t (text) elements

    matches = []

    # The last (at most bs) non-empty text elements seen so far
    window = []

    for element in document.iter():
        if element.tag != texttag or not element.text:
            continue

        # Slide the window forward by one element
        window.append(element)
        if len(window) > bs:
            window.pop(0)

        # Try every run of consecutive elements, shortest first
        hit = None
        for length in range(1, len(window) + 1):
            for start in range(len(window) - length + 1):
                joined = ''.join(
                    block.text for block in window[start:start + length])
                hit = searchre.search(joined)
                if hit:
                    break
            if hit:
                break
        if hit:
            matches.append(hit.group())
    return set(matches)
668

669

670
def advReplace(document, search, replace, bs=3):
    """
    Replace all occurrences of a regexp with new content, return the updated
    document.

    This is a modified version of python-docx.replace() that takes into
    account blocks of <bs> text elements at a time. The replacement can be
    a string, an lxml etree element, or a list/tuple of etree elements.

    What it does:
    It walks the whole document looking for non-empty text (w:t) elements and
    keeps a sliding window of the last <bs> of them.
    Since the text to search for could be spread across multiple text
    elements, every contiguous group inside the window is tried, from the
    shortest group to the longest, and the first group that matches is used.
    If the matching group has more than one element, the replacement text is
    written into one element and the other elements of the group are cleared.

    Examples:
    original text blocks : [ 'Hel', 'lo,', ' world!' ]
    search / replace: 'Hello,' / 'Hi!'
    output blocks : [ 'Hi!', '', ' world!' ]

    original text blocks : [ 'Hel', 'lo,', ' world!' ]
    search / replace: 'Hello, world' / 'Hi!'
    output blocks : [ 'Hi!!', '', '' ]

    original text blocks : [ 'Hel', 'lo,', ' world!' ]
    search / replace: 'Hel' / 'Hal'
    output blocks : [ 'Hal', 'lo,', ' world!' ]

    @param instance  document: The original document (modified in place)
    @param str       search: The text to search for (regexp)
    @param mixed     replace: The replacement text or lxml.etree element to
                         append, or a list of etree elements
    @param int       bs: Maximum number of consecutive text elements that
                         are joined together when searching

    @return instance The document with replacement applied (the same object
                     that was passed in)

    """
    # Enables debug output
    DEBUG = False

    # No copy is made: the caller's tree is edited directly.
    newdocument = document

    # Compile the search regexp
    searchre = re.compile(search)

    # Fully-qualified tag names are loop invariants
    text_tag = '{%s}t' % nsprefixes['w']
    para_tag = '{%s}p' % nsprefixes['w']

    # Sliding window holding the last (at most bs) non-empty text elements
    searchels = []

    for element in newdocument.iter():
        if element.tag != text_tag or not element.text:
            continue

        # Add this element to the window, dropping the oldest if it overflows
        searchels.append(element)
        if len(searchels) > bs:
            searchels.pop(0)

        # Try every contiguous group of the window, shortest groups first;
        # stop at the first group whose joined text matches.
        window = len(searchels)
        found = False
        for length in range(1, window + 1):
            for start in range(window - length + 1):
                e = range(start, start + length)
                txtsearch = ''.join(searchels[k].text for k in e)

                match = searchre.search(txtsearch)
                if not match:
                    continue
                found = True

                if DEBUG:
                    log.debug("Found element!")
                    log.debug("Search regexp: %s", searchre.pattern)
                    log.debug("Requested replacement: %s", replace)
                    log.debug("Matched text: %s", txtsearch)
                    log.debug("Matched text (splitted): %s",
                              [el.text for el in searchels])
                    log.debug("Matched at position: %s", match.start())
                    log.debug("matched in elements: %s", e)
                    if isinstance(replace, etree._Element):
                        log.debug("Will replace with XML CODE")
                    elif isinstance(replace, (list, tuple)):
                        log.debug("Will replace with LIST OF ELEMENTS")
                    else:
                        log.debug("Will replace with: %s",
                                  re.sub(search, replace, txtsearch))

                curlen = 0
                replaced = False
                for i in e:
                    # Length is read before this element's text is rewritten
                    curlen += len(searchels[i].text)
                    if curlen > match.start() and not replaced:
                        # The match starts in THIS element: it receives the
                        # whole replaced text of the group.
                        if isinstance(replace, etree._Element):
                            # Convert to a list and process it below
                            replace = [replace]
                        if isinstance(replace, (list, tuple)):
                            # Replacing with etree elements: strip the matched
                            # text and insert the elements after the parent
                            # paragraph (t elements cannot have children).
                            p = findTypeParent(searchels[i], para_tag)
                            searchels[i].text = re.sub(search, '', txtsearch)
                            insindex = p.getparent().index(p) + 1
                            for r in replace:
                                p.getparent().insert(insindex, r)
                                insindex += 1
                        else:
                            # Replacing with pure text
                            searchels[i].text = re.sub(search, replace, txtsearch)
                        replaced = True
                        log.debug("Replacing in element #: %s", i)
                    else:
                        # Clear the other text elements of the group
                        searchels[i].text = ''
                break
            if found:
                break
    return newdocument
802

803

804
def getdocumenttext(document):
    '''Return the raw text of a document, as a list of paragraph strings.

    Every w:p element found in the document contributes one string made of
    the text of its non-empty w:t descendants, with each w:tab descendant
    rendered as a tab character. Paragraphs whose resulting text is empty are
    left out of the result.
    '''
    w_namespace = nsprefixes['w']
    para_tag = '{' + w_namespace + '}p'
    text_tag = '{' + w_namespace + '}t'
    tab_tag = '{' + w_namespace + '}tab'

    # Gather all paragraph elements first, then extract text from each one.
    paragraphs = [node for node in document.iter() if node.tag == para_tag]

    collected = []
    for paragraph in paragraphs:
        # A sentence may be split over several text elements, so the pieces
        # are accumulated in document order and joined at the end.
        pieces = []
        for node in paragraph.iter():
            if node.tag == text_tag:
                if node.text:
                    pieces.append(node.text)
            elif node.tag == tab_tag:
                pieces.append('\t')
        joined = u''.join(pieces)
        if joined:
            collected.append(joined)
    return collected
829

830

831
def coreproperties(title, subject, creator, keywords, lastmodifiedby=None):
    '''Create core properties (common document properties referred to in the
    'Dublin Core' specification). See appproperties() for other stuff.

    @param str   title: Document title
    @param str   subject: Document subject
    @param str   creator: Document author
    @param list  keywords: Keyword strings; stored joined with commas
    @param str   lastmodifiedby: Last editor; falls back to creator when
                     empty or None

    @return The cp:coreProperties element, including dcterms:created and
            dcterms:modified children set to the current UTC time
    '''
    coreprops = makeelement('coreProperties', nsprefix='cp')
    coreprops.append(makeelement('title', tagtext=title, nsprefix='dc'))
    coreprops.append(makeelement('subject', tagtext=subject, nsprefix='dc'))
    coreprops.append(makeelement('creator', tagtext=creator, nsprefix='dc'))
    coreprops.append(makeelement('keywords', tagtext=','.join(keywords), nsprefix='cp'))
    if not lastmodifiedby:
        lastmodifiedby = creator
    coreprops.append(makeelement('lastModifiedBy', tagtext=lastmodifiedby, nsprefix='cp'))
    coreprops.append(makeelement('revision', tagtext='1', nsprefix='cp'))
    coreprops.append(makeelement('category', tagtext='Examples', nsprefix='cp'))
    coreprops.append(makeelement('description', tagtext='Examples', nsprefix='dc'))

    # The trailing 'Z' designates UTC, so the timestamp must be built from
    # gmtime(); strftime() without a time tuple formats *local* time.
    currenttime = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())

    # Document creation and modify times
    # Prob here: we have an attribute whose name uses one namespace, and that
    # attribute's value uses another namespace.
    # We're creating the element from a string as a workaround...
    doctime_template = (
        '<dcterms:%(tag)s'
        ' xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"'
        ' xmlns:dcterms="http://purl.org/dc/terms/"'
        ' xsi:type="dcterms:W3CDTF">%(time)s</dcterms:%(tag)s>')
    for doctime in ['created', 'modified']:
        coreprops.append(etree.fromstring(
            doctime_template % {'tag': doctime, 'time': currenttime}))
    return coreprops
854

855

856
def appproperties():
    """
    Create app-specific properties. See coreproperties() for more common
    document properties.

    @return The extended-properties 'Properties' element populated with a
            fixed set of child elements (template name, page/word counts,
            application name and version, ...)

    """
    # The markup starts with an XML declaration that names an encoding, so it
    # is passed as bytes: lxml refuses text (unicode) strings that carry an
    # encoding declaration.
    appprops = etree.fromstring(
        b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
        b'<Properties'
        b' xmlns="http://schemas.openxmlformats.org/officeDocument/2006/extended-properties"'
        b' xmlns:vt="http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes">'
        b'</Properties>')
    props = {
        'Template':             'Normal.dotm',
        'TotalTime':            '6',
        'Pages':                '1',
        'Words':                '83',
        'Characters':           '475',
        'Application':          'Microsoft Word 12.0.0',
        'DocSecurity':          '0',
        'Lines':                '12',
        'Paragraphs':           '8',
        'ScaleCrop':            'false',
        'LinksUpToDate':        'false',
        'CharactersWithSpaces': '583',
        'SharedDoc':            'false',
        'HyperlinksChanged':    'false',
        'AppVersion':           '12.0000',
    }
    # One child element per property, named after the key
    for name, value in props.items():
        appprops.append(makeelement(name, tagtext=value, nsprefix=None))
    return appprops
887

888

889
def websettings():
    '''Build the webSettings element with its two fixed children,
    allowPNG and doNotSaveAsSingleFile, and return it.'''
    settings_root = makeelement('webSettings')
    for child_name in ('allowPNG', 'doNotSaveAsSingleFile'):
        settings_root.append(makeelement(child_name))
    return settings_root
895

896

897
def relationshiplist():
    '''Return the default document relationships.

    The result is a fresh list of [relationship type URI, target part]
    pairs, in the order numbering, styles, settings, webSettings, fontTable,
    theme.
    '''
    type_base = ('http://schemas.openxmlformats.org/officeDocument/2006/'
                 'relationships/')
    kinds_and_targets = (
        ('numbering', 'numbering.xml'),
        ('styles', 'styles.xml'),
        ('settings', 'settings.xml'),
        ('webSettings', 'webSettings.xml'),
        ('fontTable', 'fontTable.xml'),
        ('theme', 'theme/theme1.xml'),
    )
    return [[type_base + kind, target] for kind, target in kinds_and_targets]
912

913

914
def wordrelationships(relationshiplist):
    '''Generate a Word relationships file.

    @param list relationshiplist: Sequence of entries whose item 0 is the
                    relationship type and item 1 is the target

    @return The Relationships element holding one Relationship child per
            entry, with Ids rId1, rId2, ... in input order
    '''
    # FIXME: the root is parsed from a string rather than built with
    # makeelement.
    root = etree.fromstring(
        '<Relationships '
        'xmlns="http://schemas.openxmlformats.org/package/2006/relationships">'
        '</Relationships>')
    # Relationship IDs (rId) start at 1.
    for number, entry in enumerate(relationshiplist, 1):
        attrs = {'Id':     'rId' + str(number),
                 'Type':   entry[0],
                 'Target': entry[1]}
        root.append(makeelement('Relationship', nsprefix=None, attributes=attrs))
    return root
933

934

935
def savedocx(document, coreprops, appprops, contenttypes, websettings, wordrelationships, output):
    '''Save a modified document.

    Serializes the given element trees into their parts of a new zip archive
    at `output`, then adds every file found under the module's template
    directory (except ignored nuisance files), stored under its path
    relative to that directory.

    @param document:          tree written to word/document.xml
    @param coreprops:         tree written to docProps/core.xml
    @param appprops:          tree written to docProps/app.xml
    @param contenttypes:      tree written to [Content_Types].xml
    @param websettings:       tree written to word/webSettings.xml
    @param wordrelationships: tree written to word/_rels/document.xml.rels
    @param output:            file name (or file object) handed to
                              zipfile.ZipFile

    The archive is closed even if serialization or copying fails, and the
    process working directory is never changed.
    '''
    assert os.path.isdir(template_dir)

    # Trees to serialize, paired with their location inside the archive
    treesandfiles = (
        (document,          'word/document.xml'),
        (coreprops,         'docProps/core.xml'),
        (appprops,          'docProps/app.xml'),
        (contenttypes,      '[Content_Types].xml'),
        (websettings,       'word/webSettings.xml'),
        (wordrelationships, 'word/_rels/document.xml.rels'),
    )
    files_to_ignore = ['.DS_Store']  # nuisance from some os's

    with zipfile.ZipFile(output, mode='w', compression=zipfile.ZIP_DEFLATED) as docxfile:
        # Serialize our trees into our zip file
        for tree, archivename in treesandfiles:
            log.info('Saving: %s', archivename)
            treestring = etree.tostring(tree, pretty_print=True)
            docxfile.writestr(archivename, treestring)

        # Add & compress support files. The template tree is walked by path
        # rather than by chdir-ing into it, so a failure cannot leave the
        # process in a different working directory.
        for dirpath, dirnames, filenames in os.walk(template_dir):
            for filename in filenames:
                if filename in files_to_ignore:
                    continue
                templatefile = join(dirpath, filename)
                archivename = os.path.relpath(templatefile, template_dir)
                log.info('Saving: %s', archivename)
                docxfile.write(templatefile, archivename)
    log.info('Saved new file to: %r', output)
    return
970

971

972
import sys
973
def main():
    """Command-line entry point: extract the text of a docx file.

    Usage: <script> input.docx [output.txt]

    When no output name is given, the input name with its extension replaced
    by '.txt' is used. If the output file already exists nothing is written.
    Paragraphs are written UTF-8 encoded, separated by a blank line.
    """
    import io  # local import: io.open takes an encoding on Python 2 and 3

    if len(sys.argv) < 2:
        print('Please supply an input and output file. For example:')
        print('''  example-extracttext.py 'My Office 2007 document.docx' 'outputfile.txt' ''')
        sys.exit()

    if len(sys.argv) == 2:
        base = os.path.splitext(sys.argv[1])[0]
        newfilename = base + '.txt'
    else:
        newfilename = sys.argv[2]
    if os.path.exists(newfilename):
        print("WARNING: %s already exists; doing nothing." % newfilename)
        sys.exit(0)

    document = opendocx(sys.argv[1])

    # Fetch all the text out of the document
    paratextlist = getdocumenttext(document)

    # Write the document's text with two newlines under each paragraph.
    # Encoding is done once, by the file object, instead of per paragraph;
    # the context manager guarantees the file is flushed and closed.
    with io.open(newfilename, 'w', encoding='utf-8') as newfile:
        newfile.write(u'\n\n'.join(paratextlist))


if __name__ == '__main__':
    main()
1003

1004
Product

Resources

Company