Skip to content
Open
Show file tree
Hide file tree
Changes from 19 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 88 additions & 19 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,30 +1,99 @@
# python-docx2txt #
# python-docx2txt

A pure python-based utility to extract text from docx files.
## Introduction

The code is taken and adapted from [python-docx](https://github.com/python-openxml/python-docx). It can however also extract text from header, footer and hyperlinks. __It can now also extract images.__
A pure Python-based utility to extract text from docx files.

## How to install? ##
```bash
The code is taken and adapted from [python-docx](https://github.com/python-openxml/python-docx).
However, it can also extract text from headers, footers and hyperlinks.
__It can now also extract images and properties.__

It can be used as a [Python library](#python-library)
or from the [command line](#command-line-utility).

## Python Library

### Library Installation

```sh
pip install docx2txt
```

## How to run? ##
### Library Usage

a. From command line:
```bash
# extract text
docx2txt file.docx
# extract text and images
docx2txt -i /tmp/img_dir file.docx
#### Procedural

The library is easy to use procedurally.

```py
>>> import docx2txt
>>> # get document text
>>> docx2txt.process('file.docx')
'header_textmain_textfooter_text'
>>> # or
>>> # get document text, extract images to /tmp/img_dir
>>> docx2txt.process('file.docx', img_dir='/tmp/img_dir/')
'header_textmain_textfooter_text'
```

#### Object Oriented

The DocxFile class provides more granularity.
Its argument list and accompanying behaviors are identical to `process()`.
Document properties are stored as a dictionary.
No particular key is guaranteed to be present, so looking values up with the dictionary's `get()` method is recommended.

```py
>>> import docx2txt
>>> # parse Word doc
>>> document = docx2txt.DocxFile('file.docx', img_dir='/tmp/img_dir/')
>>> # path to file
>>> document.path
'/absolute/path/to/file.docx'
>>> # all document text
>>> document.text
'header_textmain_textfooter_text'
>>> # image directory
>>> document.img_dir
'/tmp/img_dir'
>>> # text components
>>> '||'.join([document.header, document.main, document.footer])
'header_text||main_text||footer_text'
>>> # images (filename only if not extracted)
>>> document.images
['/tmp/img_dir/image1.jpg', '/tmp/img_dir/image2.jpg']
>>> # document properties
>>> document.properties
{'property_name': 'property value', ...}
>>> document.properties['title']
'title_text'
>>> document.properties['nonexistent']
Traceback (most recent call last):
  ...
KeyError: 'nonexistent'
>>> print(document.properties.get('nonexistent'))
None
```
b. From python:
```python
import docx2txt

# extract text
text = docx2txt.process("file.docx")
## Command Line Utility

### Utility Installation

From the directory that contains this README file, run:

# extract text and write images in /tmp/img_dir
text = docx2txt.process("file.docx", "/tmp/img_dir")
```sh
python setup.py install
```

### Utility Usage

```sh
# simple text extraction
docx2txt file.docx
# get text, extract images to /tmp/img_dir
docx2txt -i /tmp/img_dir file.docx
# get all document data
docx2txt -d file.docx
# get all data, extract images to /tmp/img_dir
docx2txt -d -i /tmp/img_dir file.docx
# same as previous, more simply:
docx2txt -di /tmp/img_dir file.docx
```
5 changes: 2 additions & 3 deletions bin/docx2txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,5 @@ import docx2txt

if __name__ == '__main__':
import sys
args = docx2txt.process_args()
text = docx2txt.process(args.docx, args.img_dir)
sys.stdout.write(text.encode('utf-8'))
for line in docx2txt.get_output():
sys.stdout.write(line)
6 changes: 3 additions & 3 deletions docx2txt/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .docx2txt import process
from .docx2txt import process_args
from .docx2txt import get_output, process # noqa
from .docx_file import DocxFile # noqa

VERSION = '0.7'
VERSION = '0.8'
119 changes: 37 additions & 82 deletions docx2txt/docx2txt.py
Original file line number Diff line number Diff line change
@@ -1,113 +1,68 @@
#! /usr/bin/env python

import argparse
import re
import xml.etree.ElementTree as ET
import zipfile
import os
import sys


nsmap = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
from . import docx_file


def process_args():
parser = argparse.ArgumentParser(description='A pure python-based utility '
'to extract text and images '
'from docx files.')
parser.add_argument("docx", help="path of the docx file")
parser.add_argument('-i', '--img_dir', help='path of directory '
'to extract images')
"""Parse command line arguments if invoked directly
Returns:
object -- .img_dir: output directory, .details: get document details
"""
desc = 'A pure Python-based utility to extract data from docx files.'
id_help = 'path of directory to extract images'
ad_help = 'get all document data'

parser = argparse.ArgumentParser(description=desc)
parser.add_argument('docx', help='path of the docx file')
parser.add_argument('-i', '--img_dir', help=id_help)
parser.add_argument('-d', '--details', help=ad_help, action='store_true')

args = parser.parse_args()

if not os.path.exists(args.docx):
print('File {} does not exist.'.format(args.docx))
sys.stderr.write('File {!r} does not exist.'.format(args.docx))
sys.exit(1)

if args.img_dir is not None:
if not os.path.exists(args.img_dir):
try:
os.makedirs(args.img_dir)
except OSError:
print("Unable to create img_dir {}".format(args.img_dir))
sys.stderr.write(
'Unable to create img_dir {!r}'.format(args.img_dir))
sys.exit(1)
return args


def qn(tag):
    """
    Return the Clark-notation qualified name for a namespace-prefixed tag.

    'qn' stands for 'qualified name'. The prefix before the colon is looked
    up in the module-level ``nsmap`` and the result has the form
    ``'{namespace-uri}tagroot'``; for example, ``qn('p:cSld')`` returns
    ``'{http://schemas.../main}cSld'``.
    Source: https://github.com/python-openxml/python-docx/
    """
    # Unpacking into exactly two names rejects tags that do not contain
    # exactly one colon.
    prefix, tagroot = tag.split(':')
    return '{{{}}}{}'.format(nsmap[prefix], tagroot)
def process(docx, img_dir=None):
    """Build a DocxFile for the given document
    Arguments:
        docx -- passed to DocxFile as its first argument
                (the command line supplies the path of the docx file)
        img_dir -- passed to DocxFile as its second argument
                   (the command line supplies the image output directory)
    Returns:
        object -- the DocxFile instance that was constructed
    """
    return docx_file.DocxFile(docx, img_dir)


def xml2text(xml):
"""
A string representing the textual content of this run, with content
child elements like ``<w:tab/>`` translated to their Python
equivalent.
Adapted from: https://github.com/python-openxml/python-docx/
"""
text = u''
root = ET.fromstring(xml)
for child in root.iter():
if child.tag == qn('w:t'):
t_text = child.text
text += t_text if t_text is not None else ''
elif child.tag == qn('w:tab'):
text += '\t'
elif child.tag in (qn('w:br'), qn('w:cr')):
text += '\n'
elif child.tag == qn("w:p"):
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

qn() can be called as many as four times per node!

text += '\n\n'
return text
def detail_text(prop_name, prop_val):
    """Format one document detail as a labelled output line
    Arguments:
        prop_name -- label; padded on the right to at least ten characters
        prop_val -- value; rendered with repr()
    Returns:
        str -- 'label: repr(value)' terminated by a newline
    """
    template = '{:10s}: {!r}\n'
    return template.format(prop_name, prop_val)


def process(docx, img_dir=None):
    """Extract the text of a docx file, optionally writing out its images
    Arguments:
        docx -- the docx file, given as anything zipfile.ZipFile accepts
        img_dir -- directory to write embedded images into (it is not
                   created here); images are skipped when this is None
    Returns:
        text -- header, main document and footer text concatenated in that
                order, with leading and trailing whitespace stripped
    """
    header_xmls = 'word/header[0-9]*.xml'
    footer_xmls = 'word/footer[0-9]*.xml'
    doc_xml = 'word/document.xml'
    image_extensions = (".jpg", ".jpeg", ".png", ".bmp")

    # Open the archive in a context manager so it is closed even when a
    # member is missing or its XML fails to parse; a trailing close() call
    # would be skipped in those cases and leak the file handle.
    with zipfile.ZipFile(docx) as zipf:
        filelist = zipf.namelist()

        # get header text
        # there can be 3 header files in the zip
        parts = [xml2text(zipf.read(fname))
                 for fname in filelist if re.match(header_xmls, fname)]

        # get main text
        parts.append(xml2text(zipf.read(doc_xml)))

        # get footer text
        # there can be 3 footer files in the zip
        parts.extend(xml2text(zipf.read(fname))
                     for fname in filelist if re.match(footer_xmls, fname))

        if img_dir is not None:
            # extract images, flattening any archive sub-directories
            for fname in filelist:
                _, extension = os.path.splitext(fname)
                if extension in image_extensions:
                    dst_fname = os.path.join(img_dir, os.path.basename(fname))
                    with open(dst_fname, "wb") as dst_f:
                        dst_f.write(zipf.read(fname))

    # Join once instead of growing the string with repeated +=.
    return u''.join(parts).strip()
def get_output():
    """Generate the command line output for the requested document
    Parses the command line with process_args() and processes the named
    docx file with process().
    Yields:
        str -- with the details flag set, one detail_text() line each for
               path, header, main, footer, images and properties, in that
               order; otherwise the document's text attribute only
    """
    args = process_args()
    document = process(args.docx, args.img_dir)

    if not args.details:
        yield document.text
        return

    detail_fields = ('path', 'header', 'main', 'footer', 'images',
                     'properties')
    for field in detail_fields:
        # Each attribute is read only when its line is requested.
        yield detail_text(field, getattr(document, field))


if __name__ == '__main__':
args = process_args()
text = process(args.docx, args.img_dir)
sys.stdout.write(text.encode('utf-8'))
for line in get_output():
sys.stdout.write(line)
Loading