ankushshah89 · schmamps · Aug 17, 2018 · Aug 17, 2018 · Aug 17, 2018 · Aug 17, 2018
diff --git a/README.md b/README.md
@@ -1,30 +1,99 @@
-# python-docx2txt #
+# python-docx2txt
 
-A pure python-based utility to extract text from docx files. 
+## Introduction
 
-The code is taken and adapted from [python-docx](https://github.com/python-openxml/python-docx). It can however also extract text from header, footer and hyperlinks. __It can now also extract images.__ 
+A pure Python-based utility to extract text from docx files.
 
-## How to install? ##
-```bash
+The code is taken and adapted from [python-docx](https://github.com/python-openxml/python-docx).
+It can, however, also extract text from headers, footers, and hyperlinks.
+__It can now also extract images and properties.__
+
+It can be used as a [Python library](#python-library)
+or from the [command line](#command-line-utility).
+
+## Python Library
+
+### Library Installation
+
+```sh
 pip install docx2txt
 ```
 
-## How to run? ##
+### Library Usage
 
-a. From command line:
-```bash
-# extract text
-docx2txt file.docx
-# extract text and images
-docx2txt -i /tmp/img_dir file.docx
+#### Procedural
+
+The library is easy to use procedurally.
+
+```py
+>>> import docx2txt
+>>> # get document text
+>>> docx2txt.process('file.docx')
+'header_textmain_textfooter_text'
+>>> # or
+>>> # get document text, extract images to /tmp/img_dir
+>>> docx2txt.process('file.docx', img_dir='/tmp/img_dir/')
+'header_textmain_textfooter_text'
+```
+
+#### Object Oriented
+
+The `DocxFile` class provides more granularity.
+Its argument list and accompanying behaviors are identical to `process()`.
+Document properties are stored as a dictionary.
+No keys are guaranteed, so the `get()` method is recommended.
+
+```py
+>>> import docx2txt
+>>> # parse Word doc
+>>> document = docx2txt.DocxFile('file.docx', img_dir='/tmp/img_dir/')
+>>> # path to file
+>>> document.path
+'/absolute/path/to/file.docx'
+>>> # all document text
+>>> document.text
+'header_textmain_textfooter_text'
+>>> # image directory
+>>> document.img_dir
+'/tmp/img_dir'
+>>> # text components
+>>> '||'.join([document.header, document.main, document.footer])
+'header_text||main_text||footer_text'
+>>> # images (filename only if not extracted)
+>>> document.images
+['/tmp/img_dir/image1.jpg', '/tmp/img_dir/image2.jpg']
+>>> # document properties
+>>> document.properties
+{'property_name': 'property value', ...}
+>>> document.properties['title']
+'title_text'
+>>> document.properties['nonexistent']
+KeyError
+>>> document.properties.get('nonexistent') is None
+True
 ```
-b. From python:
-```python
-import docx2txt
 
-# extract text
-text = docx2txt.process("file.docx")
+## Command Line Utility
+
+### Utility Installation
+
+With the directory containing this README as the working directory:
 
-# extract text and write images in /tmp/img_dir
-text = docx2txt.process("file.docx", "/tmp/img_dir") 
+```sh
+python setup.py install
 ```
+
+### Utility Usage
+
+```sh
+# simple text extraction
+docx2txt file.docx
+# get text, extract images to /tmp/img_dir
+docx2txt -i /tmp/img_dir file.docx
+# get all document data
+docx2txt -d file.docx
+# get all data, extract images to /tmp/img_dir
+docx2txt -d -i /tmp/img_dir file.docx
+# same as previous, more simply:
+docx2txt -di /tmp/img_dir file.docx
+```
diff --git a/bin/docx2txt b/bin/docx2txt
@@ -4,6 +4,5 @@ import docx2txt
 
 if __name__ == '__main__':
     import sys
-    args = docx2txt.process_args()
-    text = docx2txt.process(args.docx, args.img_dir)
-    sys.stdout.write(text.encode('utf-8'))
+    for line in docx2txt.get_output():
+        sys.stdout.write(line)
diff --git a/docx2txt/__init__.py b/docx2txt/__init__.py
@@ -1,4 +1,4 @@
-from .docx2txt import process
-from .docx2txt import process_args
+from .docx2txt import get_output, process  # noqa
+from .docx_file import DocxFile            # noqa
 
-VERSION = '0.7'
+VERSION = '0.8'
diff --git a/docx2txt/docx2txt.py b/docx2txt/docx2txt.py
@@ -1,113 +1,68 @@
 #! /usr/bin/env python
 
 import argparse
-import re
-import xml.etree.ElementTree as ET
-import zipfile
 import os
 import sys
 
-
-nsmap = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
+from . import docx_file
 
 
 def process_args():
-    parser = argparse.ArgumentParser(description='A pure python-based utility '
-                                                 'to extract text and images '
-                                                 'from docx files.')
-    parser.add_argument("docx", help="path of the docx file")
-    parser.add_argument('-i', '--img_dir', help='path of directory '
-                                                'to extract images')
+    """Parse command line arguments if invoked directly
+
+    Returns:
+        object -- .img_dir: output directory, .details: get document details
+    """
+    desc = 'A pure Python-based utility to extract data from docx files.'
+    id_help = 'path of directory to extract images'
+    ad_help = 'get all document data'
+
+    parser = argparse.ArgumentParser(description=desc)
+    parser.add_argument('docx', help='path of the docx file')
+    parser.add_argument('-i', '--img_dir', help=id_help)
+    parser.add_argument('-d', '--details', help=ad_help, action='store_true')
 
     args = parser.parse_args()
 
     if not os.path.exists(args.docx):
-        print('File {} does not exist.'.format(args.docx))
+        sys.stderr.write('File {!r} does not exist.'.format(args.docx))
         sys.exit(1)
 
     if args.img_dir is not None:
         if not os.path.exists(args.img_dir):
             try:
                 os.makedirs(args.img_dir)
             except OSError:
-                print("Unable to create img_dir {}".format(args.img_dir))
+                sys.stderr.write(
+                    'Unable to create img_dir {!r}'.format(args.img_dir))
                 sys.exit(1)
     return args
 
 
-def qn(tag):
-    """
-    Stands for 'qualified name', a utility function to turn a namespace
-    prefixed tag name into a Clark-notation qualified tag name for lxml. For
-    example, ``qn('p:cSld')`` returns ``'{http://schemas.../main}cSld'``.
-    Source: https://github.com/python-openxml/python-docx/
-    """
-    prefix, tagroot = tag.split(':')
-    uri = nsmap[prefix]
-    return '{{{}}}{}'.format(uri, tagroot)
+def process(docx, img_dir=None):
+    document = docx_file.DocxFile(docx, img_dir)
+    return document
 
 
-def xml2text(xml):
-    """
-    A string representing the textual content of this run, with content
-    child elements like ``<w:tab/>`` translated to their Python
-    equivalent.
-    Adapted from: https://github.com/python-openxml/python-docx/
-    """
-    text = u''
-    root = ET.fromstring(xml)
-    for child in root.iter():
-        if child.tag == qn('w:t'):
-            t_text = child.text
-            text += t_text if t_text is not None else ''
-        elif child.tag == qn('w:tab'):
-            text += '\t'
-        elif child.tag in (qn('w:br'), qn('w:cr')):
-            text += '\n'
-        elif child.tag == qn("w:p"):
-            text += '\n\n'
-    return text
+def detail_text(prop_name, prop_val):
+    return '{:10s}: {!r}\n'.format(prop_name, prop_val)
 
 
-def process(docx, img_dir=None):
-    text = u''
-
-    # unzip the docx in memory
-    zipf = zipfile.ZipFile(docx)
-    filelist = zipf.namelist()
-
-    # get header text
-    # there can be 3 header files in the zip
-    header_xmls = 'word/header[0-9]*.xml'
-    for fname in filelist:
-        if re.match(header_xmls, fname):
-            text += xml2text(zipf.read(fname))
-
-    # get main text
-    doc_xml = 'word/document.xml'
-    text += xml2text(zipf.read(doc_xml))
-
-    # get footer text
-    # there can be 3 footer files in the zip
-    footer_xmls = 'word/footer[0-9]*.xml'
-    for fname in filelist:
-        if re.match(footer_xmls, fname):
-            text += xml2text(zipf.read(fname))
-
-    if img_dir is not None:
-        # extract images
-        for fname in filelist:
-            _, extension = os.path.splitext(fname)
-            if extension in [".jpg", ".jpeg", ".png", ".bmp"]:
-                dst_fname = os.path.join(img_dir, os.path.basename(fname))
-                with open(dst_fname, "wb") as dst_f:
-                    dst_f.write(zipf.read(fname))
-
-    zipf.close()
-    return text.strip()
+def get_output():
+    args = process_args()
+    document = process(args.docx, args.img_dir)
+
+    if args.details:
+        yield detail_text('path', document.path)
+        yield detail_text('header', document.header)
+        yield detail_text('main', document.main)
+        yield detail_text('footer', document.footer)
+        yield detail_text('images', document.images)
+        yield detail_text('properties', document.properties)
+    else:
+        yield document.text
 
 
 if __name__ == '__main__':
-    args = process_args()
-    text = process(args.docx, args.img_dir)
-    sys.stdout.write(text.encode('utf-8'))
+    for line in get_output():
+        sys.stdout.write(line)