0

I have a bunch of documents in docx format containing numbered lists like:

1) Foo
2) Bar

and also nested lists like:

I. Heading
    a) Sub paragraph
II. ...

I'm currently working with docx2python, which at least adds some list indexes when converting to a Python string, but not very reliably. For example, it assigns 1) to multiple paragraphs that are actually numbered 6., 7. and 8.

Does anybody know of a different package I could use, or have a workaround in mind?

Sven
  • 1,014
  • 1
  • 11
  • 27

1 Answer

0

There's an amazing workaround in this post.

I had to make a few very minor changes so that the result is returned as a string for further use instead of just being printed, so this is my final solution:

convert.py

import sys
import docx
from docx2python import docx2python as dx2py


def ns_tag_name(node, name):
    """Qualify *name* with the namespace URI bound to *node*'s prefix.

    When *node* has both a non-empty ``nsmap`` and a ``prefix``, the result
    has the form ``{namespace-uri}name``; otherwise *name* is returned
    unchanged.
    """
    if not (node.nsmap and node.prefix):
        return name
    namespace_uri = node.nsmap[node.prefix]
    return "{{{:s}}}{:s}".format(namespace_uri, name)


def descendants(node, desc_strs):
    """Collect descendants of *node* along a path of tag-name alternatives.

    *desc_strs* is a sequence of tag-name groups, one group per depth level.
    At each level, children of the current node whose (namespace-qualified)
    tag matches any name in the group are followed.

    Returns:
        ``[]`` when *node* is ``None``; ``[node]`` when *desc_strs* is
        exhausted; otherwise a dict mapping each matched tag name to a list
        of results from the next level (nodes at the last level, nested
        dicts above it). Children that yield nothing are omitted, so the
        dict is empty when no complete path exists.
    """
    if node is None:
        return []
    if not desc_strs:
        return [node]

    current_tags, remaining = desc_strs[0], desc_strs[1:]
    found = {}
    for tag in current_tags:
        qualified = ns_tag_name(node, tag)
        for child in node.iterchildren(qualified):
            sub_result = descendants(child, remaining)
            if sub_result:
                bucket = found.setdefault(tag, [])
                # Leaf level yields a list of nodes; deeper levels yield a dict.
                if isinstance(sub_result, list):
                    bucket.extend(sub_result)
                else:
                    bucket.append(sub_result)
    return found


def simplified_descendants(desc_dict):
    """Flatten a nested dict-of-lists into a single list of leaf values.

    Each value of *desc_dict* is iterated; items that are themselves dicts
    are flattened recursively, and every other item is kept as-is, in
    encounter order.
    """
    flat = []
    for items in desc_dict.values():
        for item in items:
            if not isinstance(item, dict):
                flat.append(item)
                continue
            flat += simplified_descendants(item)
    return flat


def process_list_data(attrs, dx2py_elem):
    """Build the indented list-marker prefix for one numbered paragraph.

    Args:
        attrs: nested descendants mapping for the paragraph; its first
            flattened element must carry a ``val`` attribute that parses as
            an integer nesting level.
        dx2py_elem: sequence whose first item is a tab-separated string; the
            first non-empty piece is used as the list marker
            (presumably the docx2python run holding e.g. ``"1)"`` -- confirm
            against the caller).

    Returns:
        Four spaces per nesting level, then the marker, then one space.
    """
    level_node = simplified_descendants(attrs)[0]
    val_key = ns_tag_name(level_node, "val")
    depth = int(level_node.attrib[val_key])
    # Leading tabs produce empty pieces on split; drop them to find the marker.
    pieces = list(filter(None, dx2py_elem[0].split("\t")))
    marker = pieces[0]
    return "".join(("    " * depth, marker, " "))


def main(doc):
    """Convert a .docx file to text, prefixing list paragraphs with markers.

    The document is read twice: with python-docx (paragraph XML and text)
    and with docx2python (run strings that carry the rendered list marker).
    Paragraphs are paired by position between the two views.

    Args:
        doc: the document to convert, passed unchanged to both
            ``docx.Document`` and ``docx2python``.

    Returns:
        The converted text, one line per paragraph, each terminated by
        ``"\\n"``. If the two libraries report different paragraph counts, a
        message is printed and ``-1`` is returned instead.
    """
    docd = docx.Document(doc)
    docdpy = dx2py(doc)
    # NOTE(review): only the [0][0][0] slice of document_runs is used --
    # presumably the paragraphs of the main body when the document has no
    # tables; confirm against the docx2python version in use.
    docdpy_runs = docdpy.document_runs[0][0][0]
    if len(docd.paragraphs) != len(docdpy_runs):
        print("Lengths don't match. Abort")
        return -1

    # Path pPr -> numPr -> ilvl marks a list paragraph and gives its level.
    # ("numId" could be added to the last group to match entries from
    # word/numbering.xml.)
    subnode_tags = (("pPr",), ("numPr",), ("ilvl",))

    # Collect lines and join once instead of growing a string with +=.
    lines = []
    for par, runs in zip(docd.paragraphs, docdpy_runs):
        numbered_attrs = descendants(par._element, subnode_tags)
        prefix = process_list_data(numbered_attrs, runs) if numbered_attrs else ""
        lines.append(prefix + par.text + "\n")
    return "".join(lines)
Sven
  • 1,014
  • 1
  • 11
  • 27