0

Is there a good way to convert a document into JSON representation to then display on a web page? (It is a requirement that the document is converted to JSON)

If there isn't a built-in way to do this, my idea is to represent the Run/Paragraph structure as JSON objects, but I feel this wouldn't work as well once I start working with more complex Word documents.

bcgilmartin
  • 29
  • 1
  • 4

1 Answer

1

If you add:

<!-- Jackson XML dataformat module; supplies the XmlMapper class imported by the example code. -->
<dependency>
    <groupId>com.fasterxml.jackson.dataformat</groupId>
    <artifactId>jackson-dataformat-xml</artifactId>
    <version>2.11.3</version>
</dependency>

you can try something like:

import org.docx4j.Docx4J;
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;

import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.DeserializationContext;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.deser.std.JsonNodeDeserializer;
import com.fasterxml.jackson.databind.module.SimpleModule;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.JsonNodeFactory;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.fasterxml.jackson.dataformat.xml.XmlMapper;

public class ConvertOutJSON  {

    static String inputfilepath = System.getProperty("user.dir") + "/sample-docs/sample-docxv2.docx";

    public static void main(String[] args)
            throws Exception {
        
        
        WordprocessingMLPackage wordMLPackage 
            = Docx4J.load(new java.io.File(inputfilepath));


        String xml = wordMLPackage.getMainDocumentPart().getXML();

        //System.out.println(xml);

        XmlMapper xmlMapper = new XmlMapper();
        JsonNode node = xmlMapper.readTree(xml);

        ObjectMapper jsonMapper = new ObjectMapper();
        //String json = jsonMapper.writeValueAsString(node);
        String json = jsonMapper.writerWithDefaultPrettyPrinter().writeValueAsString(node);

        System.out.println(json);

    }    
}

However in a quick test, I noticed some w:p nodes were not being emitted as JSON. I haven't looked to see whether they get dropped by Jackson at the readTree step or when ObjectMapper writes its output; you'll need to dig into Jackson to fix that.

It is currently producing output like:

{
  "Ignorable" : "w14 wp14",
  "body" : {
    "p" : {
      "rsidR" : "00D15781",
      "rsidRDefault" : "00D15781",
      "pPr" : {
        "ind" : {
          "left" : "0"
        }
      }
    },
    "tbl" : {
      "tblPr" : {
        "tblStyle" : {
          "val" : "TableGrid"
        },
        "tblW" : {
          "w" : "0",
          "type" : "auto"
        },
        "tblLook" : {
          "firstRow" : "1",
          "lastRow" : "0",
          "firstColumn" : "1",
          "lastColumn" : "0",
          "noHBand" : "0",
          "noVBand" : "1",
          "val" : "04A0"
        }
      },
      "tblGrid" : {
        "gridCol" : {
          "w" : "3561"
        }
      },
      "tr" : {
        "rsidR" : "00D15781",
        "tc" : {
          "tcPr" : {
            "tcW" : {
              "w" : "7122",
              "type" : "dxa"
            },
            "gridSpan" : {
              "val" : "2"
            }
          },
          "p" : {
            "rsidR" : "00D15781",
            "rsidRDefault" : "00945132",
            "pPr" : {
              "ind" : {
                "left" : "0"
              }
            },
            "r" : {
              "t" : "Horizontal merge"
            }
          }
        }
      }
    },
    "sectPr" : {
      "rsidR" : "00D15781",
      "headerReference" : {
        "type" : "default",
        "id" : "rId12"
      },
      "pgSz" : {
        "w" : "11907",
        "h" : "16839",
        "code" : "9"
      },
      "pgMar" : {
        "top" : "720",
        "right" : "720",
        "bottom" : "720",
        "left" : "720",
        "header" : "720",
        "footer" : "720",
        "gutter" : "0"
      },
      "cols" : {
        "space" : "720"
      },
      "docGrid" : {
        "linePitch" : "360"
      }
    }
  }
}
JasonPlutext
  • 15,352
  • 4
  • 44
  • 84
  • Awesome, this worked pretty well for me. For some reason, only the last paragraph was being printed out. I changed the XML reading to look more like this: https://stackoverflow.com/a/1823328/14216932 – bcgilmartin Oct 08 '20 at 20:34
  • Follow-up question (let me know if this deserves a new question on Stack Overflow): is there a way to set the XML after you change it? For example, let's say I wanted to change something in a text tag. Can I change the text in the JSON, convert it back to an XML string, and set it in the MainDocumentPart? – bcgilmartin Oct 08 '20 at 20:37