So I have a client (this could only come from the government) who has a bunch of MS Word docs they want entered into a database, and short of manual entry, I feel like converting them to XML and parsing them using a utility program would be the best course of action.
I have a utility that does this, using code I found here on Stack Overflow:
// Converts every Word document in the "testfiles" folder to Word 2003 XML
// (WordML) via Word automation, then loads each generated file and selects
// its text runs (w:t), paragraphs (w:p) and tab elements (w:tab).
Microsoft.Office.Interop.Word.Application word = new Microsoft.Office.Interop.Word.Application();
object oMissing = System.Reflection.Missing.Value;
try
{
    DirectoryInfo dirInfo = new DirectoryInfo(Server.MapPath("\\testfiles"));
    FileInfo[] wordFiles = dirInfo.GetFiles("*.doc");
    word.Visible = false;
    word.ScreenUpdating = false;

    foreach (FileInfo wordFile in wordFiles)
    {
        object filename = wordFile.FullName;

        // Path.ChangeExtension swaps only the real extension. The previous
        // Replace(".doc", ".xml") was case-sensitive and matched anywhere in
        // the path: "FOO.DOC" produced an unchanged path (so SaveAs targeted
        // the source document itself), "foo.docx" became "foo.xmlx", and a
        // directory name containing ".doc" was corrupted.
        string xmlPath = Path.ChangeExtension(wordFile.FullName, ".xml");
        object outputFileName = xmlPath;

        Document doc = word.Documents.Open(ref filename, ref oMissing,
            ref oMissing, ref oMissing, ref oMissing, ref oMissing, ref oMissing,
            ref oMissing, ref oMissing, ref oMissing, ref oMissing, ref oMissing,
            ref oMissing, ref oMissing, ref oMissing, ref oMissing);
        try
        {
            doc.Activate();
            object fileFormat = WdSaveFormat.wdFormatXML;
            doc.SaveAs(ref outputFileName, ref fileFormat, ref oMissing,
                ref oMissing, ref oMissing, ref oMissing, ref oMissing,
                ref oMissing, ref oMissing, ref oMissing, ref oMissing,
                ref oMissing, ref oMissing, ref oMissing, ref oMissing);
        }
        finally
        {
            // Close the document even when SaveAs throws, so the file is not
            // left open inside the hidden Word instance.
            object saveChanges = WdSaveOptions.wdDoNotSaveChanges;
            ((_Document)doc).Close(ref saveChanges, ref oMissing, ref oMissing);
            doc = null;
        }

        // A fresh XmlDocument per file keeps one file's content from
        // lingering if a later Load fails.
        XmlDocument xmlDoc = new XmlDocument();
        xmlDoc.Load(xmlPath);

        XmlNamespaceManager nsmgr = new XmlNamespaceManager(xmlDoc.NameTable);
        nsmgr.AddNamespace("w", "http://schemas.microsoft.com/office/word/2003/wordml");

        // The root element of a WordML file is <w:wordDocument>, not
        // <w:document>; anchoring on the wrong name is why the query matched
        // nothing. Anchor on w:body instead of w:wordDocument if only the
        // document body is wanted (this form also returns w:tab/w:p elements
        // that appear outside the body, if any).
        XmlNodeList node = xmlDoc.SelectNodes("//w:wordDocument/descendant::w:t|//w:wordDocument/descendant::w:p|//w:wordDocument/descendant::w:tab", nsmgr);
    }
}
finally
{
    // Always shut Word down; otherwise an exception above leaves an
    // invisible WINWORD process running.
    ((_Application)word).Quit(ref oMissing, ref oMissing, ref oMissing);
    word = null;
}
Now, my XML file(s) look like this:
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<?mso-application progid="Word.Document"?>
<w:wordDocument xmlns:aml="http://schemas.microsoft.com/aml/2001/core"
xmlns:dt="uuid:C2F41010-65B3-11d1-A29F-00AA00C14882"
xmlns:ve="http://schemas.openxmlformats.org/markup-compatibility/2006"
xmlns:o="urn:schemas-microsoft-com:office:office"
xmlns:v="urn:schemas-microsoft-com:vml"
xmlns:w10="urn:schemas-microsoft-com:office:word"
xmlns:w="http://schemas.microsoft.com/office/word/2003/wordml"
xmlns:wx="http://schemas.microsoft.com/office/word/2003/auxHint"
xmlns:wsp="http://schemas.microsoft.com/office/word/2003/wordml/sp2"
xmlns:sl="http://schemas.microsoft.com/schemaLibrary/2003/core"
w:macrosPresent="no"
w:embeddedObjPresent="no"
w:ocxPresent="no"
xml:space="preserve">
<w:ignoreSubtree w:val="http://schemas.microsoft.com/office/word/2003/wordml/sp2"/>
<o:DocumentProperties>
...
</o:DocumentProperties>
<w:fonts>
...
</w:fonts>
<w:lists>
...
</w:lists>
<w:styles>
...
</w:styles>
<w:shapeDefaults>...</w:shapeDefaults>
<w:docPr>...</w:docPr>
<w:body>
<w:p wsp:rsidR="00B01775" wsp:rsidRDefault="00B01775">
<w:pPr>
<w:tabs>
<w:tab w:val="left" w:pos="3312"/>
<w:tab w:val="left" w:pos="4032"/>
<w:tab w:val="left" w:pos="5616"/>
</w:tabs><w:ind w:right="-576"/>
</w:pPr>
</w:p>
<w:p wsp:rsidR="00B01775" wsp:rsidRDefault="00B01775">
<w:pPr>
<w:jc w:val="center"/>
<w:rPr>
<w:b/>
</w:rPr>
</w:pPr>
<w:r>
<w:rPr>
<w:b/>
</w:rPr>
<w:t>blah blah blach this is sample text</w:t>
</w:r>
</w:p>
<w:p wsp:rsidR="00B01775" wsp:rsidRDefault="00B01775">
<w:pPr>
<w:jc w:val="center"/>
</w:pPr>
<w:r>
<w:rPr>
<w:b/>
</w:rPr>
<w:t>More sample text</w:t>
</w:r>
</w:p>
<w:p wsp:rsidR="00B01775" wsp:rsidRDefault="00B01775"/>
<w:p wsp:rsidR="00B01775" wsp:rsidRDefault="00B01775"/>
<w:p wsp:rsidR="00B01775" wsp:rsidRDefault="00B01775"/>
<w:p wsp:rsidR="00B01775" wsp:rsidRDefault="00B01775"/>
<w:p wsp:rsidR="00B01775" wsp:rsidRDefault="00B01775">
<w:r>
<w:t>Sample Header</w:t>
</w:r>
</w:p>
<w:p wsp:rsidR="00B01775" wsp:rsidRDefault="00B01775"/>
<w:p wsp:rsidR="00B01775" wsp:rsidRDefault="00B01775">
<w:pPr>
<w:pStyle w:val="BodyText"/>
</w:pPr>
<w:r>
<w:rPr>
<w:snapToGrid w:val="off"/>
</w:rPr>
<w:t>Sample Body text.......</w:t>
</w:r>
</w:p>
</w:body>
</w:wordDocument>
I'm no pro, but I think I'm following the letter of the law pretty well here by declaring the namespace manager correctly, so why, then, am I getting a null return on the node(s) I am trying to select?
XmlNodeList node = xmlDoc.SelectNodes("//w:document/descendant::w:t|//w:document/descendant::w:p|//w:document/descendant::w:tab", nsmgr);
Am I missing something?