Is there a way to convert a Microsoft Word document to a string without using the Microsoft COM component? I am hoping there is some other way to strip out all of the excess markup.
EDIT 12/13/13: We didn't want to reference the COM component because, if the customer didn't have the exact same version of Office installed, it wouldn't work. Luckily, Microsoft has made the 2013 Word interop DLL (Microsoft.Office.Interop.Word.dll) backward compatible, so we no longer have to worry about this restriction. After referencing the DLL, we can do the following:
/// <summary>
/// Gets the text content of a Word document by automating Word through the interop assembly.
/// </summary>
/// <param name="filePath">The path to the Word document file.</param>
/// <returns>
/// The document text normalized to Unicode form C, or an empty string when the
/// document could not be opened or contains no text.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="filePath"/> is null or empty.</exception>
/// <exception cref="FileNotFoundException">No file exists at <paramref name="filePath"/>.</exception>
public string ExtractText(string filePath)
{
    if (string.IsNullOrEmpty(filePath))
    {
        throw new ArgumentNullException("filePath", "Input file path not specified.");
    }

    if (!File.Exists(filePath))
    {
        // The second argument is the offending file name, so report the real path
        // rather than a fixed literal.
        throw new FileNotFoundException("Input file not found at specified path.", filePath);
    }

    // File.Exists resolves relative paths against this process's current directory,
    // whereas Word resolves them against its own working folder. Hand Word an
    // absolute path so both refer to the same file.
    var fullPath = Path.GetFullPath(filePath);

    var resultText = string.Empty;
    Application wordApp = null;
    try
    {
        wordApp = new Application();

        // Arguments: FileName, ConfirmConversions (default), ReadOnly = true.
        var doc = wordApp.Documents.Open(fullPath, Type.Missing, true);
        if (doc != null)
        {
            try
            {
                // Each property access is a COM call; read Content and Text once.
                var content = doc.Content;
                var text = content != null ? content.Text : null;
                if (!string.IsNullOrEmpty(text))
                {
                    resultText = text.Normalize();
                }
            }
            finally
            {
                // Always close the document, even if reading it failed, and never
                // save or prompt: it was opened read-only purely for extraction.
                doc.Close(false);
            }
        }
    }
    finally
    {
        // Shut down the Word instance we started, discarding any changes.
        if (wordApp != null)
        {
            wordApp.Quit(false, Type.Missing, false);
        }
    }

    return resultText;
}