I am currently trying to convert a Word document (.doc) into a plain-text document because I want to use regular expressions on it to find things in the document. I came up with the code below, which converts the Word document into rich text (by appending it to a rich text box), but this does not translate into a plain-text format. When I tried it with a regular text document, it printed every word on a new line. I have not been able to find any information on how to do this in C#. I'm using C# and Visual Studio 2010.
I do not expect any special characters in the document (like bold, underlines, etc.), but if someone knows how I can be robust and extract those as well, that would be super awesome.
I want it as a text document because there are several methods I know I can use on regular text, but I doubt they would work on Word text due to the hidden/special characters that come with Word documents.
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using Microsoft.Office.Interop.Word;
namespace ReadWordDocProject
{
    /// <summary>
    /// Form whose button handler opens a Word document through Word automation
    /// and appends the document's plain text to <c>richTextBox1</c>.
    /// </summary>
    public partial class Form1 : Form
    {
        /// <summary>
        /// Creates the form and initializes its designer-generated components.
        /// </summary>
        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Opens the test Word document, reads its text in a single call and
        /// appends that text to <c>richTextBox1</c>.
        /// </summary>
        /// <param name="sender">The control that raised the event (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void button1_Click(object sender, EventArgs e)
        {
            string testFile = @"C:\Users\<mycomputer>\Documents\TestItemHelpers\TestWordDoc.docx";
            Microsoft.Office.Interop.Word.Application application = new Microsoft.Office.Interop.Word.Application();
            Document document = null;

            try
            {
                document = application.Documents.Open(testFile);//path here

                // Read the whole document body at once instead of walking
                // document.Words one item at a time: every Words[i] access is a
                // separate COM round-trip into the Word process, which is very
                // slow for anything but tiny documents.
                // NOTE(review): Word is documented to use '\r' as its paragraph
                // mark; normalize line endings before line-oriented regex work
                // if that matters for the patterns being used.
                string text = document.Content.Text;

                //Do output with text here
                richTextBox1.AppendText(text);
            }
            finally
            {
                // Always tear Word down, even when opening or reading fails;
                // otherwise an invisible WINWORD.EXE process is left running.
                try
                {
                    if (document != null)
                    {
                        // Cast to _Document because Close is ambiguous between
                        // the method and the event of the same name.
                        ((_Document)document).Close(WdSaveOptions.wdDoNotSaveChanges);
                    }
                }
                finally
                {
                    // Cast to _Application because Quit is ambiguous between the
                    // method and the event of the same name. Never prompt to
                    // save: the application window is not visible to the user.
                    ((_Application)application).Quit(WdSaveOptions.wdDoNotSaveChanges);
                }
            }
        }
    }
}