1

I'm creating a spyware program that crawls the entire file system of a computer, looks for any text it can find, and sends it to my accompanying web service. The problem I'm having is that, once I have a file, it is either a type of file from which I can extract text (e.g. a .txt, .docx, or .xml file) or it is not. If it is, I want to extract the text from it. What I have right now is:

    private string _accumulatedInfo;

    public FileCrawler ( )
    {
        this._accumulatedInfo = String.Empty;

    }

    private void GrabInfo ( System.IO.FileInfo fi )
    {
        // if can parse text out of file, add text to accumulated info string
        // ... 
    }

    private void _WalkDirectoryTree ( System.IO.DirectoryInfo root )
    {
        System.IO.FileInfo[] files = root.GetFiles("*.*");
        if ( files != null )
        {
            foreach ( System.IO.FileInfo fi in files )
            {
                GrabInfo(fi);   
            }
        }

        System.IO.DirectoryInfo[] subDirs = root.GetDirectories();
        if ( subDirs != null )
        {
            foreach ( System.IO.DirectoryInfo dirInfo in subDirs )
            {
                this._WalkDirectoryTree(dirInfo);
            }
        }
    }

    private void CrawlAllDrives ( )
    {
        string[] drives = System.Environment.GetLogicalDrives();
        foreach ( string dr in drives )
        {
            System.IO.DriveInfo di = new System.IO.DriveInfo(dr);
            if ( di.IsReady )
            {
                System.IO.DirectoryInfo rootDir = di.RootDirectory;
                this._WalkDirectoryTree(rootDir);
            }
        }
    }

and I'm wondering how to, or whether it's even possible to, implement my

    private void GrabInfo ( System.IO.FileInfo fi )
    {
        // if can parse text out of file, add text to accumulated info string
        // ... 
    }

method without resorting to something like

    private void GrabInfo ( System.IO.FileInfo fi )
    {
        switch (fi.Extension)
        {
             case "txt":
                // ... 
             case "docx":
                // ...
             // ... 
        }
    }

Does there exist some generic way of extracting text from a file?

user6048670
  • 2,861
  • 4
  • 16
  • 20
  • 1
    I'm not sure what you mean by "generic way of extracting text", but you can use `File.ReadAllLines` or its equivalents? – Noctis Apr 11 '16 at 04:55
  • Do you mean like, a way to generically get the text of any kind of file, including rtf, txt, docx, etc. via one single function call? – RoyalPotato Apr 11 '16 at 05:02

1 Answer

2

You can do something like the following:

System.IO.DirectoryInfo path =  new DirectoryInfo( @"c:\temp");

System.IO.FileInfo[] files = path.GetFiles("*.*");
if ( files != null )
{
    foreach ( System.IO.FileInfo fi in files.Where(f => MimeMapping.GetMimeMapping(f.FullName).StartsWith("text/")))
    {
        try 
        {
            var text = File.ReadAllText(fi.FullName);
        }
        catch 
        {
            // something bad happened
        }
    }
}

This will give you back all the types that are "text" like.

You can read more about it in this answer.

Community
  • 1
  • 1
Noctis
  • 11,507
  • 3
  • 43
  • 82