How to OCR serially with MODI (Microsoft Office Document Imaging) in C#

Question

I have documents scanned as .jpg pictures in a folder, and I would like to run OCR in C# serially on each of the documents in that folder. So far I have done this:

/// <summary>
/// Converts every file in a folder to JPG (via AlltoJPG), runs MODI OCR on each
/// resulting JPG one at a time, and writes the recognized text to a .txt file
/// next to the image.
/// </summary>
/// <param name="directoryPath">
/// Folder to scan. When null or empty, "OCRTempPictures" under the current
/// user's My Pictures folder is used.
/// </param>
/// <returns>
/// Text recognized from the last image that was OCR'd successfully, or an
/// empty string when nothing was recognized.
/// </returns>
public string CheckFilesAndDoOCR(string directoryPath)
{
    // Environment.SpecialFolder.MyPictures is only an enum value; concatenating it
    // yields the literal text "MyPictures". GetFolderPath resolves the real folder.
    string defaultFolder = Path.Combine(
        Environment.GetFolderPath(Environment.SpecialFolder.MyPictures),
        "OCRTempPictures");

    if (string.IsNullOrEmpty(directoryPath))
    {
        directoryPath = defaultFolder;
    }

    string TheTxt = "";

    foreach (string filePath in Directory.GetFiles(directoryPath))
    {
        // FileInfo
        FileInfo nfo = new FileInfo(filePath);

        // Get new file name
        string fileName = AlltoJPG(nfo);

        // FileInfo (New File)
        FileInfo foo = new FileInfo(fileName);

        // Check for JPG File Format (any letter case)
        if (!string.Equals(foo.Extension, ".jpg", StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        try
        {
            // OCR Operations...
            MODI.Document md = new MODI.Document();
            md.Create(foo.FullName);
            try
            {
                md.OCR(MODI.MiLANGUAGES.miLANG_ENGLISH, false, false);
                MODI.Image image = (MODI.Image)md.Images[0];
                TheTxt = image.Layout.Text;
            }
            finally
            {
                // Always close the document, even when OCR fails, so the image
                // file is not left open by MODI.
                md.Close(false);
            }

            // Create text file with the same Image file name
            string txtFileName = Path.Combine(
                foo.DirectoryName,
                Path.GetFileNameWithoutExtension(foo.Name) + ".txt");

            // FileMode.CreateNew throws if the text file already exists; that
            // failure is logged below like any other.
            using (FileStream createFile = new FileStream(txtFileName, FileMode.CreateNew))
            using (StreamWriter writeFile = new StreamWriter(createFile))
            {
                // Save the image text in the text file
                writeFile.Write(TheTxt);
            }
        }
        catch (Exception ex)
        {
            // Expected errors
            string LogPath = Path.Combine(defaultFolder, "OCRInfo.txt");
            Logger(LogPath, "| Exception: Source[" + ex.Source + "] Message[" + ex.Message + "] InnerException[" + ex.InnerException + "] StackTrace[" + ex.StackTrace + "] | ");
        }
    }
    return TheTxt;
}

But MODI gives either the "OCR running!" error or the "Can't reach file. File is in use." error.

Depending on the situation:

How can I avoid these error?
Is there any way to stop the OCR action and release all the objects in use?

If anyone can answer any of the questions above it would be appreciated.

Did you check this thread? http://stackoverflow.com/questions/6699740/ocr-running-error-when-using-modi-2003-with-c-sharp This is a generic error which means that MODI has trouble recognizing the bitmap — Panagiotis Kanavos, Jun 18 '12 at 13:03
@PanagiotisKanavos yes i did! but those answers doesnt solves my issue.. it recognizes all characters and im using jpeg files and also after working on it for a real long time i found out most of the issues but still most crazy issue exists. Its not letting me move-delete that file that i had the ocr results. idk why it does that. says the file still being used. ill update question. — Berker Yüceer, Jun 18 '12 at 13:23
The reason you are getting this error is because you are attempting to process multiple images at once. Implement code to prevent this. — Security Hound, Jun 18 '12 at 14:19
@Ramhound ahh crap! you are right. i dont know how much i can thank to you! You should post that as an answer so i can give you the rep. — Berker Yüceer, Jun 18 '12 at 14:45

score 2 · Accepted Answer · answered Jun 20 '12 at 15:06

Here is the fully working code! thanks to @Ramhound

Below code just specifies a folder full of pictures and one by one does OCR scan on them.

    /// <summary>
    /// Runs OCR on every JPG image inside a folder, one file at a time,
    /// writes the recognized text to a .txt file next to each image
    /// and then deletes the processed image.
    /// </summary>
    /// <param name="directoryPath"> Path to Folder </param>
    /// <returns>
    /// Text recognized from the last JPG processed, or an empty string
    /// when the folder contains no JPG files.
    /// </returns>
    public string CheckFileAndDoOCR(string directoryPath)
    {
        string TheTxt = "";

        foreach (string filePath in Directory.GetFiles(directoryPath))
        {
            // FileInfo
            FileInfo foo = new FileInfo(filePath);

            // Check for JPG File Format (any letter case); skip everything else
            if (!string.Equals(foo.Extension, ".jpg", StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            // Start OCR Procedure
            TheTxt = DoOCR(foo.FullName);

            // Create TXT file next to ImageFile. ChangeExtension only touches the
            // trailing extension, unlike a Replace over the whole file name.
            string txtFileName = Path.ChangeExtension(foo.FullName, ".txt");

            // WriteAllText creates or truncates the file, so a shorter result never
            // leaves stale text from a previous run, and the handle is always released.
            File.WriteAllText(txtFileName, TheTxt);

            // Delete used pictures (Optional)
            // Only the picture that was just processed is removed; other files in
            // the folder are left untouched.
            /*--------------------------------------------------------------------*/
            try
            {
                foo.Delete();
            }
            catch (Exception ex)
            {
                Logger(LogPath, "| Exception: Source[" + ex.Source + "] Message[" + ex.Message +
                    "] InnerException[" + ex.InnerException + "] StackTrace[" + ex.StackTrace + "] | ");
            }
            /*--------------------------------------------------------------------*/
        }
        return TheTxt;
    }
    // DoOCR
    // 
    /// <summary>
    /// Start an OCR scan on given ImageFile and release the MODI document
    /// afterwards, whether or not the scan succeeded.
    /// </summary>
    /// <param name="FullPath"> Path to ImageFile </param>
    /// <returns> Text of the first image in the document </returns>
    public string DoOCR(string FullPath)
    {
        string txt;

        // OCR Operations...
        MODI.Document md = new MODI.Document(); // Create MODI.Document
        MODI.Image image = null;
        MODI._IDocumentEvents_OnOCRProgressEventHandler progressHandler =
            new MODI._IDocumentEvents_OnOCRProgressEventHandler(this.ShowProgress);

        try
        {
            md.Create(FullPath); // Fill MODI.Document with my file
            try
            {
                // Showprogress of OCR
                md.OnOCRProgress += progressHandler;
                // Begin OCR
                md.OCR(MODI.MiLANGUAGES.miLANG_ENGLISH, false, false);
                // Image from file
                image = (MODI.Image)md.Images[0];
                txt = image.Layout.Text;
                // Optionally you can get only first word by using word.Text
                // Words from Image :
                //   MODI.Word word = image.Layout.Words[0];
                // Text from first Word :
                //   txt = word.Text;
            }
            finally
            {
                // Close OCR even when the scan throws, so the next file can be processed
                md.OnOCRProgress -= progressHandler;
                md.Close(false);
            }
        }
        finally
        {
            // Drop the references before collecting so the COM wrappers are
            // eligible for finalization.
            image = null;
            md = null;

            // Finalize
            GC.Collect();
            GC.WaitForPendingFinalizers();
        }

        // Return Text
        return txt;
    }

score 1 · Answer 2 · edited Jan 21 '14 at 10:43

1

This is because Doc1.OCR() checks for a multi-page TIFF file; if you give it only a single-page file, it shows that error. Try using a multi-page TIFF file.

edited Jan 21 '14 at 10:43

Shoaib Chikate

8,665
12
47
70

answered Jan 21 '14 at 10:16

Akash Dhone

11
1

How to OCR serially with MODI (Microsoft Office Document Imaging) in C#

2 Answers2