2

I am creating a tool which analyze the data quality of a file. So i need to read each line of the file and analyze each one of them. I also need to store in memory all the lines of my file as the user will be able to drill down to specific sections. So basically all works fine for a file containing thousands of rows. However when trying with a CSV file containing more than 4 millions rows i m getting an out of memory exception. I thought C# would be able to handle a few millions of data in its memory cache but does not seem like it. So i am a bit stuck and dont know what to do. Maybe my piece of code is not the most performant so if you can tell me a way to improve it that would be great? Just to keep in mind that i need to have all lines of the file in memory because depending of the user's action i need to access speciifc lines to display them to the user.

Below is the call which reads every single line

using (FileStream fs = File.Open(this.dlgInput.FileName.ToString(),   FileMode.Open, FileAccess.Read, FileShare.Read))
using (BufferedStream bs = new BufferedStream(fs))
using (System.IO.StreamReader sr = new  StreamReader(this.dlgInput.FileName.ToString(), Encoding.Default, false, 8192))
{
    string line;
    if (this.chkSkipHeader.Checked)
    {
        sr.ReadLine();
    }

    progressBar1.Visible = true;
    int nbOfLines = File.ReadLines(this.dlgInput.FileName.ToString()).Count();
    progressBar1.Maximum = nbOfLines;

    this.lines = new string[nbOfLines][];
    this.patternedLines = new string[nbOfLines][];
    for (int i = 0; i < nbOfLines; i++)
    {
        this.lines[i] = new string[this.dgvFields.Rows.Count];
        this.patternedLines[i] = new string[this.dgvFields.Rows.Count];
    }

    // Read and display lines from the file until the end of 
    // the file is reached.
    while ((line = sr.ReadLine()) != null)
    {
        this.recordCount += 1;
        char[] c = new char[1] { ',' };
        System.Text.RegularExpressions.Regex CSVParser = new System.Text.RegularExpressions.Regex(",(?=(?:[^\"]*\"[^\"]*\")*(?![^\"]*\"))");
        String[] fields = CSVParser.Split(line);
        ParseLine(fields);
        this.lines[recordCount - 1] = fields;
        progressBar1.PerformStep();
    }
}

And below is the ParseLine function which also keeps in memory via arrays some analysis needed:

private void ParseLine(String[] fields2)
{
    for (int j = 0; j <= fields2.Length - 1; j++)
    {
        if ((int)this.dgvFields.Rows[j].Cells["colSelected"].Value == 1)
        {
            /*' ************************************************
            ' Save Number of Counts by  Value
            ' ************************************************/

            if (this.values[j].ContainsKey(fields2[j]))
            {
                //values[0] = Dictionary<"TEST", 1> (fields2[0 which is source code] = count])
                this.values[j][fields2[j]] += 1;
            }
            else
            {
                this.values[j].Add(fields2[j], 1);
            }

            /* ' ************************************************
            ' Save Pattern Values/Counts
            ' ************************************************/

            string tmp = System.Text.RegularExpressions.Regex.Replace(fields2[j], "\\p{Lu}", "X");
            tmp = System.Text.RegularExpressions.Regex.Replace(tmp, "\\p{Ll}", "x");
            tmp = System.Text.RegularExpressions.Regex.Replace(tmp, "[0-9]", "0");


            if (this.patterns[j].ContainsKey(tmp))
            {
                this.patterns[j][tmp] += 1;
            }
            else
            {
                this.patterns[j].Add(tmp, 1);
            }

            this.patternedLines[this.recordCount - 1][j] = tmp;
            /* ' ************************************************
             ' Count Blanks/Alpha/Numeric/Phone/Other
             ' ************************************************/


            if (String.IsNullOrWhiteSpace(fields2[j]))
            {
                this.blanks[j] += 1;
            }
            else if (System.Text.RegularExpressions.Regex.IsMatch(fields2[j], "^[0-9]+$"))
            {
                this.numeric[j] += 1;
            }
            else if (System.Text.RegularExpressions.Regex.IsMatch(fields2[j].ToUpper().Replace("EXTENSION", "").Replace("EXT", "").Replace("X", ""), "^[0-9()\\- ]+$"))
            {
                this.phone[j] += 1;
            }
            else if (System.Text.RegularExpressions.Regex.IsMatch(fields2[j], "^[a-zA-Z ]+$"))
            {
                this.alpha[j] += 1;
            }
            else
            {
                this.other[j] += 1;
            }

            if (this.recordCount == 1)
            {
                this.high[j] = fields2[j];
                this.low[j] = fields2[j];
            }
            else
            {
                if (fields2[j].CompareTo(this.high[j]) > 0)
                {
                    this.high[j] = fields2[j];
                }

                if (fields2[j].CompareTo(this.low[j]) < 0)
                {
                    this.low[j] = fields2[j];
                }
            }
        }
    }
}

UPDATE: new piece of code

int nbOfLines = File.ReadLines(this.dlgInput.FileName.ToString()).Count();
        //Read file

        using (System.IO.StreamReader sr = new StreamReader(this.dlgInput.FileName.ToString(), Encoding.Default, false, 8192))
        {
            string line;
            if (this.chkSkipHeader.Checked)
            { sr.ReadLine(); }
            progressBar1.Visible = true;

            progressBar1.Maximum = nbOfLines;
            this.lines = new string[nbOfLines][];
            this.patternedLines = new string[nbOfLines][];
            for (int i = 0; i < nbOfLines; i++)
            {
                this.lines[i] = new string[this.dgvFields.Rows.Count];
                this.patternedLines[i] = new string[this.dgvFields.Rows.Count];
            }

            // Read and display lines from the file until the end of 
            // the file is reached.
            while ((line = sr.ReadLine()) != null)
            {
                this.recordCount += 1;
                char[] c = new char[1] { ',' };
                System.Text.RegularExpressions.Regex CSVParser = new System.Text.RegularExpressions.Regex(",(?=(?:[^\"]*\"[^\"]*\")*(?![^\"]*\"))");
                String[] fields = CSVParser.Split(line);
                ParseLine(fields);
                this.lines[recordCount - 1] = fields;
                progressBar1.PerformStep();
            }
        }
Mélanie
  • 31
  • 2
  • 10
  • 2
    Please format your code correctly – byxor Aug 24 '16 at 14:30
  • 7
    c# can't create memory out of nothing. if you have more data than will fit into your system's ram and/or virtual ram, then you're stuck. either change how your code works to reduce the memory load, or get more memory. – Marc B Aug 24 '16 at 14:32
  • 1
    i have 4 cores and 16GB of RAM - is not it enough for a 4Millions rows file? – Mélanie Aug 24 '16 at 14:33
  • 1
    Stream through the file instead of loading it all into memory at one time. – ManoDestra Aug 24 '16 at 14:36
  • I m using the StreamReader is not that streaming through it? If you mean something else can you please clarify what to use exactly as i m a bit new in file processing. thanks – Mélanie Aug 24 '16 at 14:38
  • You're reading the whole file here: `int nbOfLines = File.ReadLines(this.dlgInput.FileName.ToString()).Count();`. You're streaming through the file and for each line you're reading the entire file into memory. You could do that count line above your loop, if you absolutely must have the count. The number of lines in the file isn't going to change each time you go through the loop. – ManoDestra Aug 24 '16 at 14:41
  • You're streaming through the file, but you're *stil* loading the *entire* thing into memory. – Tim Aug 24 '16 at 14:42
  • Also, why are you parsing the file manually? There are a number of libraries available to do this for you, and there's also the [TextFieldParser](https://msdn.microsoft.com/en-us/library/microsoft.visualbasic.fileio.textfieldparser(v=vs.110).aspx) – Tim Aug 24 '16 at 14:45
  • Also, you're using multiple streams unnecessarily in your using statements. You don't even seem to be using the fs or bs variables at all? – ManoDestra Aug 24 '16 at 14:45
  • 2
    File.ReadLines does not load entire file into memory. It uses internally FileStream, and returns IEnumerable. – apocalypse Aug 24 '16 at 14:50
  • It's still nested within another request though. As well as several streams being open at the same time on the file. That's the key problem here. There's no clean process through the file. – ManoDestra Aug 24 '16 at 14:55
  • @ManoDestra : i m calling the File.Readline before my loop already – Mélanie Aug 24 '16 at 14:57
  • And you're calling File.ReadLines INSIDE your stream. Do that call first to get the count of lines in the file before you start processing your file. And lose any streams that you do not require, instead creating one from another as required. – ManoDestra Aug 24 '16 at 14:58
  • @Tim: i m parsing the file to analyze the data such as for eaxmple getting the number of blank values for each column or getting the min/max value par column. You say i m loading the entire thing in memory: is that because i call the file.readline? If so is there another way to get the total number of rows from a file? – Mélanie Aug 24 '16 at 15:01
  • @Mélanie - You're right - I missed the call to `ParseLine` in your `while` loop, sorry. – Tim Aug 24 '16 at 15:02
  • @Mélanie Move that line I referred to above (regarding File.ReadLines, getting the count) BEFORE your stream reader code. You only need to get a count of the number of lines once to enable you to do your progress calculation. Then, stream through the file processing it line by line, only creating the streams that are absolutely required to do what you need to do with the file. – ManoDestra Aug 24 '16 at 15:03
  • @ManoDestra: ok i removed the 2 fisrt "using" and move the readLine before the using streamREader - thanks for this. Still getting the out of memory exception unfortunately – Mélanie Aug 24 '16 at 15:04
  • What's the file size of the file that you're processing? – ManoDestra Aug 24 '16 at 15:50
  • more than 4millions rows, 715MB – Mélanie Aug 25 '16 at 14:54

4 Answers4

0

C# has a limit on how big single objects can be (hence the exception). Consider the fact that, even if every string in your array was 1 byte, 4 million bytes would still be around 4 gigabytes and, as I understand it, the default max size of a single object in .NET is 2 gigabytes. This is true regardless of how much memory you have in your system overall.

There are a couple of articles available on Stack Overflow how to create big arrays: I need very big array length(size) in C# and OutOfMemoryException on declaration of Large Array

As I understand it, this is partially the result of how the .NET framework managed the transition from 32 bits to 64 bits. (Note, too, that 2 gigabytes roughly corresponds to the max value of a 32-bit signed int). In newer versions of .NET (after 4.5 according to what I've read but I've never tried it) I guess you can change the max object size to some degree. There are also special classes you can use (e.g. a custom BigArray class) to get around the space limit.

Keep in mind, too, that an array requires that it be able to allocate consecutive memory addresses (that's why you can do constant-time accesses by index - the address is a constant offset from the pointer to the first item, so the framework can figure out the memory location by multiplying the index by 32 or some other constant depending on memory size and adding it to the address in the pointer to the first item to figure out where the item will be). Thus, fragmentation in memory can reduce the effective amount of memory available for arrays.

Community
  • 1
  • 1
0

You need to create helper class which will cache starting positions of each line in entire file.

 int[] cacheLineStartPos;

 public string GetLine (int lineNumber)
 {
     int linePositionInFile = cacheLineStartPos[lineNumber];

     reader.Position = linePositionInFile;

     return reader.ReadLine();
 }

Of course it just an example, and logic can be more complex.

apocalypse
  • 5,764
  • 9
  • 47
  • 95
  • sorry what is this for? i m not sure to understand the purpose – Mélanie Aug 24 '16 at 15:12
  • @Mélanie: Using this approach you will avoid bad code like: this.lines = new string[nbOfLines][]; this.patternedLines = new string[nbOfLines][]; Which consumes a lot of memory. – apocalypse Aug 24 '16 at 16:32
-1

If you need to process a larger amount of data, consider using databases. They were designed for exactly this kind of purpose. You can also query them afterwards with specific request. Probably a key-value store is sufficient for you already. Take a look at https://ravendb.net/ or https://www.mongodb.com/

Alexander Pacha
  • 9,187
  • 3
  • 68
  • 108
-1

You should not have all your lines in memeory even if you need the user to act on all data, something similar to baretail, you have to read the lines from disk, and just for the window which is visible to the user, when he seeks more data, then you will stream more from disk with the same window width, but never for all lines, think of files like 40 GB ... it will not be practical to have all of them loaded. Here is an example for how to do that, and as requested from other memebers, here is the code from the answer mentioned, credit to @James King

//  This really needs to be a member-level variable;
private static readonly object fsLock = new object();

//  Instantiate this in a static constructor or initialize() method
private static FileStream fs = new FileStream("myFile.txt", FileMode.Open);


public string ReadFile(int fileOffset) {

    byte[] buffer = new byte[bufferSize];

    int arrayOffset = 0;

    lock (fsLock) {
        fs.Seek(fileOffset, SeekOrigin.Begin);

        int numBytesRead = fs.Read(bytes, arrayOffset , bufferSize);

        //  Typically used if you're in a loop, reading blocks at a time
        arrayOffset += numBytesRead;
    }

    // Do what you want to the byte array and return it

}
Community
  • 1
  • 1
Hasson
  • 1,894
  • 1
  • 21
  • 25