
*Update: applied the suggestions, but it's still slow; processing started at 7 days and is now down to 2.5 days. The DataTableAdapter access is taking a huge amount of time.*

I'm a newbie but have researched Stack Overflow intensively; even so, I couldn't find any answer that fits my problem.

I have 80 files, each with about 200,000 lines, loosely structured by 'tags' that mark sections.

I'm able to walk through each file, line by line. I replaced an if-else chain with a switch-case (which improved performance, thanks to the Stack Overflow forum) and moved the intensive work onto another thread (again thanks to Stack Overflow users).

Even so, I'm getting 95 minutes per file, which adds up to about 2.5 days of text processing, and when deployed the GUI hangs (in debug it's okay).

The txt files follow this pattern, with a variable number of lines:

BARR; --> that's the first tag

184071; ALAMEDOS ; 518042,100; 922453,700; --> that's the valid information I want

The tags (each on its own full line in the txt) are: SE; -> CKT; -> BARR; -> TRECH; -> CAP; -> INST; -> KEY; -> REG; -> ET;xxxx; -> EP;xxxx; -> DMD; -- but some tags can be skipped without notice, which is why I'm testing line by line.

My problems:

- 2.5 days of intensive processing (critical);
- hanging GUI after deployment (not that bad, could solve later).

(thanks in advance!)

My WinForms click action, which starts the BackgroundWorker that runs the intensive stuff (I tried to trim it because it's lengthy):

private void Button_extract_element_Click(object sender, EventArgs e)
 {
     TestObject test = new TestObject();
     test._shouldstop = true;
     backgroundWorker1.RunWorkerAsync(test);
     int passes = 0;

     Label_extract_element.Text = "wait processing....";
     Label_extract_element.Refresh();
     Label_extract_element.Update();

     //this should keep winform waiting for thread-return, showing passes
     while (test._shouldstop)
     {
        passes++;
        Label_extract_element.Text = "wait processing...." + passes;
        Label_extract_element.Refresh();
        Label_extract_element.Update();
     }
     Label_extract_element.Text = " OK, done!";
     Label_extract_element.Refresh();
     Label_extract_element.Update();
 } //End of Button_extract_element_Click

 class TestObject
    {
    public bool _shouldstop { get; set; }
    }   

 //backgroundWorker complete actions 
 private void backgroundWorker1_RunWorkerCompleted(object sender, RunWorkerCompletedEventArgs e)
    {
        // Receive the result from DoWork, and display it.
        TestObject test = e.Result as TestObject;
    }

 private void backgroundWorker1_DoWork(object sender, DoWorkEventArgs e)
 {
     TestObject argumentTest = e.Argument as TestObject;
     argumentTest._shouldstop = true;
     string loop = "";
     string[] ListOfFilesinDir = Directory.GetFiles(GlobalVariables.folder, "*.txt").Select(Path.GetFileName).ToArray();

     foreach (string filename in ListOfFilesinDir)
     {
        int count_barr = 0;
        int count_lines = 0;
        //ReadAll seems to process really fast - not a gap
        string[] FLines = File.ReadAllLines(GlobalVariables.folder + "\\" + filename);

        int[] line_barr = new int[FLines.Count()];

        foreach (string Lines in FLines)
        {
        count_lines++;
        switch (Lines)
        {
           case "SE;":
           GlobalVariables.SEstr = FLines[count_lines].Split(';')[3].Trim();
           break;

           case "CKT;":
           GlobalVariables.codCktAL = FLines[count_lines].Split(';')[2].Trim();
           GlobalVariables.nomeCktAL = FLines[count_lines].Split(';')[10].Trim();
           GlobalVariables.nomeArqv = filename;
           break;

           case "BARR;": loop = "BARR;"; break;
           case "TRECH;": loop = "TRECH;"; break;
           case "CAP;": loop = "CAP;"; break;
           case "INST;": loop = "INST;"; break;
           case "KEY;": loop = "KEY;"; break;
           case "REG;": loop = "REG;"; break;
           case "DMD;": 
              loop = "DMD;"; 
              GlobalVariables.TRAFO = (FLines[count_lines-8].Split(';')[1].Trim());
              break;
        }

        switch (loop)
        {
           // I'll post just the first case, so I dont go soooo long in this post..
           //This part seems to process really fast

           case "BARR;":
              GlobalVariables.parse_results = "";

              //take next line and test if is one of the nexts TAGs, and break loop:
              GlobalVariables.parse_results += FLines[count_lines];

              if (Equals(GlobalVariables.parse_results, "TRECH;") || Equals(GlobalVariables.parse_results, "CAP;") || Equals(GlobalVariables.parse_results, "INST;") || Equals(GlobalVariables.parse_results, "KEY;") || Equals(GlobalVariables.parse_results, "REG;") || Equals(GlobalVariables.parse_results.Split(';')[0], "ET") || Equals(GlobalVariables.parse_results.Split(';')[0], "EP"))
              {
                 GlobalVariables.parse_results = "";
                 loop = "";
                 break;
              }
              else  //initiates the extraction BARR just have 4 field in txt
              {
                 //save the number of the line to array for later reference
                 count_barr++;
                 line_barr[count_barr] = count_lines;
                 break;
              }
              case "TRECH;": /*repeat all in BARR loop for TRECH's 20 fields*/ break;
              case "CAP;": /*same repeat for different n fields*/ break;
              case "INST;": /*same repeat for different n fields*/ break;
              case "KEY;": /*same repeat for different n fields*/ break;
              case "REG;": /*same repeat for different n fields*/ break;
        } //end of switch
     } //end for each lines

     //Now the TAKING TIME: saving to database - take the line number reference stored

     for (int i = 1; i < (count_barr+1); i++)
     {
        double p0 = Convert.ToDouble(FLines[line_barr[i]].Split(';')[0].Trim());
        string p1 = FLines[line_barr[i]].Split(';')[1].Trim().ToString();
        double p2 = Convert.ToDouble(FLines[line_barr[i]].Split(';')[2].Trim());
        double p3 = Convert.ToDouble(FLines[line_barr[i]].Split(';')[3].Trim());
        barr_MTTableAdapter1.GRAVA(p0, p1, p2 , p3, GlobalVariables.SEstr, GlobalVariables.codCktAL, GlobalVariables.nomeCktAL, GlobalVariables.nomeArqv);
     } 
argumentTest._shouldstop = false;
e.Result = argumentTest;
}
edgirardi
  • `File.ReadLines(GlobalVariables.folder + "\\" + filename).Skip(count_lines).Take(1).First();` you are rereading the *whole* file *each* iteration *twice* just to read the next line sequentially? Well no wonder that takes ages. Rewriting that to the usual naive version would probably run in a few seconds and just memory mapping the files and other optimizations would probably bring it down to less than one. – Voo Feb 21 '14 at 15:17
  • Your `while (test._shouldstop)` is probably causing the form to hang...don't use a "holding" loop like that in the main UI thread! As Voo pointed out, reading the entire file multiple times is killing your processing time. Read the entire file ONCE, and then access it multiple times. With files that large, however, you probably shouldn't be reading the entire thing at once anyways. Read it line by line and process it that way instead... – Idle_Mind Feb 21 '14 at 15:24
  • Thanks for the UI tip, I will remove the while-loop. For the ReadLines issue, I found in this post: http://stackoverflow.com/questions/21873613/read-last-30-000-lines-of-a-file that ReadLines is faster than ReadAllLines for larger files. From Voo's comment, I just saw that I'm doing the same thing twice in the if-else statement... noob here, will fix and measure performance – edgirardi Feb 21 '14 at 15:29
  • @user3337371 On the 1st iteration you read one line, on the 2nd iteration you read 2 lines, on the kth iteration you read k lines of the file. Basically instead of reading n lines, you are reading `n*(n+1)/2` lines. For 200k lines per file this means you are reading not 200k lines but instead about 20 **billion** lines. The difference between using ReadLines or ReadAllLines to read a file once is close to negligible for your small files. – Voo Feb 21 '14 at 15:39
  • So yes I'll take every bet that reading the file only once will finish in maybe a second or two which will be more than fine. I've done similar things in about 200ms with memory mapped files. – Voo Feb 21 '14 at 15:44
  • OK, I just replaced the code (edited in the original post), now using string[] and ReadAllLines instead of ReadLines. There is a performance gain (an 11-minute reduction per file, which gives me 0.6 days). Still, is this the best one could do to extract a txt file? Or am I missing the point that Voo tried to explain? – edgirardi Feb 21 '14 at 17:27
  • 84 minutes for this code and such a small file? That doesn't sound right. Make sure the DB code is using one large transaction and you're not sending each line separately around. But you really should do some profiling to see what's going on (you again do the split unnecessarily way more often than necessary but there has to be something more off to explain such a long processing time). – Voo Feb 21 '14 at 18:51
  • Definitely look at the database calls. But...reading the entire file into memory at once could still be the bottleneck, especially if resources are pinched and the swap file gets used. It just depends on your system resources. Reading only one line at a time with a StreamReader may be faster...you'd have to test it. – Idle_Mind Feb 21 '14 at 23:36
  • Taking the suggestions, performance improved; the modified code is updated above. It takes seconds to reach the 'DataAdapter' part, where I had to write a loop. So bulk-inserting data into Access seems to take most of the time. I found this post about DAO that may improve the performance of writing to Access; will test and update everybody: http://stackoverflow.com/questions/7070011/writing-large-number-of-records-bulk-insert-to-access-in-net-c – edgirardi Feb 25 '14 at 11:46
  • I do inserts into a table on a 5-year-old server now at around 100,000 lines per second. An approach involving hand-written SQL is bad in that area - SqlBulkCopy is your friend, into a temp table with a final copy over, and that in multiple threads. You think you are fast, but those files should take a low number of minutes to process. Your speeds would be slow in 1990. This is 2013. – TomTom Feb 25 '14 at 21:41
  • Different universe: I'm using an Access database instead of SQL Server or Oracle; there is no bulk copy for that. – edgirardi Feb 26 '14 at 14:42
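
Following Idle_Mind's comment about the holding loop, a minimal sketch of a non-blocking version of the click handler (names such as Label_extract_element and backgroundWorker1 are from the question; it assumes the ProgressChanged handler is wired up in the designer and that DoWork calls ReportProgress once per file):

    private void Button_extract_element_Click(object sender, EventArgs e)
    {
        // No holding loop on the UI thread; the worker reports back instead.
        backgroundWorker1.WorkerReportsProgress = true;
        Label_extract_element.Text = "wait processing....";
        backgroundWorker1.RunWorkerAsync(new TestObject());
    }

    private void backgroundWorker1_ProgressChanged(object sender, ProgressChangedEventArgs e)
    {
        // Runs on the UI thread; called from DoWork via backgroundWorker1.ReportProgress(filesDone).
        Label_extract_element.Text = "wait processing...." + e.ProgressPercentage;
    }

    private void backgroundWorker1_RunWorkerCompleted(object sender, RunWorkerCompletedEventArgs e)
    {
        Label_extract_element.Text = " OK, done!";
    }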

2 Answers


Your answer is still problematic. Use the following example to avoid reading all lines into memory:

string line;
int counter = 0;
using (System.IO.StreamReader file = new System.IO.StreamReader("c:\\test.txt"))
{
   while ((line = file.ReadLine()) != null)
   {
      Console.WriteLine(line);
      counter++;
   }
}

There is no need for a tiny file like yours to take that long. I process files with about half a billion events (granted, those are binary coded, but way more than your 200,000 lines) in minutes. You're wasting lots of time by doing things like allocating an array of all the lines instead of reading the files line by line.
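
Applied to the tag format from the question, a rough sketch of that line-by-line approach (GlobalVariables.folder, filename and the tag names are taken from the question; the rest is an assumption, not a drop-in replacement):

    // Stream the file once and collect only the data lines that follow a BARR; tag.
    // Assumes using System.IO; and using System.Collections.Generic;
    var tags = new HashSet<string> { "SE;", "CKT;", "BARR;", "TRECH;", "CAP;", "INST;", "KEY;", "REG;", "DMD;" };
    var barrLines = new List<string>();
    string currentTag = "";

    using (var reader = new StreamReader(Path.Combine(GlobalVariables.folder, filename)))
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            if (tags.Contains(line) || line.StartsWith("ET;") || line.StartsWith("EP;"))
                currentTag = line;            // a tag line switches the current section
            else if (currentTag == "BARR;")
                barrLines.Add(line);          // a data line inside the BARR; section
        }
    }
    // barrLines now holds every BARR record for this file; write them to the database in one batch afterwards.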

TomTom
  • Can't vote, but this would get a (-1) as it doesn't help. ReadAllLines, StreamReader, or any other read method takes only 0.2 seconds. The time problem is related to writing into the Access database. – edgirardi Feb 26 '14 at 15:06

As stated in the question, this answer only applies to an MS Access database; if you use Oracle or SQL Server, just use a bulk load.

Well, after a lot of contributions (see the comments above, especially from Voo) and a lot of Stack Overflow research, I was able to improve the performance from 7 days to 45 minutes for processing a total of 16 million lines, line by line.

The key, after the well-oriented tips from people in the comments, was to use DAO (with some caveats about deploying the database with ClickOnce - notice the dbName connection string).

A lot of useful information can be found here: StackOverflow-Writing-large-records

If you use an .accdb file, you need to change the ADO using directive to: `using DAO = Microsoft.Office.Interop.Access.Dao;`
(In Visual Studio, under Add Reference -> COM type references, you need to add the Microsoft Office xx.x Access Database Engine Object Library - but remember that this imposes a significant requirement on your end-user's machine.)

I noticed some improvements are still needed to store all the sections in DAO (BARR, TRECH, and so on), but that's code optimization, not the main issue in this post.

I don't know why .NET doesn't provide a bulk insert for MS Access.

The code below takes, per file, 0.3 seconds to run the switch statements and 1.33 minutes for the DAO saving; doing this for all 80 files takes 45 minutes.

 private void Button_extract_element_Click(object sender, EventArgs e)
 {
 TestObject test = new TestObject();
 test._shouldstop = true;
 backgroundWorker1.RunWorkerAsync(test);
 int passes = 0;

 Label_extract_element.Text = "wait processing....";
 Label_extract_element.Refresh();
 Label_extract_element.Update();

 //this should keep winform waiting for thread-return, showing passes
 while (test._shouldstop)
 {
    passes++;
    Label_extract_element.Text = "wait processing...." + passes;
    Label_extract_element.Refresh();
    Label_extract_element.Update();
 }
 Label_extract_element.Text = " OK, done!";
 Label_extract_element.Refresh();
 Label_extract_element.Update();
 } //End of Button_extract_element_Click

 class TestObject
 {
 public bool _shouldstop { get; set; }
 }   

 //backgroundWorker complete actions 
 private void backgroundWorker1_RunWorkerCompleted(object sender, RunWorkerCompletedEventArgs e)
 {
    // Receive the result from DoWork, and display it.
    TestObject test = e.Result as TestObject;
 }

 private void backgroundWorker1_DoWork(object sender, DoWorkEventArgs e)
 {
 TestObject argumentTest = e.Argument as TestObject;
 argumentTest._shouldstop = true;
 string loop = "";
 string[] ListOfFilesinDir = Directory.GetFiles(GlobalVariables.folder, "*.txt").Select(Path.GetFileName).ToArray();

 foreach (string filename in ListOfFilesinDir)
 {
    int count_barr = 0;
    int count_lines = 0;
    //ReadAll seems to process really fast - not a gap
    string[] FLines = File.ReadAllLines(GlobalVariables.folder + "\\" + filename);

    int[] line_barr = new int[FLines.Count()];

    foreach (string Lines in FLines)
    {
    count_lines++;
    switch (Lines)
    {
       case "SE;":
       GlobalVariables.SEstr = FLines[count_lines].Split(';')[3].Trim();
       break;

       case "CKT;":
       GlobalVariables.codCktAL = FLines[count_lines].Split(';')[2].Trim();
       GlobalVariables.nomeCktAL = FLines[count_lines].Split(';')[10].Trim();
       GlobalVariables.nomeArqv = filename;
       break;

       case "BARR;": loop = "BARR;"; break;
       case "TRECH;": loop = "TRECH;"; break;
       case "CAP;": loop = "CAP;"; break;
       case "INST;": loop = "INST;"; break;
       case "KEY;": loop = "KEY;"; break;
       case "REG;": loop = "REG;"; break;
       case "DMD;": 
          loop = "DMD;"; 
          GlobalVariables.TRAFO = (FLines[count_lines-8].Split(';')[1].Trim());
          break;
    }

    switch (loop)
    {
       // I'll post just the first case, so I dont go soooo long in this post..
       //This part seems to process really fast

       case "BARR;":
          GlobalVariables.parse_results = "";

          //take next line and test if is one of the nexts TAGs, and break loop:
          GlobalVariables.parse_results += FLines[count_lines];

          if (Equals(GlobalVariables.parse_results, "TRECH;") || Equals(GlobalVariables.parse_results, "CAP;") || Equals(GlobalVariables.parse_results, "INST;") || Equals(GlobalVariables.parse_results, "KEY;") || Equals(GlobalVariables.parse_results, "REG;") || Equals(GlobalVariables.parse_results.Split(';')[0], "ET") || Equals(GlobalVariables.parse_results.Split(';')[0], "EP"))
          {
             GlobalVariables.parse_results = "";
             loop = "";
             break;
          }
          else  
          {
             //store the number of the line to array for later reference
             count_barr++;
             line_barr[count_barr] = count_lines;
             break;
          }
          case "TRECH;": /*repeat all in BARR loop for TRECH's 20 fields*/ break;
          case "CAP;": /*same repeat for different n fields*/ break;
          case "INST;": /*same repeat for different n fields*/ break;
          case "KEY;": /*same repeat for different n fields*/ break;
          case "REG;": /*same repeat for different n fields*/ break;
    } //end of switch
 } //end for each lines

string dbName = Application.StartupPath + "\\Resources";
DAO.DBEngine dbEngine = new DAO.DBEngine();
DAO.Database db = dbEngine.OpenDatabase(dbName+"\\DataBase.accdb");

// From here, could work more to store different Tables with different fields, dynamically, improving code

DAO.Recordset rs = db.OpenRecordset("BARRA_MT");   

for (int i = 1; i < (count_barr+1); i++)
{
   rs.AddNew();
   double b0 = Convert.ToDouble(FLines[line_barr[i]].Split(';')[0].Trim());
   string b1 = FLines[line_barr[i]].Split(';')[1].Trim().ToString();
   double b2 = Convert.ToDouble(FLines[line_barr[i]].Split(';')[2].Trim());
   double b3 = Convert.ToDouble(FLines[line_barr[i]].Split(';')[3].Trim());
   rs.Fields["BARR_MT"].Value = b0;
   rs.Fields["COD"].Value = b1;
   rs.Fields["X"].Value = b2;
   rs.Fields["Y"].Value = b3;
   rs.Update();
}
rs.Close();     
db.Close();

argumentTest._shouldstop = false;
e.Result = argumentTest;
} //end
edgirardi
  • Still crappy. There is no need to read all the lines. You could - and should, as was recommended - iterate over the lines WHILE THEY ARE READ. – TomTom Feb 25 '14 at 21:37
  • As seen in the previous comments, ReadAllLines versus reading each line (I tried both) makes a difference of seconds, and the problem was hours. SQL bulk copy is okay for Oracle or SQL Server, which, as stated, doesn't apply here. The read part takes 0.2 seconds, while saving with DAO takes the rest of the time (disk access). – edgirardi Feb 26 '14 at 15:04
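
For completeness: Voo's point about using one large transaction can also be applied to Access without DAO, through plain OleDb. A hedged sketch reusing the names from the answer above (the ACE connection string and the OleDbType choices are assumptions; it assumes using System.Data.OleDb;):

    // Parameterized inserts into Access, committed as a single transaction per file.
    using (var conn = new OleDbConnection(
        "Provider=Microsoft.ACE.OLEDB.12.0;Data Source=" + dbName + "\\DataBase.accdb"))
    {
        conn.Open();
        using (var tx = conn.BeginTransaction())
        using (var cmd = new OleDbCommand(
            "INSERT INTO BARRA_MT (BARR_MT, COD, X, Y) VALUES (?, ?, ?, ?)", conn, tx))
        {
            cmd.Parameters.Add("p0", OleDbType.Double);
            cmd.Parameters.Add("p1", OleDbType.VarWChar);
            cmd.Parameters.Add("p2", OleDbType.Double);
            cmd.Parameters.Add("p3", OleDbType.Double);

            for (int i = 1; i < count_barr + 1; i++)
            {
                string[] parts = FLines[line_barr[i]].Split(';');
                cmd.Parameters[0].Value = Convert.ToDouble(parts[0].Trim());
                cmd.Parameters[1].Value = parts[1].Trim();
                cmd.Parameters[2].Value = Convert.ToDouble(parts[2].Trim());
                cmd.Parameters[3].Value = Convert.ToDouble(parts[3].Trim());
                cmd.ExecuteNonQuery();
            }
            tx.Commit();   // one commit per file instead of one disk write per row
        }
    }

Whether this beats DAO for Access is something you'd have to measure; the point is the single transaction and the reused, parameterized command.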