I am creating a Windows Forms application in which I select a folder that contains multiple *.txt files. Their length may vary from a few thousand lines (kB) up to 50 million lines (1 GB). Every line of a file contains three pieces of information: a date as a long, a location id as an int, and a value as a float, all separated by semicolons (;). I need to calculate the minimum and maximum value across all those files and report which file each one is in, and then find the most frequent value.
I already have these files verified and stored in an ArrayList. I start a thread that reads the files one by one, and I read the data line by line. It works fine, but with 1 GB files I run out of memory. I tried to store the values in a dictionary, where the key would be the date and the value would be an object that contains all the information loaded from the line together with the file name. I see that I cannot use a dictionary, because at about 6M values I run out of memory. So I should probably do it with multiple threads. I thought I could run two threads: one that reads the file and puts the information into some kind of container, and another that reads from that container, performs the calculations and then deletes the values from the container. But I don't know which container could do such a thing. Moreover, I need to calculate the most frequent value, so the values need to be stored somewhere, which leads me back to some kind of dictionary, and I already know I will run out of memory. I don't have much experience with threads either, so I don't know what is possible. Here is my code so far:
GUI:
namespace STI {
    /// <summary>
    /// Main form of the application. Clicking the Run button starts a worker
    /// thread that executes <c>ThreadDataProcessing.runProcessing</c>.
    /// </summary>
    public partial class GUI : Form {
        private string path = null;

        /// <summary>Files to process; enumerated as FileInfo items by the worker.</summary>
        public static ArrayList txtFiles;

        /// <summary>
        /// Builds the designer-generated controls and stores this instance in
        /// <c>_GUI1</c> (declared outside this excerpt), through which the worker
        /// reports progress.
        /// </summary>
        public GUI() {
            InitializeComponent();
            _GUI1 = this;
        }

        /// <summary>
        /// Run button handler: creates a processing worker and starts it on a new thread.
        /// </summary>
        /// <remarks>
        /// A second thread (ThreadDataCalculating.runCalculation) that would consume
        /// the values placed in some container was considered, but it is not started here.
        /// </remarks>
        private void buttonRun_Click(object sender, EventArgs e) {
            var worker = new ThreadDataProcessing();
            var workerThread = new Thread(new ThreadStart(worker.runProcessing));
            workerThread.Start();
        }
    }
}
ThreadProcessing.cs
namespace STI.thread_package {
    /// <summary>
    /// Worker that reads every file listed in <c>GUI.txtFiles</c> line by line and
    /// records, per date key, the first parsed <c>Entry</c> in <see cref="finalMap"/>.
    /// </summary>
    class ThreadDataProcessing {
        /// <summary>
        /// First entry seen for each date, across all processed files.
        /// NOTE(review): this keeps one object per distinct date in memory, so it grows
        /// with the input size; nothing in this class ever removes items from it.
        /// </summary>
        public static Dictionary<long, object> finalMap = new Dictionary<long, object>();

        /// <summary>
        /// Parses each file in <c>GUI.txtFiles</c>. Lines are expected to hold three
        /// semicolon-separated fields (date; location; value); lines with a different
        /// field count are skipped. Progress and errors are reported through
        /// <c>GUI._GUI1.update</c>. A format, overflow or out-of-memory error stops
        /// the current file and processing continues with the next one.
        /// </summary>
        public void runProcessing() {
            foreach (FileInfo file in GUI.txtFiles) {
                // OpenRead asks for read-only access and allows other readers, so
                // read-only or concurrently opened files can still be processed.
                // FileStream buffers internally, so no extra BufferedStream is needed.
                using (FileStream fs = File.OpenRead(file.FullName))
                using (StreamReader sr = new StreamReader(fs)) {
                    try {
                        String line;
                        while ((line = sr.ReadLine()) != null) {
                            String[] splitted = line.Split(';');
                            if (splitted.Length != 3) {
                                continue;
                            }

                            // Parse all three fields with the invariant culture so the
                            // result does not depend on the machine's regional settings.
                            // All fields are parsed even for duplicate dates, so malformed
                            // lines are still detected.
                            long date = long.Parse(splitted[0], CultureInfo.InvariantCulture);
                            int location = int.Parse(splitted[1], CultureInfo.InvariantCulture);
                            float value = float.Parse(splitted[2], CultureInfo.InvariantCulture);

                            // Allocate the Entry only when it will actually be stored.
                            if (!finalMap.ContainsKey(date)) {
                                finalMap.Add(date, new Entry(date, location, value, file.Name));
                            }
                        }
                        GUI._GUI1.update("File \"" + file.Name + "\" completed\n");
                    }
                    catch (FormatException) {
                        GUI._GUI1.update("Wrong file format.");
                    }
                    catch (OverflowException) {
                        // A number outside the long/int/float range is also a malformed file;
                        // left uncaught it would terminate the worker thread.
                        GUI._GUI1.update("Wrong file format.");
                    }
                    catch (OutOfMemoryException) {
                        GUI._GUI1.update("Out of memory");
                    }
                }
            }
        }
    }
}
And here is the object in which I store the values parsed from each line, Entry.cs:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace STI.entities_package {
    /// <summary>
    /// One parsed record: date, location id and value, together with the name of
    /// the file it was created for.
    /// </summary>
    class Entry {
        private readonly long date;
        private readonly int location;
        private readonly float value;
        private readonly String fileName;
        // Set to 1 by the constructor; nothing in this class changes it afterwards.
        private int count;

        /// <summary>Creates an entry with the given fields and a count of 1.</summary>
        /// <param name="date">Date field of the record.</param>
        /// <param name="location">Location id of the record.</param>
        /// <param name="value">Value field of the record.</param>
        /// <param name="fileName">Name of the file associated with the record.</param>
        public Entry(long date, int location, float value, String fileName) {
            this.date = date;
            this.location = location;
            this.value = value;
            this.fileName = fileName;
            this.count = 1;
        }

        /// <summary>Returns the date passed to the constructor.</summary>
        public long getDate() {
            return date;
        }

        /// <summary>Returns the location id passed to the constructor.</summary>
        public int getLocation() {
            return location;
        }

        /// <summary>Returns the value passed to the constructor.</summary>
        public float getValue() {
            return value;
        }

        /// <summary>Returns the file name passed to the constructor.</summary>
        public String getFileName() {
            return fileName;
        }

        /// <summary>Returns the current count (1 after construction).</summary>
        public int getCount() {
            return count;
        }
    }
}