I'm using a dictionary to count how often each element occurs in a string array. But once the array reaches around 50 million elements, my program's RAM usage climbs to 8-9 GB, which is far higher than I expected.
My dictionary is a Dictionary<string, int>, so 50 million key-value pairs (assuming no duplicate keys) should only cost me around 2-2.5 GB.
I wonder where I went wrong.
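Here is the back-of-the-envelope arithmetic behind my 2-2.5 GB estimate (assuming short keys on a 64-bit runtime; the per-object constants are rough approximations, not exact CLR numbers):

    // Back-of-the-envelope estimate; the constants are rough approximations.
    const long pairs = 50_000_000;

    // Assume short keys: a .NET string costs roughly 26 bytes of overhead
    // plus 2 bytes per character, and each Dictionary<string, int> entry adds
    // roughly 24 bytes (hash code, next index, key reference, value) plus a bucket slot.
    const int assumedBytesPerPair = 50;

    double gigabytes = pairs * (double)assumedBytesPerPair / (1024.0 * 1024 * 1024);
    Console.WriteLine($"~{gigabytes:F1} GB"); // prints "~2.3 GB" with these assumptions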
The part that builds the frequency table:
public IEnumerable<string> GetTopTenStrings(string path)
{
    // Dictionary to store the result
    Dictionary<string, int> result = new Dictionary<string, int>();

    var txtFiles = Directory.EnumerateFiles(path, "*.dat");
    int i = 1;

    foreach (string currentFile in txtFiles)
    {
        using (FileStream fs = File.Open(currentFile, FileMode.Open,
            FileAccess.Read, FileShare.Read))
        using (BufferedStream bs = new BufferedStream(fs))
        using (StreamReader buffer = new StreamReader(bs))
        {
            Console.WriteLine("Now we are at the file {0}", i);

            // Read the whole file into memory and count its elements
            string storage = buffer.ReadToEnd();
            Process(result, storage);
            i++;
        }
    }

    // Sort by frequency and materialize the top ten entries
    // (ToList avoids re-running the sort on the second enumeration)
    var final = result.OrderByDescending(x => x.Value).Take(10).ToList();
    foreach (var element in final)
    {
        Console.WriteLine("{0} appears {1}", element.Key, element.Value);
    }

    var test = final.Select(x => x.Key).ToList();
    Console.WriteLine('\n');
    return test;
}
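For context, the method is invoked with the directory that holds the .dat files (the path below is just a placeholder, not my real directory):

    var topTen = GetTopTenStrings(@"C:\data"); // placeholder path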
The function that adds the keys and values to the dictionary:
public void Process(Dictionary<string, int> dict, string storage)
{
    string[] lines = storage.Split(';');
    foreach (var item in lines)
    {
        // Skip empty or whitespace-only entries without allocating a trimmed copy
        if (!string.IsNullOrWhiteSpace(item))
        {
            // Normalize the key once instead of calling ToLower() repeatedly
            string key = item.ToLower();
            if (dict.ContainsKey(key))
            {
                dict[key]++;
            }
            else
            {
                dict.Add(key, 1);
            }
        }
    }
}
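For what it's worth, I know the ContainsKey-then-index pattern hashes the key more than once; a variant using TryGetValue (the method name below is just illustrative) would look like this, though I doubt that explains gigabytes of overhead:

    public void ProcessWithTryGetValue(Dictionary<string, int> dict, string storage)
    {
        foreach (var item in storage.Split(';'))
        {
            if (string.IsNullOrWhiteSpace(item)) continue;

            string key = item.ToLower();

            // One lookup to read the current count and one to write it back,
            // instead of ContainsKey + read + write for existing keys
            dict.TryGetValue(key, out int count);
            dict[key] = count + 1;
        }
    }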