I wrote a process in C# which takes a .tar.gz file and extracts its contents: two large text files, ~250+ MB. These two text files are simply comma separated value files, and they are loaded into a DataTable one at a time. After the first file is loaded into a data table, a SQL bulk copy is executed on that table. Then the second file is loaded into a data table and, again, a SQL bulk copy is executed on that table. This code runs fine on my machine, where resources are plentiful, but when I move the executable over to a batch server it starts throwing out-of-memory exceptions. After analyzing/profiling the execution of the code, I see that it can reach and exceed 11 GB of memory before it finishes executing.
I added some DataTable.Clear() calls followed by GC.Collect(). This helped. After the first table is loaded, and during its SQL bulk copy, memory consumption is ~4 GB; then it drops to nearly 0. When the second file begins loading it consumes ~7 GB of memory, then drops to 0 when its SQL bulk copy is complete. However, I still feel this memory consumption is wickedly high, especially for loading some simple comma separated files into a data table and then into a SQL table. The whole thing takes approximately 5-7 minutes to execute.
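Concretely, the cleanup I do between the two loads is just the following (the same calls appear in the full code further down):

// Drop the rows, dispose the table, and force a collection so the
// next file starts from a (near) empty heap.
dt.Clear();
dt.Dispose();
GC.Collect();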
Here's a little detail on the two files, in case it helps:
The first file is 16 columns wide, with approximately 3 million records.
The second file is 4 columns wide, with approximately 1.3 billion records.
Both files get read into data tables, and the data in each column is cast to its appropriate type: string for strings, int for ints, datetime for dates and times, etc.
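For example, when a row is populated the values get converted along these lines (the column positions here are simplified for illustration; the real indices are in the code below):

DataRow dr = dt.NewRow();
dr[0] = DateTime.Now;                  // load timestamp
dr[1] = Convert.ToInt32(cols[0]);      // an int column
dr[2] = Convert.ToDateTime(cols[1]);   // a datetime column
dr[3] = cols[2];                       // a string column stays a string
dt.Rows.Add(dr);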
How can I go about getting the memory footprint of this app to be much smaller while still executing in a reasonable time?
Here's the logic that loads the data tables and then performs the SQL bulk copies.
string query = "SELECT * FROM " + DESTINATIONSCHEMA + ".KEY_TABLE";
DataTable keyNameLookup = new DataTable();
const int FILEA_COLUMN_COUNT = 16;

// Load the key-name lookup table once, up front.
using (SqlConnection sc = new SqlConnection(CONNECTIONSTRING))
using (SqlCommand cm = new SqlCommand(query, sc)) {
    sc.Open();
    keyNameLookup.Load(cm.ExecuteReader());
    sc.Close();
}
foreach (string file in fileList) {
    if (file.Contains("FILE_A")) {
        DataTable dt = new DataTable(file);
        DataTable dtKeyValues = new DataTable(file + "key_value");

        for (int col = 0; col < 4; col++)
            dtKeyValues.Columns.Add(new DataColumn());
        // FILEA_COLUMN_COUNT + 1: the extra column holds the load timestamp at index 0.
        for (int col = 0; col < FILEA_COLUMN_COUNT + 1; col++)
            dt.Columns.Add(new DataColumn());

        using (FileStream fs = File.Open(Path.Combine(DOWNLOADDESTINATION, file), FileMode.Open, FileAccess.Read, FileShare.Read))
        using (BufferedStream bs = new BufferedStream(fs))
        using (StreamReader sr = new StreamReader(bs)) {
            string line;
            while ((line = sr.ReadLine()) != null) {
                List<KeyValuePair<string, string>> kvPairs = new List<KeyValuePair<string, string>>();
                var cols = line.Split('~');
                DataRow dr = dt.NewRow();
                dr[0] = DateTime.Now;

                for (int cIndex = 0; cIndex < FILEA_COLUMN_COUNT; cIndex++) {
                    if (cIndex >= 0 && cIndex <= 6) {
                        if (cIndex == 5) dr[cIndex + 1] = (cols[cIndex] == "") ? Convert.ToDateTime(loadDate) : Convert.ToDateTime(cols[cIndex]);
                        else dr[cIndex + 1] = cols[cIndex];
                    } else if (cIndex > 6) {
                        if (cIndex == 10) { // key value column
                            dr[cIndex + 1] = cols[cIndex];
                            string[] keyValues = cols[cIndex].Split(',');
                            string[] keyNames = cols[cIndex - 1].Split(',');
                            if (keyValues.Length == keyNames.Length) {
                                for (int i = 0; i < keyNames.Length; i++) {
                                    kvPairs.Add(new KeyValuePair<string, string>(keyNames[i], keyValues[i]));
                                }
                            }
                            // Resolve each key name to its ID via the lookup table and build a key/value row.
                            foreach (KeyValuePair<string, string> kv in kvPairs) {
                                var res = from lookupRows in keyNameLookup.AsEnumerable()
                                          where lookupRows.Field<string>("KEY_NAME") == kv.Key
                                          select lookupRows;
                                DataRow drKv = dtKeyValues.NewRow();
                                drKv[0] = dr[0];
                                drKv[1] = Convert.ToInt32(dr[1]);
                                drKv[2] = res.First().Field<Int16>("KEY_NAME_ID");
                                drKv[3] = kv.Value;
                                dtKeyValues.Rows.Add(drKv);
                            }
                        } else dr[cIndex + 1] = cols[cIndex];
                    }
                }
                dt.Rows.Add(dr);
            }
            // Bulk copy both tables to SQL Server.
            using (SqlConnection sc = new SqlConnection(CONNECTIONSTRING)) {
                sc.Open();
                using (SqlBulkCopy sbc = new SqlBulkCopy(sc)) {
                    sbc.BulkCopyTimeout = 6000;
                    sbc.DestinationTableName = DESTINATIONSCHEMA + ".MAIN";
                    try {
                        // Max string length per column (the result is not used anywhere).
                        List<int> maxLengths =
                            Enumerable.Range(0, dt.Columns.Count)
                                      .Select(col => dt.AsEnumerable()
                                                       .Select(row => row[col]).OfType<string>()
                                                       .Max(val => val.Length)).ToList();
                        sbc.WriteToServer(dt);
                    } catch (Exception e) {
                        Console.WriteLine("Error writing datatable to " + sbc.DestinationTableName + " table in database.");
                        Console.WriteLine("\t" + e);
                    }

                    sbc.DestinationTableName = DESTINATIONSCHEMA + ".KEY_VALUE_TABLE";
                    try {
                        sbc.WriteToServer(dtKeyValues);
                    } catch (Exception e) {
                        Console.WriteLine("Error writing datatable to " + sbc.DestinationTableName + " table in database.");
                        Console.WriteLine("\t" + e);
                    }
                }
            }
        }

        // Release the tables and force a collection before moving on to the next file.
        dt.Clear();
        dt.Dispose();
        dtKeyValues.Clear();
        dtKeyValues.Dispose();
        GC.Collect();
    } else if (file.Contains("FILE_B")) {
        const int FILEB_COLUMN_COUNT = 4;
        DataTable dt = new DataTable(file);
        // <= so the table also gets the load-timestamp column at index 0.
        for (int i = 0; i <= FILEB_COLUMN_COUNT; i++)
            dt.Columns.Add(new DataColumn());

        StreamReader sr = new StreamReader(Path.Combine(DOWNLOADDESTINATION, file));
        string line = string.Empty;
        while ((line = sr.ReadLine()) != null) {
            var cols = line.Split('~');
            DataRow dr = dt.NewRow();
            dr[0] = DateTime.Now;
            for (int cIndex = 0; cIndex < FILEB_COLUMN_COUNT; cIndex++) {
                if (cIndex == 0) dr[cIndex + 1] = Convert.ToInt32(cols[cIndex]);
                else if (cIndex == 1) dr[cIndex + 1] = Convert.ToInt32(cols[cIndex]);
                else dr[cIndex + 1] = cols[cIndex];
            }
            dt.Rows.Add(dr);
        }
        sr.Close();

        using (SqlConnection sc = new SqlConnection(CONNECTIONSTRING)) {
            sc.Open();
            using (SqlBulkCopy sbc = new SqlBulkCopy(sc)) {
                sbc.BulkCopyTimeout = 6000;
                sbc.DestinationTableName = DESTINATIONSCHEMA + ".AUX_TABLE";
                try {
                    sbc.WriteToServer(dt);
                } catch (Exception e) {
                    Console.WriteLine("Error writing datatable to table in database.");
                    Console.WriteLine("\t" + e);
                }
            }
        }

        dt.Clear();
        dt.Dispose();
        GC.Collect();
    }
}
return true;