I want to process some files with maximum throughput. The file paths are stored in a database, so I need to fetch the paths from the database, set each file's status to processing, process the file, then set its status to either completed or failed.

Currently, I fetch the files in batches of 100 to reduce the number of queries, and I process each batch in parallel with a degree of parallelism of 10. But this way I lose throughput towards the end of each batch: once fewer than 10 files remain in the batch, the effective degree of parallelism drops below 10, and no file from the next batch can start until the current batch finishes entirely.
Here is what I have:
private async Task CopyPendingFilesAsync(SourcePath sourcePath, Options options)
{
    // destinationBase comes from class state (omitted here for brevity)
    var batchIndex = 0;
    while (true)
    {
        var fileBatch = _sourceFileService.GetSourceFileBatchBySourcePathId(
            sourcePath.Id, _dataSourceExportConfig.FileCopyBatchSize, Status.Pending);
        if (fileBatch.Count == 0)
            return;

        await SetInProgressStatusForBatch(fileBatch)
            .ConfigureAwait(false);

        fileBatch
            .AsParallel()
            .WithDegreeOfParallelism(_dataSourceExportConfig.FileCopyDegreeOfParallelism)
            .ForAll(file => ProcessFile(file, destinationBase, options));

        await _sourceFileService
            .UpdateSourceFilesStatusAsync(fileBatch)
            .ConfigureAwait(false);

        batchIndex++;
    }
}
private async Task SetInProgressStatusForBatch(IEnumerable<SourceFile> fileBatch)
{
    foreach (var file in fileBatch)
        file.Status = Status.InProgress;

    await _sourceFileService
        .UpdateSourceFilesStatusAsync(fileBatch)
        .ConfigureAwait(false);
}
private void ProcessFile(
    SourceFile file,
    string destinationBase,
    Options options)
{
    try
    {
        //do something ...
        file.Status = Status.Success;
        file.ExceptionMessage = null;
    }
    catch (Exception ex)
    {
        _logger.Error(ex);
        file.Status = Status.Failed;
        file.ExceptionMessage = ex.Message;
    }
}
How can I maximize the throughput? I read about the producer-consumer pattern with BlockingCollection, TPL Dataflow, and Rx, and I am pretty sure that what I want to achieve can be implemented with any of them, but I haven't managed it so far. With the producer-consumer pattern my producer was extremely fast compared to the consumers, with TPL Dataflow I got stuck at the BatchBlock, and I haven't tried Rx yet. Could someone please point me in the right direction? (Rough sketches of my Dataflow and BlockingCollection attempts are below and at the end of the post.)
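This is approximately the shape I was aiming for with TPL Dataflow before I got stuck. Treat it as a sketch rather than working code from my project: the block wiring, the bounded capacity, and feeding UpdateSourceFilesStatusAsync from an ActionBlock are my assumptions about how the pieces should fit together.

// Sketch only: needs the System.Threading.Tasks.Dataflow NuGet package
// and using System.Threading.Tasks.Dataflow;
private async Task CopyPendingFilesWithDataflowAsync(SourcePath sourcePath, Options options)
{
    var processBlock = new TransformBlock<SourceFile, SourceFile>(
        file =>
        {
            ProcessFile(file, destinationBase, options); // sets Status / ExceptionMessage
            return file;
        },
        new ExecutionDataflowBlockOptions
        {
            MaxDegreeOfParallelism = _dataSourceExportConfig.FileCopyDegreeOfParallelism,
            // the bound is what stops a fast producer from flooding the pipeline
            BoundedCapacity = _dataSourceExportConfig.FileCopyDegreeOfParallelism * 2
        });

    // re-group processed files so the status update stays a bulk operation
    var batchBlock = new BatchBlock<SourceFile>(_dataSourceExportConfig.FileCopyBatchSize);

    var updateBlock = new ActionBlock<SourceFile[]>(
        batch => _sourceFileService.UpdateSourceFilesStatusAsync(batch));

    processBlock.LinkTo(batchBlock, new DataflowLinkOptions { PropagateCompletion = true });
    batchBlock.LinkTo(updateBlock, new DataflowLinkOptions { PropagateCompletion = true });

    // producer: SendAsync waits while the bounded capacity is reached,
    // so the workers stay saturated across batch boundaries
    while (true)
    {
        var fileBatch = _sourceFileService.GetSourceFileBatchBySourcePathId(
            sourcePath.Id, _dataSourceExportConfig.FileCopyBatchSize, Status.Pending);
        if (fileBatch.Count == 0)
            break;

        await SetInProgressStatusForBatch(fileBatch).ConfigureAwait(false);

        foreach (var file in fileBatch)
            await processBlock.SendAsync(file).ConfigureAwait(false);
    }

    processBlock.Complete();
    await updateBlock.Completion.ConfigureAwait(false);
}

The idea was that the TransformBlock keeps all workers busy across batch boundaries while the BatchBlock re-groups the processed files so the status update remains a bulk operation, with PropagateCompletion flushing a final, smaller batch when the producer completes.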
Update: Here is a minimal, complete and verifiable example:
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Threading;

namespace ConsoleApp1
{
    internal static class Program
    {
        private static void Main()
        {
            Console.WriteLine("Processing files");

            var stopWatch = new Stopwatch();
            stopWatch.Start();

            var fileService = new FileService();
            fileService.ProcessPendingFiles();

            foreach (var sourceFile in fileService.SourceFiles)
            {
                Console.WriteLine($"{sourceFile.Id} {sourceFile.Status}");
            }

            Console.WriteLine(stopWatch.Elapsed);
            Console.ReadLine();
        }
    }
    public class FileService
    {
        private const int BatchSize = 100;
        private const int DegreeOfParallelism = 10;

        //this SourceFiles property replaces the SQLite database where the data is actually stored
        public ICollection<SourceFile> SourceFiles { get; } =
            Enumerable
                .Range(0, 1000)
                .Select(i =>
                    new SourceFile
                    {
                        Id = i,
                        Path = "source file path",
                        Status = Status.Pending,
                    })
                .ToList();

        public void ProcessPendingFiles()
        {
            while (true)
            {
                var fileBatch = GetSourceFileBatch(BatchSize, Status.Pending);
                if (fileBatch.Count == 0)
                    return;

                SetInProgressStatusForBatch(fileBatch);

                fileBatch
                    .AsParallel()
                    .WithDegreeOfParallelism(DegreeOfParallelism)
                    .ForAll(ProcessFile);

                UpdateSourceFiles(fileBatch);
            }
        }

        private ICollection<SourceFile> GetSourceFileBatch(int batchSize, Status status)
            => SourceFiles
                .Where(sf => sf.Status == status)
                .Take(batchSize)
                .ToList();

        //set status to in progress for all files in the batch
        //and save the changes to the database
        //in the application this is actually done with a bulk update and the method is async
        private void SetInProgressStatusForBatch(IEnumerable<SourceFile> fileBatch)
        {
            foreach (var file in fileBatch)
            {
                file.Status = Status.InProgress;

                var sourceFile = SourceFiles.First(sf => sf.Id == file.Id);
                sourceFile.Status = file.Status;
            }
        }

        //set status and exception messages for all files in the batch
        //and save the changes to the database
        //in the application this is actually done with a bulk update and the method is async
        private void UpdateSourceFiles(IEnumerable<SourceFile> fileBatch)
        {
            foreach (var file in fileBatch)
            {
                var sourceFile = SourceFiles.First(sf => sf.Id == file.Id);
                sourceFile.Status = file.Status;
                sourceFile.ExceptionMessage = file.ExceptionMessage;
            }
        }

        private void ProcessFile(SourceFile file)
        {
            try
            {
                //do something ...
                Thread.Sleep(20);
                file.Status = Status.Success;
                file.ExceptionMessage = null;
            }
            catch (Exception ex)
            {
                file.Status = Status.Failed;
                file.ExceptionMessage = ex.Message;
            }
        }
    }
    public class SourceFile
    {
        public int Id { get; set; }
        public string Path { get; set; }
        public Status Status { get; set; }
        public string ExceptionMessage { get; set; }
    }

    public enum Status
    {
        Pending,
        InProgress,
        Success,
        Failed,
    }
}
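And this is roughly how my BlockingCollection attempt looked, adapted as a method of the FileService above (a sketch; the method name and the single final bulk update are simplifications I made for this post). Without the bounded capacity, the producer fills the collection far faster than the ten consumers can drain it:

// Sketch only: add using System.Collections.Concurrent; and
// using System.Threading.Tasks; to the usings above.
public void ProcessPendingFilesWithBlockingCollection()
{
    // the bound is the only thing that slows the producer down
    var pendingFiles = new BlockingCollection<SourceFile>(boundedCapacity: BatchSize);
    var dispatched = new List<SourceFile>();

    // consumers stay busy across batch boundaries: GetConsumingEnumerable
    // only ends after CompleteAdding is called and the collection drains
    var consumers = Enumerable
        .Range(0, DegreeOfParallelism)
        .Select(_ => Task.Run(() =>
        {
            foreach (var file in pendingFiles.GetConsumingEnumerable())
                ProcessFile(file);
        }))
        .ToArray();

    while (true)
    {
        var fileBatch = GetSourceFileBatch(BatchSize, Status.Pending);
        if (fileBatch.Count == 0)
            break;

        SetInProgressStatusForBatch(fileBatch);
        dispatched.AddRange(fileBatch);

        foreach (var file in fileBatch)
            pendingFiles.Add(file); // blocks while the collection is full
    }

    pendingFiles.CompleteAdding();
    Task.WaitAll(consumers);

    // one final bulk status update instead of one per batch
    UpdateSourceFiles(dispatched);
}

Here the consumers never idle between batches, but I still don't know whether this or the Dataflow pipeline is the right direction.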