
This app processes image files that meet certain criteria, namely a resolution larger than 1600×1600. There can be upwards of 8,000 files in a source directory tree, but not all of them meet the resolution criterion. The tree can be at least 4 or 5 levels deep.

I've "tasked" the actual conversion process. However, I can't tell when the last task has finished.

I really don't want to create a task array, because it would contain thousands of files that don't meet the resolution criterion. It also seems wasteful to open an image file, check the resolution, add it (or not) to the task array, and then reopen it when it's time to process the file.

Here's the code.

using System;
using System.Data;
using System.Drawing;
using System.Drawing.Imaging;
using System.IO;
using System.Linq;
using System.Threading.Tasks;
using System.Windows.Forms;

namespace ResizeImages2
{
    public partial class Form1 : Form
    {
        private string _destDir;
        private string _sourceDir;
        private StreamWriter _logfile;
        private int _numThreads;

        private void button1_Click(object sender, EventArgs e)
        {
            _numThreads = 0;
            if (_logfile is null)
                _logfile = new StreamWriter("c:\\temp\\imagesLog.txt", append: true);

            _destDir = "c:\\inetpub\\wwwroot\\mywebsite\\";
            _sourceDir = "c:\\images\\";
            Directory.CreateDirectory(_destDir);
            var root = new DirectoryInfo(_sourceDir);
            WalkDirectoryTree(root); // <--async so it's going to return before all the threads are complete. Can't close the logfile here
            //_logfile.Close();
        }

        private async void WalkDirectoryTree(System.IO.DirectoryInfo root)
        {
            System.IO.FileInfo[] files = null;
            System.IO.DirectoryInfo[] subDirs = null;
            files = root.GetFiles("*-*-*.jpg"); //looks like a sku and is a jpg.
            if (files == null) return;
            foreach (System.IO.FileInfo fi in files)
            {
                _numThreads++;
                await Task.Run(() =>
                {
                    CreateImage(fi);
                    _numThreads--;
                    return true;
                });
            }

            // Now find all the subdirectories under this directory.
            subDirs = root.GetDirectories();

            foreach (System.IO.DirectoryInfo dirInfo in subDirs)
            {
                WalkDirectoryTree(dirInfo);
            }
        }

        private void CreateImage(FileSystemInfo f)
        {
            var originalBitmap = new Bitmap(f.FullName);
            if (originalBitmap.Width <= 1600 || originalBitmap.Height <= 1600) return;
            var new_wid = 1600;
            var new_hgt = 1600;
            using (var bm = new Bitmap(new_wid, new_hgt))
            {
                Point[] points =
                {
                    new Point(0, 0),
                    new Point(new_wid, 0),
                    new Point(0, new_hgt),
                };
                using (var gr = Graphics.FromImage(bm))
                {
                    gr.DrawImage(originalBitmap, points);
                }

                bm.SetResolution(96, 96);
                bm.Save($"{_destDir}{f.Name}", ImageFormat.Jpeg);
                bm.Dispose();
                originalBitmap.Dispose();
            }
            _logfile.WriteLine(f.Name);
        }
    }
}

UPDATE -- final result

    private async void button1_Click(Object sender, EventArgs e)
    {
        _logfile = new StreamWriter("c:\\temp\\imagesLog.txt", append: true);

        _sourceDir = "c:\\sourceImages\\";
        _destDir = "c:\\inetpub\\wwwroot\\images\\";
        var jpgFiles = Directory.EnumerateFiles(_sourceDir, "*-*-*.jpg", SearchOption.AllDirectories);
        var myPOpts = new ParallelOptions {MaxDegreeOfParallelism = 10};
        await Task.Run(() => //allows the user interface to be updated -- usually, file being processed
        {
            Parallel.ForEach(jpgFiles,myPOpts, f=>
            {
                CreateImage(new FileInfo(f));
            });
            _logfile.Close();
        });
    }
    private void CreateImage(FileSystemInfo f)
    {
        var originalBitmap = new Bitmap(f.FullName);
        if (originalBitmap.Width <= 1600 || originalBitmap.Height <= 1600)
        {
            originalBitmap.Dispose();
            return;
        }
        tbCurFile.Text = f.FullName;
        var new_wid = 1600;
        var new_hgt = 1600;
        using (var bm = new Bitmap(new_wid, new_hgt))
        {
            Point[] points =
            {
                new Point(0, 0),
                new Point(new_wid, 0),
                new Point(0, new_hgt),
            };
            var scount = 1;
            var saved = false;
            //Why the while/try/catch? Because we are copying files across the internet from a Dropbox folder in sync-only mode.
            //This means we are only working with a pointer until we actually open the file, and sometimes
            //the app gets ahead of itself: it tries to draw the new image before the original file is completely open,
            //so let's see if we can't mitigate that here.
            while (!saved && scount < 5)
            {
                try
                {
                    using (var gr = Graphics.FromImage(bm))
                    {
                        gr.DrawImage(originalBitmap, points);
                    }

                    saved = true;
                }
                catch
                {
                    scount++;
                }
            }

            bm.SetResolution(96, 96);
            scount = 1;
            saved = false;
            while (!saved && scount<5)
            {
                try
                {
                    bm.Save($"{_destDir}{f.Name}", ImageFormat.Jpeg);
                    saved = true;
                }
                catch
                {
                    scount++;
                }
            }                
            bm.Dispose();
            originalBitmap.Dispose();
        }
        _logfile.WriteLine($"{_collectionId}\\{f.Name}");
    }
Chris
  • You should use `Parallel.ForEach` instead as it will partition the tasks rather than trying to run them all at once, and you know when it's finished. See https://stackoverflow.com/questions/19102966/parallel-foreach-vs-task-run-and-task-whenall – Ian Mercer Apr 25 '21 at 14:43
  • That requires creating the array before processing the tasks. That's a huge waste. I have to open the file, check the resolution, close and dispose and add to the array if it meets the criteria. Then, the task would again open the file. Why open the file twice? – Chris Apr 25 '21 at 14:53
  • 2
    You don't have to create an array. You can use any `IEnumerable` including a class that `yield`s tasks/files/whatever – asaf92 Apr 25 '21 at 15:15
  • 1
    You can't `await` in `Parallel.ForEach` anyway, so it won't solve your problem. But an un-awaited `await Task.Run(() => ...)` inside a `foreach` is also a very bad solution: you will create 8000 tasks and kill your app's performance. Instead you need to enqueue your files into a thread-safe queue, create a finite number of worker threads (4? 8?) and have them process all the files. When all the workers go idle because the queue is empty, you are done. It's called the [Producer-Consumer pattern](https://www.dotnetcurry.com/dotnetcore/1509/async-dotnetcore-pattern); a sketch of that pattern follows these comments. – Artur Apr 25 '21 at 15:19
  • 1
    Here's another option for handling async inside parallel execution with limits: https://devblogs.microsoft.com/pfxteam/implementing-a-simple-foreachasync/ – Ian Mercer Apr 25 '21 at 15:30
  • asaf92: That still requires opening the source file twice. Once to check if it should be added to the task array/IEnumerable or whatever, and once when the source file is processed. – Chris Apr 25 '21 at 15:31
  • Only because you've mixed up the concerns of walking the tree and processing the files. Separate those concerns into (i) a recursive method that returns file paths and (ii) a method that processes a single file path. See also: https://learn.microsoft.com/en-us/dotnet/api/system.io.directory.enumeratefiles?view=net-5.0 – Ian Mercer Apr 25 '21 at 15:33
  • Ian, asaf92... I think I see what you're saying. I'm stuck on having the app process files while the traversal is happening. It takes a long time to traverse this directory structure, but it doesn't take any CPU. I wanted the conversion tasks to run while traversing the tree. That's where I'm stuck. – Chris Apr 25 '21 at 15:40
  • 1
    Docs: "The EnumerateFiles and GetFiles methods differ as follows: When you use EnumerateFiles, you can start enumerating the collection of names before the whole collection is returned. When you use GetFiles, you must wait for the whole array of names to be returned before you can access the array." – Ian Mercer Apr 25 '21 at 15:54
  • 2
    Thank you both! It was as simple as this: var jpgFiles = Directory.EnumerateFiles(fbd.SelectedPath, "*-*-*.jpg", SearchOption.AllDirectories); Parallel.ForEach(jpgFiles, f => { CreateImage(new FileInfo(f)); }); – Chris Apr 25 '21 at 15:55
  • What I have noticed now is the app taking up 15-20 gigabytes of memory instead of a gig or so. – Chris Apr 25 '21 at 16:06
  • 1
    You can use `new ParallelOptions { MaxDegreeOfParallelism = ... },` to limit the concurrency. – Ian Mercer Apr 25 '21 at 16:08
  • @IanMercer not only you *can* configure the `MaxDegreeOfParallelism`, but you [*should*](https://stackoverflow.com/questions/66261010/multiple-parallel-foreach-loops-in-net/66263583#66263583) configure it, unless you are OK with having a saturated `ThreadPool`. The [documentation](https://learn.microsoft.com/en-us/dotnet/api/system.threading.tasks.paralleloptions.maxdegreeofparallelism) suggests otherwise, but IMHO letting the `MaxDegreeOfParallelism` have its default `-1` value (meaning unbounded parallelism) is a bad suggestion. – Theodor Zoulias Apr 25 '21 at 16:14
  • `Parallel.ForEach()` is a good solution. But the literal answer to your question is to use a counter. See answers using that technique in the duplicates. – Peter Duniho Apr 25 '21 at 16:30
  • As a side note, you may get better performance if you save the resized images in a different physical storage than the one containing the original images. Reading and writing concurrently to the same physical storage is unlikely to scale well, especially if it's a classic hard disk. – Theodor Zoulias Apr 25 '21 at 16:34
  • The other issue is that this approach locks up the user interface. The original code allowed the interface to be updated -- to display the current file, for example. – Chris Apr 25 '21 at 17:08
  • You can launch a single `Task` to run the `Parallel.ForEach`. – Ian Mercer Apr 25 '21 at 17:15
  • So, not only did it let the interface update, it sped up the whole process immensely. And I don't know why. – Chris Apr 25 '21 at 17:30
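
For reference, here is a minimal sketch of the producer-consumer approach Artur describes in the comments, written with `System.Threading.Channels`. The worker count and the `ProcessFile` helper are illustrative assumptions, not code from the question:

    // Requires the System.Threading.Channels namespace (available as a NuGet
    // package on .NET Framework). Worker count and ProcessFile are placeholders.
    private async Task ConvertAllAsync(string sourceDir)
    {
        // Producer-consumer: the directory walk feeds a thread-safe queue while
        // a fixed number of workers drain it, so conversion overlaps traversal.
        var channel = Channel.CreateUnbounded<string>();

        var workers = Enumerable.Range(0, 4).Select(_ => Task.Run(async () =>
        {
            while (await channel.Reader.WaitToReadAsync())
                while (channel.Reader.TryRead(out var path))
                    ProcessFile(path); // hypothetical: open once, check resolution, resize, log
        })).ToArray();

        // EnumerateFiles is lazy, so files are enqueued as the walk discovers them.
        foreach (var path in Directory.EnumerateFiles(sourceDir, "*-*-*.jpg", SearchOption.AllDirectories))
            await channel.Writer.WriteAsync(path);

        channel.Writer.Complete();   // tell the workers no more files will arrive
        await Task.WhenAll(workers); // completes when every worker has drained the queue
    }

Note that `Parallel.ForEach` over the lazy `Directory.EnumerateFiles` enumerable, as in the update above, gives much the same effect with less code.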

1 Answer


One quick and dirty way of handling that is to use interlocked operations with a shared variable.


    private int _numberOfOutstandingOperations;
    
    private async void button1_Click(object sender, EventArgs e)
    {
        Interlocked.Exchange(ref _numberOfOutstandingOperations, 1);
        
        await WalkDirectoryTree(root);
        CompleteOperation();
    }
    
    
    private async Task WalkDirectoryTree(System.IO.DirectoryInfo root)
    {
        // ...   
        foreach (System.IO.FileInfo fi in files)
        {
            Interlocked.Increment(ref _numberOfOutstandingOperations);
            _ = Task.Run(() =>
            {
                CreateImage(fi);
                CompleteOperation();
            });
        }
        
        foreach (System.IO.DirectoryInfo dirInfo in subDirs)
        {
            await WalkDirectoryTree(dirInfo);
        }
    }
    
    private void CompleteOperation()
    {
        if (Interlocked.Decrement(ref _numberOfOutstandingOperations) == 0)
        {
            // Everything is done, signal a mutex or complete a TaskCompletionSource or whatever
        }
    }

The trick is to initialize the number of operations to 1, so that it completes only when all tasks are finished and all directories have been walked.
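
As a concrete illustration of the "complete a TaskCompletionSource" option mentioned in the comment above, the signalling could look roughly like this (the `_allDone` field is an assumed name, not part of the original answer):

    // Sketch only: completing a TaskCompletionSource when the counter hits zero.
    // _allDone is an illustrative field name, not from the answer above.
    private TaskCompletionSource<bool> _allDone;

    private async void button1_Click(object sender, EventArgs e)
    {
        _allDone = new TaskCompletionSource<bool>();
        Interlocked.Exchange(ref _numberOfOutstandingOperations, 1);

        await WalkDirectoryTree(root);
        CompleteOperation();

        await _allDone.Task; // resumes only after the last CreateImage task has finished
        _logfile.Close();    // now it is safe to close the log file
    }

    private void CompleteOperation()
    {
        if (Interlocked.Decrement(ref _numberOfOutstandingOperations) == 0)
            _allDone.TrySetResult(true);
    }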

Kevin Gosse