16

I am trying to remove a large number of files from a location (by large I mean over 100000), whereby the action is initiated from a web page. Obviously I could just use

// NOTE: Directory.GetFiles materialises the ENTIRE file list up front,
// which is exactly the performance problem described below.
string[] files = System.IO.Directory.GetFiles("path with files to delete");
foreach (var file in files) {
    System.IO.File.Delete(file); // was "IO.File.Delete", which does not resolve without a "using IO = System.IO;" alias
}

Directory.GetFiles http://msdn.microsoft.com/en-us/library/wz42302f.aspx

This method has already been posted a few times: How to delete all files and folders in a directory? and Delete files from directory if filename contains a certain word

But the problem with this method is that if you have say a hundred thousand files it becomes a performance issue as it has to generate all of the filepaths first before looping through them.

Added to this, if a web page is waiting for a response from a method which is performing this, as you can imagine it will look a bit rubbish!

One thought I had was to wrap this up in an asynchronous web service call and when it completes it fires back a response to the web page to say that they have been removed? Maybe put the delete method in a separate thread? Or maybe even use a separate batch process to perform the delete?

I have a similar issue when trying to count the number of files in a directory - if it contains a large number of files.

I was wondering if this is all a bit overkill? I.e. is there a simpler method to deal with this? Any help would be appreciated.

Community
  • 1
  • 1
Aim Kai
  • 2,934
  • 1
  • 22
  • 34
  • 1
    I am not sure about C#, but generally its not a good idea to have large number of files in a single directory. – Sands Feb 02 '10 at 16:47
  • You've mentioned it yourself, `asynchronous` is the key word. – Pool Feb 02 '10 at 16:51
  • To "Sands" - trust me I didn't make that decision!! :) – Aim Kai Feb 02 '10 at 16:53
  • @Sands: I think that's precisely why it would be useful to have a performant way of *deleting* a large number of files in a single directory, in case you do find yourself in that position. – Dan Tao Feb 02 '10 at 17:31
  • @Sands sometimes you have to deal with legacy code or existing environment. – labilbe Aug 21 '21 at 17:32

10 Answers10

12
  1. GetFiles is extremely slow.
  2. If you are invoking it from a website, you might just throw a new Thread which does this trick.
  3. An ASP.NET AJAX call that returns whether there are still matching files, can be used to do basic progress updates.

Below is an implementation of a fast Win32 wrapping for GetFiles; use it in combination with a new Thread and an AJAX function like: GetFilesUnmanaged(@"C:\myDir", "*.txt").GetEnumerator().MoveNext().

Usage

// Fire-and-forget worker thread so the web request is not blocked while the
// directory is being emptied. A plain lambda is enough here — the original
// (MethodInvoker) cast pulled in System.Windows.Forms, which is not
// available (or appropriate) in a web application.
Thread workerThread = new Thread(() =>
{
    foreach (var file in GetFilesUnmanaged(@"C:\myDir", "*.txt"))
        File.Delete(file);
});
workerThread.Start();
//just go on with your normal requests, the directory will be cleaned while the user can just surf around

   public static IEnumerable<string> GetFilesUnmanaged(string directory, string filter)
        {
            return new FilesFinder(Path.Combine(directory, filter))
                .Where(f => (f.Attributes & FileAttributes.Normal) == FileAttributes.Normal
                    || (f.Attributes & FileAttributes.Archive) == FileAttributes.Archive)
                .Select(s => s.FileName);
        }
    }


public class FilesEnumerator : IEnumerator<FoundFileData>
{
    #region Interop imports

    private const int ERROR_FILE_NOT_FOUND = 2;
    private const int ERROR_NO_MORE_FILES = 18;
    private static readonly IntPtr INVALID_HANDLE_VALUE = new IntPtr(-1);

    [DllImport("kernel32.dll", SetLastError = true, CharSet = CharSet.Auto)]
    private static extern IntPtr FindFirstFile(string lpFileName, out WIN32_FIND_DATA lpFindFileData);

    [DllImport("kernel32.dll", SetLastError = true, CharSet = CharSet.Auto)]
    private static extern bool FindNextFile(IntPtr hFindFile, out WIN32_FIND_DATA lpFindFileData);

    // Find handles MUST be released with FindClose, never CloseHandle. The
    // original wrapped the raw pointer in SafeFileHandle, whose ReleaseHandle
    // calls CloseHandle — the wrong API for FindFirstFile handles.
    [DllImport("kernel32.dll", SetLastError = true)]
    private static extern bool FindClose(IntPtr hFindFile);

    #endregion

    #region Data Members

    private readonly string _fileName;          // directory + wildcard pattern passed to FindFirstFile
    private IntPtr _findHandle;                 // INVALID_HANDLE_VALUE while no open find handle exists
    private bool _started;                      // true once MoveNext() has been called at least once
    private WIN32_FIND_DATA _win32FindData;     // data for the entry the enumerator is positioned on

    #endregion

    public FilesEnumerator(string fileName)
    {
        _fileName = fileName;
        _findHandle = INVALID_HANDLE_VALUE;
        _started = false;
        _win32FindData = new WIN32_FIND_DATA();
    }

    #region IEnumerator<FoundFileData> Members

    public FoundFileData Current
    {
        get
        {
            if (!_started)
                throw new InvalidOperationException("MoveNext() must be called first");

            return new FoundFileData(ref _win32FindData);
        }
    }

    object IEnumerator.Current
    {
        get { return Current; }
    }

    /// <summary>
    /// Advances to the next matching entry. Returns false when there are no
    /// (more) matches; throws Win32Exception on any other native failure.
    /// </summary>
    public bool MoveNext()
    {
        if (!_started)
        {
            _started = true;
            _findHandle = FindFirstFile(_fileName, out _win32FindData);
            if (_findHandle == INVALID_HANDLE_VALUE)
            {
                int lastError = Marshal.GetLastWin32Error();
                if (lastError == ERROR_FILE_NOT_FOUND)
                    return false; // no match at all — an empty, not erroneous, result

                throw new Win32Exception(lastError);
            }
            return true;
        }

        // Guard: the original called FindNextFile even after FindFirstFile had
        // failed, passing an invalid handle to the native API.
        if (_findHandle == INVALID_HANDLE_VALUE)
            return false;

        if (!FindNextFile(_findHandle, out _win32FindData))
        {
            int lastError = Marshal.GetLastWin32Error();
            if (lastError == ERROR_NO_MORE_FILES)
                return false;

            throw new Win32Exception(lastError);
        }

        return true;
    }

    /// <summary>
    /// Repositions the enumerator before the first element; the next
    /// MoveNext() restarts the search. (The original closed the handle but
    /// left the enumerator in a state that could not be restarted, and threw
    /// NullReferenceException when called before MoveNext().)
    /// </summary>
    public void Reset()
    {
        CloseFindHandle();
        _started = false;
    }

    public void Dispose()
    {
        CloseFindHandle();
    }

    // Releases the native find handle exactly once, via FindClose.
    private void CloseFindHandle()
    {
        if (_findHandle != IntPtr.Zero && _findHandle != INVALID_HANDLE_VALUE)
        {
            FindClose(_findHandle);
            _findHandle = INVALID_HANDLE_VALUE;
        }
    }

    #endregion
}

/// <summary>
/// Enumerable wrapper over FilesEnumerator so the Win32 find can be consumed
/// with foreach / LINQ. Each GetEnumerator() call starts a fresh search.
/// </summary>
public class FilesFinder : IEnumerable<FoundFileData>
{
    // Full search pattern (directory + wildcard) handed to each enumerator.
    readonly string _fileName;

    public FilesFinder(string fileName)
    {
        _fileName = fileName;
    }

    public IEnumerator<FoundFileData> GetEnumerator() => new FilesEnumerator(_fileName);

    IEnumerator IEnumerable.GetEnumerator() => GetEnumerator();
}

/// <summary>
/// Managed snapshot of one WIN32_FIND_DATA entry (name, attributes,
/// timestamps and size) produced by FilesEnumerator.
/// </summary>
public class FoundFileData
{
    public string AlternateFileName;
    public FileAttributes Attributes;
    public DateTime CreationTime;
    public string FileName;
    public DateTime LastAccessTime;
    public DateTime LastWriteTime;
    public UInt64 Size;

    internal FoundFileData(ref WIN32_FIND_DATA win32FindData)
    {
        Attributes = (FileAttributes)win32FindData.dwFileAttributes;
        CreationTime = ToDateTime(win32FindData.ftCreationTime);
        LastAccessTime = ToDateTime(win32FindData.ftLastAccessTime);
        LastWriteTime = ToDateTime(win32FindData.ftLastWriteTime);
        Size = ((UInt64)win32FindData.nFileSizeHigh << 32) + win32FindData.nFileSizeLow;
        FileName = win32FindData.cFileName;
        AlternateFileName = win32FindData.cAlternateFileName;
    }

    // Combines the two 32-bit halves of a FILETIME into a local-time DateTime,
    // same as the inline shift-and-add expressions in the original.
    private static DateTime ToDateTime(FILETIME fileTime)
    {
        UInt64 ticks = ((UInt64)fileTime.dwHighDateTime << 32) + (UInt64)fileTime.dwLowDateTime;
        return DateTime.FromFileTime((long)ticks);
    }
}

/// <summary>
/// Safely wraps handles that need to be closed via FindClose() WIN32 method (obtained by FindFirstFile())
/// </summary>
/// <summary>
/// Safely wraps handles that need to be closed via the Win32 FindClose()
/// function (handles obtained from FindFirstFile()).
/// </summary>
public class SafeFindFileHandle : SafeHandleZeroOrMinusOneIsInvalid
{
    // ReleaseHandle can run during finalization, where marshalling another
    // SafeHandle is unsafe — pass the raw IntPtr instead of `this`.
    [DllImport("kernel32.dll", SetLastError = true)]
    private static extern bool FindClose(IntPtr hFindFile);

    public SafeFindFileHandle(bool ownsHandle)
        : base(ownsHandle)
    {
    }

    /// <summary>
    /// Wraps an existing handle returned by FindFirstFile. The original class
    /// had no way to attach a raw handle at all, which made it unusable.
    /// </summary>
    public SafeFindFileHandle(IntPtr preexistingHandle, bool ownsHandle)
        : base(ownsHandle)
    {
        SetHandle(preexistingHandle);
    }

    protected override bool ReleaseHandle()
    {
        // `handle` is the protected raw-pointer field from SafeHandle.
        return FindClose(handle);
    }
}

// The CharSet must match the CharSet of the corresponding PInvoke signature
[StructLayout(LayoutKind.Sequential, CharSet = CharSet.Auto)]
public struct WIN32_FIND_DATA
{
    public uint dwFileAttributes;    // FILE_ATTRIBUTE_* flags for the found entry
    public FILETIME ftCreationTime;
    public FILETIME ftLastAccessTime;
    public FILETIME ftLastWriteTime;
    public uint nFileSizeHigh;       // high 32 bits of the file size
    public uint nFileSizeLow;        // low 32 bits of the file size
    public uint dwReserved0;
    public uint dwReserved1;
    // MAX_PATH (260) characters, including the terminating NUL
    [MarshalAs(UnmanagedType.ByValTStr, SizeConst = 260)]
    public string cFileName;
    // Classic 8.3 short name: 13 characters plus the terminating NUL
    [MarshalAs(UnmanagedType.ByValTStr, SizeConst = 14)]
    public string cAlternateFileName;
}
Jan Jongboom
  • 26,598
  • 9
  • 83
  • 120
  • If it takes a long time, the HTTP request can still timeout, though. – Neil Barnwell Feb 02 '10 at 16:51
  • I think the idea of wrapping an unmanaged chunk of code is certainly one way to go. But I still have a problem with working out if the process has finished or not. I guess I could put this in an web service call! Thanks for the response though Jan - I'll have a look at this code.. :) – Aim Kai Feb 02 '10 at 17:07
  • You can determine whether the process is finished, using an ASP.NET AJAX call that calls `GetFilesUnmanaged(@"C:\myDir", "*.txt").GetEnumerator().MoveNext()`; it's a very cheap call in contrary to default `GetFiles`, and if it returns something; the process hasn't finished yet :-). – Jan Jongboom Feb 03 '10 at 06:08
4

Can you put all your files in the same directory?

If so, why don't you just call Directory.Delete(string,bool) on the subdir you want to delete?

If you've already got a list of file paths you want to get rid of, you might actually get better results by moving them to a temp dir then deleting them rather than deleting each file manually.

Cheers, Florian

Florian Doyon
  • 4,146
  • 1
  • 27
  • 37
  • Would I have to use the System.IO.Directory.GetFiles() method to get all the files I have to move? as in the following example? http://msdn.microsoft.com/en-us/library/cc148994.aspx or http://www.eggheadcafe.com/community/aspnet/2/63950/moving-files-from-one-fol.aspx This would just cause the same performance issue I was talking about above wouldn't it? I guess alternatively I could use a script such as rmdir /q /s - might be worth looking into? – Aim Kai Feb 02 '10 at 19:20
  • I think it would cause a perf slowdown, but not as drastic as deleting all the files one by one. Moving a file is very cheap, deleting it not so, so you should still gain some perfs by moving the files to the directory that you will then delete. The best approach would be to actually create the files in the same directory in the first place, if you can find any way to group the files according to the way they're going to be deleted when you get them. – Florian Doyon Feb 02 '10 at 21:29
  • Yep agree with you on the performance difference between moving and deleting files. Unfortunately the creation of the files is not directly under my control at the moment.. – Aim Kai Feb 03 '10 at 08:51
2

Having more than 1000 files in a directory is a huge problem.

If you are in the development stages now, you should consider putting in an algo which will put the files into a random folder (inside your root folder) with a surety of the number of files in that folder to be under 1024.

Something like

// NOTE: this sample is Java, not C#. It spreads newly created files across
// numVolumes x numSubVolumes randomly chosen sub-directories so that no
// single directory accumulates too many files.
public UserVolumeGenerator()
    {
        SetNumVolumes((short)100);
        SetNumSubVolumes((short)1000);
        SetVolumesRoot("/var/myproj/volumes");
    }

    // Returns a relative path "<volume>/<subVolume>" chosen uniformly at random.
    public String GenerateVolume()
    {
        int volume = random.nextInt(GetNumVolumes());
        int subVolume = random.nextInt(GetNumSubVolumes());

        return Integer.toString(volume) + "/" + Integer.toString(subVolume);
    }

    // Seeded once at class load; shared by every GenerateVolume() call.
    private static final Random random = new Random(System.currentTimeMillis());

While doing this, also make sure that each time you create a file, add it to a HashMap or list simultaneously (the path). Periodically serialize this using something like JSON.net to the filesystem(integrity’s sake, so that even if your service fails, you can get back the file list from the serialized form).

When you want to clean up the files or query among them, first do a lookup of this HashMap or list and then act on the file. This is better than System.IO.Directory.GetFiles

Cherian
  • 19,107
  • 12
  • 55
  • 69
2

Some improvements to speed it up in the back end:

  • Use Directory.EnumerateFiles(..) : this will iterate through files without waiting after all files have been retrieved.

  • Use Parallel.Foreach(..) : this will delete files simultaneously.

It should be faster but apparently the HTTP request would still be timeout with the large number of files so the back end process should be executed in separate worker thread and notify result back to web client after finishing.

Minh Nguyen
  • 2,106
  • 1
  • 28
  • 34
1

Do it in a separate thread, or post a message to a queue (maybe MSMQ?) where another application (maybe a windows service) is subscribed to that queue and performs the commands (i.e. "Delete e:\dir*.txt") in its own process.

The message should probably just include the folder name. If you use something like NServiceBus and transactional queues, then you can post your message and return immediately as long as the message was posted successfully. If there is a problem actually processing the message, then it'll retry and eventually go on an error queue that you can watch and perform maintenance on.

Neil Barnwell
  • 41,080
  • 29
  • 148
  • 220
  • Yep definitely separate thread!! :) I like your idea about using MSMQ! Will investigate and reply back! – Aim Kai Feb 02 '10 at 17:10
  • No, I don't recommend using another thread on an IIS app pool, I recommend a totally separate process, where you use something like MSMQ (i.e. with NServiceBus) to send that process a message telling it to perform the deletion. If you use NSB and transactional MSMQ queues, then you have safety all the way through that the message has been processed. – Neil Barnwell Feb 03 '10 at 00:23
  • Sorry I misunderstood you.. :)! – Aim Kai Feb 03 '10 at 08:48
0

Boot the work out to a worker thread and then return your response to the user.

I'd flag up an application variable to say that you are doing "the big delete job" to stop running multiple threads doing the same work. You could then poll another page which could give you a progress update of the number of files removed so far too, if you wanted to?

Just a query but why so many files?

Pete Duncanson
  • 3,208
  • 2
  • 25
  • 35
  • 100k files is not much, i currently work on an application that shuffles around 2-3 million files that are (by spec) split into directories of 100k-150k files. rsync requires 60 minutes for a dry run. –  Feb 02 '10 at 16:50
  • Its alot to be doing via a link/button on a website is all I meant :) – Pete Duncanson Feb 03 '10 at 17:21
0

You could create a simple ajax webmethod in your aspx code behind and call it with javascript.

Steve Danner
  • 21,818
  • 7
  • 41
  • 51
0

The best choice (imho) would be to create a separate process to delete/count the files and check on the progress by polling, otherwise you might get problems with browser timeouts.

0

Wow. I think you are definitely on the right track with having some other service or entity taking care of the delete. In doing so you could also provide methods for tracking the process of the delete and showing the result to the user using asynch javascript.

As others have said putting this in another process is a great idea. You do not want IIS hogging resources using such long running jobs. Another reason for doing so is security. You might not want to give your work process that ability to delete files from the disk.

smaclell
  • 4,568
  • 7
  • 41
  • 49
0

I know it's an old thread but in addition to Jan Jongboom's answer I propose a similar solution which is quite performant and more universal. My solution was built to quickly remove a directory structure in DFS with support for long file names (>255 chars). The first difference is in the DLL import declarations.

// P/Invoke declarations with Unicode ("W") entry points so that \\?\-prefixed
// long (> MAX_PATH) names are accepted. Fixes from the original: "MashalAs" /
// "SetLAstError" typos, missing ')' and ';' on several declarations, and the
// nonexistent kernel32 export "DeleteDirectory".
[DllImport("kernel32.dll", CharSet = CharSet.Unicode, SetLastError = true)]
static extern IntPtr FindFirstFile(string lpFileName, ref WIN32_FIND_DATA lpFindFileData);

[DllImport("kernel32.dll", CharSet = CharSet.Unicode, SetLastError = true)]
static extern bool FindNextFile(IntPtr hFindFile, ref WIN32_FIND_DATA lpFindFileData);

[DllImport("kernel32.dll", CharSet = CharSet.Unicode, SetLastError = true)]
[return: MarshalAs(UnmanagedType.Bool)]
static extern bool DeleteFile(string lpFileName);

// Win32 has no "DeleteDirectory"; the real API is RemoveDirectory. The managed
// name stays DeleteDirectory (this class also declares a public
// RemoveDirectory(string) method, which would collide), mapped via EntryPoint.
[DllImport("kernel32.dll", EntryPoint = "RemoveDirectoryW", CharSet = CharSet.Unicode, SetLastError = true)]
[return: MarshalAs(UnmanagedType.Bool)]
static extern bool DeleteDirectory(string lpPathName);

[DllImport("kernel32.dll", CharSet = CharSet.Unicode, SetLastError = true)]
static extern bool FindClose(IntPtr hFindFile);

[DllImport("kernel32.dll", CharSet = CharSet.Unicode, SetLastError = true)]
static extern uint GetFileAttributes(string lpFileName);

[DllImport("kernel32.dll", CharSet = CharSet.Unicode, SetLastError = true)]
static extern bool SetFileAttributes(string lpFileName, uint dwFileAttributes);

WIN32_FIND_DATA structure is also slightly different:

    // Unicode variant of WIN32_FIND_DATA matching the CharSet.Unicode P/Invoke
    // signatures above. Field order and sizes must mirror the native struct.
    [StructLayout(LayoutKind.Sequential, CharSet = CharSet.Unicode), Serializable, BestFitMapping(false)]
    internal struct WIN32_FIND_DATA
    {
        internal FileAttributes dwFileAttributes;
        internal FILETIME ftCreationTime;
        internal FILETIME ftLastAccessTime;
        internal FILETIME ftLastWriteTime;
        internal int nFileSizeHigh;    // high 32 bits of the file size
        internal int nFileSizeLow;     // low 32 bits of the file size
        internal int dwReserved0;
        internal int dwReserved1;
        // MAX_PATH (260) characters, including the terminating NUL
        [MarshalAs(UnmanagedType.ByValTStr, SizeConst = 260)]
        internal string cFileName;
        // 8.3 short name: 13 characters plus the terminating NUL
        [MarshalAs(UnmanagedType.ByValTStr, SizeConst = 14)]
        internal string cAlternative;
    }

In order to use long paths the path needs to be prepared as follows:

/// <summary>
/// Entry point: normalises a UNC directory path into long-path form
/// (\\?\UNC\server\share\...) and recursively deletes its contents.
/// NOTE(review): the \\?\UNC\ prefix is only valid for UNC paths; a local
/// path (C:\...) would need the plain \\?\ prefix — confirm callers only
/// pass UNC/DFS paths, as the answer's description suggests.
/// </summary>
public void RemoveDirectory(string directoryPath)
{
    // Strip surrounding spaces and slashes before prepending the prefix.
    char[] trimChars = @" \/".ToCharArray();
    string longPath = @"\\?\UNC\" + directoryPath.Trim(trimChars);
    SearchAndDelete(longPath);
}

and here's the main method:

/// <summary>
/// Recursively deletes everything beneath <paramref name="path"/>: descends
/// into subdirectories first, deletes the collected files in parallel, then
/// removes the (now empty) directory itself.
/// </summary>
/// <exception cref="AggregateException">One or more files could not be deleted.</exception>
/// <exception cref="Exception">The directory itself could not be removed.</exception>
private void SearchAndDelete(string path)
{
    const int FILE_ATTRIBUTE_DIRECTORY = 0x00000010;
    const uint FILE_ATTRIBUTE_READONLY = 0x00000001;
    const uint FILE_ATTRIBUTE_HIDDEN = 0x00000002;

    var fd = new WIN32_FIND_DATA();
    var invalidHandle = new IntPtr(-1);
    var filesToRemove = new List<string>();
    var handle = IntPtr.Zero;
    try
    {
        // Original had a "FindFirsFile" typo here.
        handle = FindFirstFile(path + @"\*", ref fd);
        if (handle == invalidHandle) return;
        bool found;
        do
        {
            var current = fd.cFileName;
            if (((int)fd.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) != 0)
            {
                // Recurse into real subdirectories; skip "." and "..".
                if (current != "." && current != "..")
                {
                    SearchAndDelete(Path.Combine(path, current));
                }
            }
            else
            {
                filesToRemove.Add(Path.Combine(path, current));
            }
            found = FindNextFile(handle, ref fd);
        } while (found);
    }
    finally
    {
        FindClose(handle);
    }

    // Delete the collected files in parallel, clearing hidden/read-only
    // attributes first so DeleteFile does not fail on protected files.
    // (The original wrapped this in a `try` with no catch/finally — a compile
    // error — and had a malformed Parallel.ForEach lambda.)
    object lockSource = new Object();
    var exceptions = new List<Exception>();
    Parallel.ForEach(filesToRemove, file =>
    {
        var attrs = GetFileAttributes(file);
        attrs &= ~FILE_ATTRIBUTE_HIDDEN;
        attrs &= ~FILE_ATTRIBUTE_READONLY;
        SetFileAttributes(file, attrs);
        if (!DeleteFile(file))
        {
            var msg = string.Format("Cannot remove file {0}.{1}{2}",
                file.Replace(@"\\?\UNC", @"\"), Environment.NewLine,
                new Win32Exception(Marshal.GetLastWin32Error()).Message);
            lock (lockSource)
            {
                exceptions.Add(new Exception(msg));
            }
        }
    });
    if (exceptions.Any())
    {
        throw new AggregateException(exceptions);
    }

    var dirAttr = GetFileAttributes(path);
    dirAttr &= ~FILE_ATTRIBUTE_HIDDEN;
    dirAttr &= ~FILE_ATTRIBUTE_READONLY;
    SetFileAttributes(path, dirAttr);
    // DeleteDirectory is the P/Invoke mapped to Win32 RemoveDirectory; calling
    // the managed RemoveDirectory(string) here, as the original did, would
    // recurse endlessly (and did not even compile — it returns void).
    if (!DeleteDirectory(path))
    {
        throw new Exception(
            string.Format("Cannot remove directory {0}.", path.Replace(@"\\?\UNC", @"\")),
            new Win32Exception(Marshal.GetLastWin32Error()));
    }
}

of course we could go further and store directories in separate list outside of that method and delete them later in another method which could look like this:

/// <summary>
/// Removes an already-emptied directory tree: directories are grouped by
/// depth and deleted deepest-first (in parallel within each level) so that
/// children are always gone before their parents.
/// </summary>
/// <exception cref="AggregateException">One or more directories could not be removed.</exception>
private void DeleteDirectoryTree(List<string> directories)
{
        // Depth = number of path segments. The original grouped/ordered by the
        // string[] returned from Split itself, which neither groups by value
        // (arrays have reference equality) nor sorts (string[] is not
        // IComparable) — a guaranteed runtime failure.
        var data = directories.GroupBy(d => d.Split('\\').Length,
            d => d,
            (depth, dirs) => new
            {
                Level = depth,
                Directories = dirs.ToList()
            }).OrderByDescending(l => l.Level);
        var exceptions = new List<Exception>();
        var lockSource = new Object();
        foreach (var level in data)
        {
            Parallel.ForEach(level.Directories, dir =>
            {
                // Clear hidden/read-only so removal cannot fail on protected dirs.
                var attrs = GetFileAttributes(dir);
                attrs &= ~(uint)0x00000002; // hidden
                attrs &= ~(uint)0x00000001; // read-only
                SetFileAttributes(dir, attrs);
                // DeleteDirectory is the P/Invoke mapped to Win32 RemoveDirectory.
                if (!DeleteDirectory(dir))
                {
                    var msg = string.Format("Cannot remove directory {0}.{1}{2}", dir.Replace(@"\\?\UNC\", string.Empty), Environment.NewLine, new Win32Exception(Marshal.GetLastWin32Error()).Message);
                    lock (lockSource)
                    {
                        exceptions.Add(new Exception(msg));
                    }
                }
            });
        }
        if (exceptions.Any())
        {
            throw new AggregateException(exceptions);
        }
}