210

I recently have been moving a bunch of MP3s from various locations into a repository. I had been constructing the new file names using the ID3 tags (thanks, TagLib-Sharp!), and I noticed that I was getting a System.NotSupportedException:

"The given path's format is not supported."

This was generated by either File.Copy() or Directory.CreateDirectory().

It didn't take long to realize that my file names needed to be sanitized. So I did the obvious thing:

/// <summary>
/// Replaces characters that are invalid in the directory part or the file-name part of
/// <paramref name="path"/> with <paramref name="replaceChar"/>.
/// </summary>
/// <param name="path">The path to sanitize.</param>
/// <param name="replaceChar">The character substituted for every invalid character.</param>
/// <returns>
/// The sanitized directory and file name joined with the platform separator, or
/// <paramref name="path"/> unchanged when it has no directory information (null or a bare root).
/// </returns>
public static string SanitizePath_(string path, char replaceChar)
{
    // GetDirectoryName returns null for a null path or a bare root; there is nothing to split then.
    string dir = Path.GetDirectoryName(path);
    if (dir == null)
        return path;

    foreach (char c in Path.GetInvalidPathChars())
        dir = dir.Replace(c, replaceChar);

    string name = Path.GetFileName(path) ?? string.Empty;
    foreach (char c in Path.GetInvalidFileNameChars())
        name = name.Replace(c, replaceChar);

    // GetDirectoryName strips the trailing separator, so plain concatenation would glue
    // "dir" and "name" into "dirname". Path.Combine puts the separator back.
    return Path.Combine(dir, name);
}

To my surprise, I continued to get exceptions. It turned out that ':' is not in the set of Path.GetInvalidPathChars(), because it is valid in a path root. I suppose that makes sense - but this has to be a pretty common problem. Does anyone have some short code that sanitizes a path? The most thorough solution I've come up with is this, but it feels like it is probably overkill.

    /// <summary>
    /// Replaces every invalid character in each segment of <paramref name="path"/> with
    /// <paramref name="replaceChar"/>, leaving the path root untouched.
    /// </summary>
    /// <param name="path">The path to sanitize.</param>
    /// <param name="replaceChar">The character substituted for every invalid character.</param>
    /// <returns>The root followed by the sanitized segments, re-joined with the directory separator.</returns>
    public static string SanitizePath(string path, char replaceChar)
    {
        // Build the combined bad-character list on first use. Both sets are needed
        // because ":" is missing from the invalid *path* characters.
        if (_BadChars == null)
        {
            _BadChars = new List<char>(Path.GetInvalidFileNameChars());
            _BadChars.AddRange(Path.GetInvalidPathChars());
            _BadChars = Utility.GetUnique<char>(_BadChars);
        }

        // The root (which may legitimately contain ':') is set aside and never scrubbed.
        string root = Path.GetPathRoot(path);
        string remainder = path.Remove(0, root.Length);

        // The separator itself is not valid inside a file name, so each segment
        // between separators is scrubbed on its own.
        string[] segments = remainder.Split(new char[] { Path.DirectorySeparatorChar });
        List<string> cleaned = new List<string>(segments.Length);
        foreach (string segment in segments)
        {
            string scrubbed = segment;
            foreach (char bad in _BadChars)
            {
                scrubbed = scrubbed.Replace(bad, replaceChar);
            }
            cleaned.Add(scrubbed);
        }

        string separator = Path.DirectorySeparatorChar.ToString();
        return root + Utility.Join(cleaned, separator);
    }

Any improvements to make this function faster and less baroque would be much appreciated.

Soner Gönül
  • 97,193
  • 102
  • 206
  • 364
Jason Sundram
  • 12,225
  • 19
  • 71
  • 86
  • possible duplicate of [How to remove illegal characters from path and filenames?](http://stackoverflow.com/questions/146134/how-to-remove-illegal-characters-from-path-and-filenames) – Manuel Mar 07 '13 at 08:25

14 Answers14

369

To clean up a file name you could do this

/// <summary>
/// Replaces each run of invalid file-name characters in <paramref name="name"/>, and any
/// trailing run of periods (together with invalid characters directly before it), with a single underscore.
/// </summary>
/// <param name="name">The candidate file name (no directory part).</param>
/// <returns>The cleaned-up file name.</returns>
private static string MakeValidFileName( string name )
{
   // Escape the platform's invalid characters so they can sit inside a regex character class.
   char[] forbidden = System.IO.Path.GetInvalidFileNameChars();
   string escaped = System.Text.RegularExpressions.Regex.Escape( new string( forbidden ) );

   // First alternative: trailing periods; second alternative: any run of invalid characters.
   var sanitizer = new System.Text.RegularExpressions.Regex( string.Format( @"([{0}]*\.+$)|([{0}]+)", escaped ) );

   return sanitizer.Replace( name, "_" );
}
Richard Ev
  • 52,939
  • 59
  • 191
  • 278
Andre
  • 4,560
  • 2
  • 21
  • 27
  • 1
    "Remarks: The array returned from this method is not guaranteed to contain the complete set of characters that are invalid in file and directory names." Source: [Path.GetInvalidFileNameChars Method](http://msdn.microsoft.com/en-us/library/system.io.path.getinvalidfilenamechars.aspx) – Mark Byers Oct 19 '11 at 11:28
  • 22
    Great method. Don't forget though that reserved words will still bite you, and you will be left scratching your head. Source: [Wikipedia Filename reserved words](http://en.wikipedia.org/wiki/Filename#Reserved_characters_and_words) – Spud May 28 '12 at 11:47
  • 10
    Periods are invalid characters if they are at the end of the file name so `GetInvalidFileNameChars` does not include them. It does not throw a exception in windows, it just strips them off, but it could cause unexpected behavior if you are expecting the period to be there. I modified the regex to handle that case to cause `.` to be considered one of the invalid characters if it is at the end of the string. – Scott Chamberlain Nov 26 '12 at 17:23
  • Trailing periods may be invalid, but leading periods are valid. For example, Apache web server uses [.htaccess](http://www.htaccess-guide.com) configurations files, which Windows Explorer erroneously says are invalid (but can be named that way via command prompt). – Elaskanator Oct 07 '19 at 13:48
  • Note that the escaping rules are different in square brackets: https://stackoverflow.com/a/10593427/880990. – Olivier Jacot-Descombes Apr 13 '23 at 14:55
178

A shorter solution:

// Characters the current platform rejects in file names.
var invalids = System.IO.Path.GetInvalidFileNameChars();
// Cut the name at every invalid character (empty fragments are dropped), stitch the
// fragments back together with '_' and strip any trailing periods.
var fragments = origFileName.Split(invalids, StringSplitOptions.RemoveEmptyEntries);
var newName = String.Join("_", fragments).TrimEnd('.');
DenNukem
  • 8,014
  • 3
  • 40
  • 45
  • 4
    This is better than the top answer especially for ASP.NET Core which might return different characters based on platform. – Alexei - check Codidact May 22 '19 at 12:57
  • 2
    Why did you add `.TrimEnd('.')`? – Emanuele Jun 15 '22 at 09:28
  • Emanuele, because a trailing dot is invalid (at least in Windows) – Graeme Wicksted Dec 09 '22 at 17:28
  • Note that this will likely cause unintended side-effects if running cross-platform. The invalid file name characters include forward slashes `/`, which are perfectly valid on most non-Windows platforms. – Dave Jarvis Feb 23 '23 at 23:25
  • @Dave Jarvis, the character `/` is a path-separator on Unix-derived systems, and as such is forbidden in _file_ names - in fact, '/' and `\0` (NUL) are the only byte-values that _cannot_ be put in the filename field of directory entry. But, when sanitising file names for storage, I prefer to use the strictest criteria, and remove anything that is invalid on any OS that the file is likely to live on... or is likely to later be copied onto, so that means colons and backslashes (which are valid on Unixy-systems) must go as well – KrisW Apr 11 '23 at 16:53
  • No luck with this. From: D:\\Users\\Richard.Bianco\\Files\\richard.bianco\\1692136939290\\importtest.txt To: D_Users_Richard.Bianco_Files_richard.bianco_1692136939290_importtest.txt – Rich Bianco Aug 15 '23 at 22:17
98

Based on Andre's excellent answer but taking into account Spud's comment on reserved words, I made this version:

/// <summary>
/// Strip illegal chars and reserved words from a candidate filename (should not include the directory path)
/// </summary>
/// <remarks>
/// http://stackoverflow.com/questions/309485/c-sharp-sanitize-file-name
/// </remarks>
public static string CoerceValidFileName(string filename)
{
    // Character class holding every character the platform rejects in file names.
    var invalidChars = Regex.Escape(new string(Path.GetInvalidFileNameChars()));
    var invalidReStr = string.Format(@"[{0}]+", invalidChars);

    var reservedWords = new []
    {
        "CON", "PRN", "AUX", "CLOCK$", "NUL", "COM0", "COM1", "COM2", "COM3", "COM4",
        "COM5", "COM6", "COM7", "COM8", "COM9", "LPT0", "LPT1", "LPT2", "LPT3", "LPT4",
        "LPT5", "LPT6", "LPT7", "LPT8", "LPT9"
    };

    // Each run of invalid characters collapses to a single underscore.
    var sanitisedNamePart = Regex.Replace(filename, invalidReStr, "_");
    foreach (var reservedWord in reservedWords)
    {
        // Escape the word: the '$' in "CLOCK$" would otherwise be read as an end-of-string anchor.
        // "(\.|$)" also catches a reserved word with no extension (e.g. "COM1"); the captured
        // dot, if any, is put back through "$1".
        var reservedWordPattern = string.Format(@"^{0}(\.|$)", Regex.Escape(reservedWord));
        sanitisedNamePart = Regex.Replace(sanitisedNamePart, reservedWordPattern, "_reservedWord_$1", RegexOptions.IgnoreCase);
    }

    return sanitisedNamePart;
}

And these are my unit tests

[Test]
public void CoerceValidFileName_SimpleValid()
{
    // A name without invalid characters must come back unchanged.
    const string input = @"thisIsValid.txt";

    string actual = PathHelper.CoerceValidFileName(input);

    Assert.AreEqual(input, actual);
}

[Test]
public void CoerceValidFileName_SimpleInvalid()
{
    // The expected value has one underscore in place of each run of backslashes.
    const string input = @"thisIsNotValid\3\\_3.txt";
    const string expected = "thisIsNotValid_3__3.txt";

    string actual = PathHelper.CoerceValidFileName(input);

    Assert.AreEqual(expected, actual);
}

[Test]
public void CoerceValidFileName_InvalidExtension()
{
    // An invalid character inside the extension is replaced like any other.
    const string input = @"thisIsNotValid.t\xt";
    const string expected = "thisIsNotValid.t_xt";

    string actual = PathHelper.CoerceValidFileName(input);

    Assert.AreEqual(expected, actual);
}

[Test]
public void CoerceValidFileName_KeywordInvalid()
{
    // A reserved word as the base name is replaced regardless of letter case.
    const string input = "aUx.txt";
    const string expected = "_reservedWord_.txt";

    string actual = PathHelper.CoerceValidFileName(input);

    Assert.AreEqual(expected, actual);
}

[Test]
public void CoerceValidFileName_KeywordValid()
{
    // A name that merely starts with a reserved word must be left alone.
    const string input = "auxillary.txt";
    const string expected = "auxillary.txt";

    string actual = PathHelper.CoerceValidFileName(input);

    Assert.AreEqual(expected, actual);
}
Bryan Watts
  • 44,911
  • 16
  • 83
  • 88
fiat
  • 15,501
  • 9
  • 81
  • 103
  • 2
    Minor suggestion since it looks like the method was going this direction: Add a this keyword and it becomes a handy extension method. public static String CoerceValidFileName(this String filename) – Ryan McArthur Feb 22 '19 at 16:49
  • 4
    Small bug: this method doesn't change reserved words without file extensions (eg. `COM1`), which are also disallowed. Suggested fix would be to change the reservedWordPattern to `"^{0}(\\.|$)"` and the replacement string to `"_reservedWord_$1"` – Dehalion Mar 07 '19 at 14:54
  • 1
    This fails for `Clock$.` as `reservedWordPattern` needs to be passed through `Regex.Escape()` – Alex K. Mar 20 '21 at 15:21
39
// Split at every invalid file-name character and glue the pieces back together with
// nothing in between, which removes those characters.
string clean = string.Join(string.Empty, dirty.Split(Path.GetInvalidFileNameChars()));
data
  • 2,563
  • 1
  • 21
  • 25
9

There are a lot of working solutions here. Just for the sake of completeness, here's an approach that doesn't use regex, but uses LINQ:

// Fold over the invalid file-name characters, swapping each one for '_' in turn.
filename = Path.GetInvalidFileNameChars()
    .Aggregate(filename, (sanitized, badChar) => sanitized.Replace(badChar, '_'));

Also, it's a very short solution ;)

kappadoky
  • 321
  • 2
  • 12
6

I wanted to retain the characters in some way, not just simply replace the character with an underscore.

One way I thought was to replace the characters with similar looking characters which are (in my situation), unlikely to be used as regular characters. So I took the list of invalid characters and found look-a-likes.

The following are functions to encode and decode with the look-a-likes.

This code does not include a complete listing for all System.IO.Path.GetInvalidFileNameChars() characters. So it is up to you to extend or utilize the underscore replacement for any remaining characters.

/// <summary>
/// Builds the table that maps each character Windows rejects in file and folder
/// names ( \ / : * ? " &lt; &gt; | ) to a look-alike replacement character.
/// </summary>
/// <returns>A new dictionary keyed by the invalid character, with the look-alike as value.</returns>
private static Dictionary<string, string> EncodeMapping()
{
    return new Dictionary<string, string>
    {
        { @"\", "Ì" },   // U+00CC
        { "/", "Í" },    // U+00CD
        { ":", "¦" },    // U+00A6
        { "*", "¤" },    // U+00A4
        { "?", "¿" },    // U+00BF
        { @"""", "ˮ" },  // U+02EE
        { "<", "«" },    // U+00AB
        { ">", "»" },    // U+00BB
        { "|", "│" },    // U+2502
    };
}

/// <summary>
/// Encodes <paramref name="name"/> by replacing every key of <c>EncodeMapping()</c> with its
/// look-alike value, and a trailing period with "°".
/// </summary>
/// <param name="name">The name to encode.</param>
/// <returns>The encoded name.</returns>
public static string Escape(string name)
{
    foreach (KeyValuePair<string, string> replace in EncodeMapping())
    {
        name = name.Replace(replace.Key, replace.Value);
    }

    //-- handle dot at the end
    // Compare the last char directly: EndsWith(string) is culture-sensitive. Substring replaces
    // the former CropRight(1) call, which is not a framework method.
    // NOTE(review): assumes CropRight(1) dropped the last character - confirm.
    if (name.Length > 0 && name[name.Length - 1] == '.')
    {
        name = name.Substring(0, name.Length - 1) + "°";
    }

    return name;
}

/// <summary>
/// Reverses <c>Escape</c>: replaces every look-alike value of <c>EncodeMapping()</c> with its
/// original key, and a trailing "°" with a period.
/// </summary>
/// <param name="name">The encoded name.</param>
/// <returns>The decoded name.</returns>
public static string UnEscape(string name)
{
    foreach (KeyValuePair<string, string> replace in EncodeMapping())
    {
        name = name.Replace(replace.Value, replace.Key);
    }

    //-- handle dot at the end
    // Compare the last char directly: EndsWith(string) is culture-sensitive. Substring replaces
    // the former CropRight(1) call, which is not a framework method.
    // NOTE(review): assumes CropRight(1) dropped the last character - confirm.
    if (name.Length > 0 && name[name.Length - 1] == '°')
    {
        name = name.Substring(0, name.Length - 1) + ".";
    }

    return name;
}

You can select your own look-a-likes. I used the Character Map app in windows to select mine %windir%\system32\charmap.exe

As I make adjustments through discovery, I will update this code.

Valamas
  • 24,169
  • 25
  • 107
  • 177
  • 1
    note that there are many characters that look more similar to those, like the [fullwidth form](https://en.wikipedia.org/wiki/Halfwidth_and_fullwidth_forms) `!"#$%&'()*+,-./:;<=>?@{|}~` or other forms of them like `/` SOLIDUS and ` ⁄ ` FRACTION SLASH that can be used directly in filenames without problem – phuclv Jan 04 '19 at 02:40
  • 1
    Glad to see an answer that addresses the risk of having duplicates files when the same pattern around differents invalid chars is used. I adapted this solution by encoding the file name with ASCII markers (0x000). – Larry Feb 08 '21 at 14:04
  • What is CropRight? – Valerio Gentile Mar 28 '23 at 02:09
5

I'm using the System.IO.Path.GetInvalidFileNameChars() method to check invalid characters and I've got no problems.

I'm using the following code:

// Swap every character the platform rejects in file names for an underscore.
char[] forbidden = System.IO.Path.GetInvalidFileNameChars();
for (int i = 0; i < forbidden.Length; i++)
{
    filename = filename.Replace(forbidden[i], '_');
}
Bridge
  • 29,818
  • 9
  • 60
  • 82
André Leal
  • 186
  • 1
  • 8
2

I have had success with this in the past.

Nice, short and static :-)

    /// <summary>
    /// Removes every invalid file-name character and every invalid path character from <paramref name="s"/>.
    /// </summary>
    /// <param name="s">The string to clean.</param>
    /// <returns>The string with all such characters deleted.</returns>
    public static string returnSafeString(string s)
    {
        // File-name characters are processed first, then path characters.
        char[][] forbiddenSets = { Path.GetInvalidFileNameChars(), Path.GetInvalidPathChars() };

        string cleaned = s;
        for (int set = 0; set < forbiddenSets.Length; set++)
        {
            char[] forbidden = forbiddenSets[set];
            for (int i = 0; i < forbidden.Length; i++)
            {
                cleaned = cleaned.Replace(forbidden[i].ToString(), string.Empty);
            }
        }

        return cleaned;
    }
Helix 88
  • 701
  • 6
  • 19
2

I think the problem is that you first call Path.GetDirectoryName on the bad string. If this has non-filename characters in it, .Net can't tell which parts of the string are directories and throws. You have to do string comparisons.

Assuming it's only the filename that is bad, not the entire path, try this:

/// <summary>
/// Replaces invalid file-name characters in the last segment of <paramref name="path"/> with
/// <paramref name="replaceChar"/>. Everything up to and including the last
/// <see cref="Path.DirectorySeparatorChar"/> is copied through unchanged.
/// </summary>
/// <param name="path">The path whose file-name part should be sanitized.</param>
/// <param name="replaceChar">The character substituted for every invalid character.</param>
/// <returns>The path with a sanitized file name.</returns>
public static string SanitizePath(string path, char replaceChar)
{
    // LastIndexOf returns -1 when there is no separator, so filenamePos is 0 and the whole string is the file name.
    int filenamePos = path.LastIndexOf(Path.DirectorySeparatorChar) + 1;

    // GetInvalidFileNameChars returns a fresh array on every call; fetch it once, not once per character.
    char[] invalidChars = Path.GetInvalidFileNameChars();

    var sb = new System.Text.StringBuilder(path.Length);
    sb.Append(path, 0, filenamePos);
    for (int i = filenamePos; i < path.Length; i++)
    {
        char filenameChar = path[i];
        bool isInvalid = System.Array.IndexOf(invalidChars, filenameChar) >= 0;
        sb.Append(isInvalid ? replaceChar : filenameChar);
    }

    return sb.ToString();
}
Dour High Arch
  • 21,513
  • 29
  • 75
  • 90
1

Here's an efficient lazy loading extension method based on Andre's code:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace LT
{
    /// <summary>
    /// String helpers for producing valid file names.
    /// </summary>
    public static class Utility
    {
        // Regex pattern built on the first call to MakeValidFileName and reused afterwards.
        static string invalidRegStr;

        /// <summary>
        /// Replaces each run of invalid file-name characters in <paramref name="name"/>, and any
        /// trailing run of periods (with invalid characters directly before it), with a single underscore.
        /// </summary>
        /// <param name="name">The candidate file name.</param>
        /// <returns>The cleaned-up file name.</returns>
        public static string MakeValidFileName(this string name)
        {
            string pattern = invalidRegStr;
            if (pattern == null)
            {
                char[] forbidden = System.IO.Path.GetInvalidFileNameChars();
                string escaped = System.Text.RegularExpressions.Regex.Escape(new string(forbidden));
                pattern = string.Format(@"([{0}]*\.+$)|([{0}]+)", escaped);
                invalidRegStr = pattern;
            }

            return System.Text.RegularExpressions.Regex.Replace(name, pattern, "_");
        }
    }
}
Bryan Legend
  • 6,790
  • 1
  • 59
  • 60
0

Your code would be cleaner if you appended the directory and filename together and sanitized that rather than sanitizing them independently. As for sanitizing away the :, just take the 2nd character in the string. If it is equal to "replacechar", replace it with a colon. Since this app is for your own use, such a solution should be perfectly sufficient.

Brian
  • 25,523
  • 18
  • 82
  • 173
0

For .NET7+ projects it's also possible to use extensions methods with generated regexes like this:

/// <summary>
/// File-name escaping helpers built on a source-generated regex (.NET 7+).
/// </summary>
// The class must be partial: [GeneratedRegex] emits the other half of the partial method into it.
public static partial class IOExtensions {
    // The lookarounds make a reserved name match only as a stand-alone token, so names that
    // merely contain one (e.g. "Falcon.txt", "auxiliary.txt") are left untouched.
    [GeneratedRegex("(?<![A-Za-z0-9])(?:CON|PRN|AUX|CLOCK\\$|NUL|COM0|COM1|COM2|COM3|COM4|COM5|COM6|COM7|COM8|COM9|LPT0|LPT1|LPT2|LPT3|LPT4|LPT5|LPT6|LPT7|LPT8|LPT9)(?![A-Za-z0-9])", RegexOptions.IgnoreCase)]
    private static partial Regex GetReservedFilenamesRegex();

    /// <summary>
    /// Replaces runs of invalid file-name characters and stand-alone reserved device names
    /// in <paramref name="name"/> with <paramref name="replacer"/>.
    /// </summary>
    /// <param name="name">The candidate file name.</param>
    /// <param name="replacer">The text substituted for each invalid run or reserved name.</param>
    /// <returns>The escaped file name.</returns>
    public static string ToEscapedFilename(this string name, string replacer = "_") {
        // Splitting with RemoveEmptyEntries collapses adjacent invalid characters and drops
        // leading/trailing ones; the surviving fragments are re-joined with the replacer.
        var withoutInvalidChars = string.Join(
            replacer,
            name.Split(
                Path.GetInvalidFileNameChars(),
                StringSplitOptions.RemoveEmptyEntries
            )
        );

        return GetReservedFilenamesRegex().Replace(withoutInvalidChars, replacer);
    }
}

For example,

"Order * for AUX at 12/03/2023.csv".ToEscapedFilename()

Will return

Order _ for _ at 12_03_2023.csv

-1

Based on @fiat's and @Andre's approaches, I'd like to share my solution too. Main differences:

  • it's an extension method
  • regex is compiled at first use to save some time over a lot of executions
  • reserved words are preserved
/// <summary>
/// Extension methods for turning arbitrary strings into valid file names.
/// </summary>
public static class StringPathExtensions
{
    // Compiled once in the static constructor and reused by every call.
    private static readonly Regex _invalidPathPartsRegex;

    static StringPathExtensions()
    {
        var invalidReg = System.Text.RegularExpressions.Regex.Escape(new string(Path.GetInvalidFileNameChars()));

        // "reserved": a reserved device name forming the whole base name, i.e. followed by '.' or
        //             the end of the string, so "CONTENT.txt" is not mistaken for "CON".
        // "invalid":  a run of invalid characters (plus ':'), or a single trailing period.
        // IgnoreCase is needed so that lower-case names such as "con.txt" are caught as well.
        _invalidPathPartsRegex = new Regex(
            $"(?<reserved>^(?:CON|PRN|AUX|CLOCK\\$|NUL|COM0|COM1|COM2|COM3|COM4|COM5|COM6|COM7|COM8|COM9|LPT0|LPT1|LPT2|LPT3|LPT4|LPT5|LPT6|LPT7|LPT8|LPT9)(?=\\.|$))|(?<invalid>[{invalidReg}:]+|\\.$)",
            RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);
    }

    /// <summary>
    /// Prefixes a reserved base name with "_" (keeping the word itself) and replaces each run
    /// of invalid characters, or a trailing period, with "_".
    /// </summary>
    /// <param name="path">The candidate file name.</param>
    /// <returns>The sanitized file name.</returns>
    public static string SanitizeFileName(this string path)
    {
        return _invalidPathPartsRegex.Replace(path, m =>
        {
            Group reserved = m.Groups["reserved"];
            return reserved.Success ? string.Concat("_", reserved.Value) : "_";
        });
    }
}
greg-e
  • 374
  • 4
  • 18
-1
using System;
using System.IO;
using System.Linq;
using System.Text;

/// <summary>
/// Console demo: prints a string full of invalid file-name characters, then the same
/// string with those characters replaced by '.', then with them removed.
/// </summary>
public class Program
{
    public static void Main()
    {
        const string badString = "ABC\\DEF/GHI<JKL>MNO:PQR\"STU\tVWX|YZA*BCD?EFG";

        try
        {
            Console.WriteLine(badString);

            string replaced = SanitizeFileName(badString, '.');
            Console.WriteLine(replaced);

            string stripped = SanitizeFileName(badString);
            Console.WriteLine(stripped);
        }
        catch (Exception ex)
        {
            // Report any failure on the console instead of letting it escape.
            Console.WriteLine(ex.ToString());
        }
    }

    /// <summary>
    /// Replaces every invalid file-name character in <paramref name="fileName"/> with
    /// <paramref name="replacement"/>, or removes it when no replacement is given.
    /// </summary>
    /// <param name="fileName">The candidate file name; null and empty are returned as-is.</param>
    /// <param name="replacement">Optional substitute character; null means "delete".</param>
    /// <returns>The sanitized file name.</returns>
    private static string SanitizeFileName(string fileName, char? replacement = null)
    {
        if (string.IsNullOrEmpty(fileName))
        {
            return fileName;
        }

        char[] forbidden = Path.GetInvalidFileNameChars();
        var result = new StringBuilder();

        for (int i = 0; i < fileName.Length; i++)
        {
            char current = fileName[i];
            if (Array.IndexOf(forbidden, current) < 0)
            {
                result.Append(current);
            }
            else if (replacement.HasValue)
            {
                result.Append(replacement.Value);
            }
        }

        return result.ToString();
    }
}
Ralf
  • 1