I made this function that removes all SQL comments, using plain regular expressons. It removes both line comments (even when there is not a linebreak after) and block comments (even if there are nested block comments). This function can also replace literals (useful if you are searching for something inside SQL procedures but you want to ignore strings).
My code was based on this answer (which is about C# comments), so I had to change line comments from "//" to "--", but more importantly I had to rewrite the block comments regex (using balancing groups) because SQL allows nested block comments, while C# doesn't.
Also, I have this "preservePositions" argument, which instead of stripping out the comments it just fills comments with whitespace. That's useful if you want to preserve the original position of each SQL command, in case you need to manipulate the original script while preserving original comments.
Regex everythingExceptNewLines = new Regex("[^\r\n]");
public string RemoveComments(string input, bool preservePositions, bool removeLiterals=false)
{
//based on https://stackoverflow.com/questions/3524317/regex-to-strip-line-comments-from-c-sharp/3524689#3524689
var lineComments = @"--(.*?)\r?\n";
var lineCommentsOnLastLine = @"--(.*?)$"; // because it's possible that there's no \r\n after the last line comment
// literals ('literals'), bracketedIdentifiers ([object]) and quotedIdentifiers ("object"), they follow the same structure:
// there's the start character, any consecutive pairs of closing characters are considered part of the literal/identifier, and then comes the closing character
var literals = @"('(('')|[^'])*')"; // 'John', 'O''malley''s', etc
var bracketedIdentifiers = @"\[((\]\])|[^\]])* \]"; // [object], [ % object]] ], etc
var quotedIdentifiers = @"(\""((\""\"")|[^""])*\"")"; // "object", "object[]", etc - when QUOTED_IDENTIFIER is set to ON, they are identifiers, else they are literals
//var blockComments = @"/\*(.*?)\*/"; //the original code was for C#, but Microsoft SQL allows a nested block comments // //https://msdn.microsoft.com/en-us/library/ms178623.aspx
//so we should use balancing groups // http://weblogs.asp.net/whaggard/377025
var nestedBlockComments = @"/\*
(?>
/\* (?<LEVEL>) # On opening push level
|
\*/ (?<-LEVEL>) # On closing pop level
|
(?! /\* | \*/ ) . # Match any char unless the opening and closing strings
)+ # /* or */ in the lookahead string
(?(LEVEL)(?!)) # If level exists then fail
\*/";
string noComments = Regex.Replace(input,
nestedBlockComments + "|" + lineComments + "|" + lineCommentsOnLastLine + "|" + literals + "|" + bracketedIdentifiers + "|" + quotedIdentifiers,
me => {
if (me.Value.StartsWith("/*") && preservePositions)
return everythingExceptNewLines.Replace(me.Value, " "); // preserve positions and keep line-breaks // return new string(' ', me.Value.Length);
else if (me.Value.StartsWith("/*") && !preservePositions)
return "";
else if (me.Value.StartsWith("--") && preservePositions)
return everythingExceptNewLines.Replace(me.Value, " "); // preserve positions and keep line-breaks
else if (me.Value.StartsWith("--") && !preservePositions)
return everythingExceptNewLines.Replace(me.Value, ""); // preserve only line-breaks // Environment.NewLine;
else if (me.Value.StartsWith("[") || me.Value.StartsWith("\""))
return me.Value; // do not remove object identifiers ever
else if (!removeLiterals) // Keep the literal strings
return me.Value;
else if (removeLiterals && preservePositions) // remove literals, but preserving positions and line-breaks
{
var literalWithLineBreaks = everythingExceptNewLines.Replace(me.Value, " ");
return "'" + literalWithLineBreaks.Substring(1, literalWithLineBreaks.Length - 2) + "'";
}
else if (removeLiterals && !preservePositions) // wrap completely all literals
return "''";
else
throw new NotImplementedException();
},
RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace);
return noComments;
}
Test 1 (first original, then removing comments, last removing comments/literals)
[select /* block comment */ top 1 'a' /* block comment /* nested block comment */*/ from sys.tables --LineComment
union
select top 1 '/* literal with */-- lots of comments symbols' from sys.tables --FinalLineComment]
[select top 1 'a' from sys.tables
union
select top 1 '/* literal with */-- lots of comments symbols' from sys.tables ]
[select top 1 ' ' from sys.tables
union
select top 1 ' ' from sys.tables ]
Test 2 (first original, then removing comments, last removing comments/literals)
Original:
[create table [/*] /*
-- huh? */
(
"--
--" integer identity, -- /*
[*/] varchar(20) /* -- */
default '*/ /* -- */' /* /* /* */ */ */
);
go]
[create table [/*]
(
"--
--" integer identity,
[*/] varchar(20)
default '*/ /* -- */'
);
go]
[create table [/*]
(
"--
--" integer identity,
[*/] varchar(20)
default ' '
);
go]