I have this code to remove commnents in python code. using regex pattern, especially for multilines. I try to test it on some string but not comments, it seems failing on specific cases.
import re, sys
def removeComments(text):
""" remove comments.
text: blob of text with comments (can include newlines)
returns: text with comments removed
"""
pattern = r"""
## --------- COMMENT ---------
#.*?$ ## Start of # .... comment
| ##
''' ## Start of ''' ... ''' comment
[^*]*\*+ ## Non-* followed by 1-or-more *'s
( ##
[^/*][^*]*\*+ ##
)* ## 0-or-more things which don't start with /
## but do end with '*'
/ ## End of /* ... */ comment
| ## -OR- various things which aren't comments:
( ##
## ------ " ... " STRING ------
" ## Start of " ... " string
( ##
\\. ## Escaped char
| ## -OR-
[^"\\] ## Non "\ characters
)* ##
" ## End of " ... " string
| ## -OR-
##
## ------ ' ... ' STRING ------
' ## Start of ' ... ' string
( ##
\\. ## Escaped char
| ## -OR-
[^'\\] ## Non '\ characters
)* ##
' ## End of ' ... ' string
| ## -OR-
##
## ------ ANYTHING ELSE -------
. ## Anything other char
[^/"'\\]* ## Chars which doesn't start a comment, string
) ## or escape
"""
regex = re.compile(pattern, re.VERBOSE|re.MULTILINE|re.DOTALL)
noncomments = [m.group(2) for m in regex.finditer(text) if m.group(2)]
return "".join(noncomments)
def commentRemover(text):
def replacer(match):
s = match.group(0)
if s.startswith('/'):
return " " # note: a space and not an empty string
else:
return s
pattern = re.compile(
r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
re.DOTALL | re.MULTILINE
)
return re.sub(pattern, replacer, text)
filename = 'test.py'
with open(filename) as f:
# uncmtFile = removeComments(f.read())
uncmtFile = commentRemover(f.read())
print uncmtFile
But; it fails to recognize string with comments:
createTableStatements = (
'''create table a1 (
c1 varchar(128) primary key,
c2 varchar(128),
c3 integer
)''',
)
variable5 = {"a": \
"""This text is not a comment
t"""}
Is there a way to catch those special cases in regex ? Or is it better to parse with a state variable ?