I have a Python script that runs over 1M lines of varying lengths. The script is very slow: it has processed only 30,000 of them in the last 12 hours. Splitting the file is out of the question, since the input is already split. My code looks like this:
import re
import sys

regex1 = re.compile(r"(\{\{.*?\}\})", flags=re.IGNORECASE)            # {{templates}}
regex2 = re.compile(r"(<ref.*?</ref>)", flags=re.IGNORECASE)          # <ref>...</ref> pairs
regex3 = re.compile(r"(<ref.*?\/>)", flags=re.IGNORECASE)            # self-closing <ref ... />
regex4 = re.compile(r"(==External links==.*?)", flags=re.IGNORECASE)  # "External links" heading
regex5 = re.compile(r"(<!--.*?-->)", flags=re.IGNORECASE)             # HTML comments
regex6 = re.compile(r"(File:[^ ]*? )", flags=re.IGNORECASE)           # bare File: tokens
regex7 = re.compile(r" [0-9]+ ", flags=re.IGNORECASE)                 # standalone numbers (unused below)
regex8 = re.compile(r"(\[\[File:.*?\]\])", flags=re.IGNORECASE)       # [[File:...]] links
regex9 = re.compile(r"(\[\[.*?\.JPG.*?\]\])", flags=re.IGNORECASE)    # [[...JPG...]] links
regex10 = re.compile(r"(\[\[Image:.*?\]\])", flags=re.IGNORECASE)     # [[Image:...]] links
regex11 = re.compile(r"^[^_].*(\) )", flags=re.IGNORECASE)            # trailing ") " (unused below)
fout = open(sys.argv[2], 'a+')
with open(sys.argv[1]) as f:
    for line in f:
        parts = line.split("\t")
        label = parts[0].replace(" ", "_").lower()
        line = parts[1].lower()
        try:
            line = regex1.sub("", line)
        except:
            pass
        try:
            line = regex2.sub("", line)
        except:
            pass
        try:
            line = regex3.sub("", line)
        except:
            pass
        try:
            line = regex4.sub("", line)
        except:
            pass
        try:
            line = regex5.sub("", line)
        except:
            pass
        try:
            line = regex6.sub("", line)
        except:
            pass
        try:
            line = regex8.sub("", line)
        except:
            pass
        try:
            line = regex9.sub("", line)
        except:
            pass
        try:
            line = regex10.sub("", line)
        except:
            pass
        try:
            # Turn [[target|anchor]] links into plain underscore-joined tokens.
            for match in re.finditer(r"(\[\[.*?\]\])", line):
                replacement_list = match.group(0).replace("[", "").replace("]", "").split("|")
                replacement_list = [w.replace(" ", "_") for w in replacement_list]
                replacement_for_links = ' '.join(replacement_list)
                line = line.replace(match.group(0), replacement_for_links)
        except:
            pass
        try:
            # Strip URLs.
            line = re.sub(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))', '', line, flags=re.MULTILINE)
        except:
            pass
        try:
            # Strip punctuation (Python 2 str.translate signature).
            line = line.translate(None, '!"#$%&\'*+,./:;<=>?@[\\]^`{|}~')
        except:
            pass
        try:
            line = line.replace(' (', ' ')
            line = ' '.join([word.rstrip(")") if not '(' in word else word for word in line.split(" ")])
            line = re.sub(' isbn [\w-]+ ', ' ', line)
            line = re.sub(' [p]+ [\w-]+ ', ' ', line)
            line = re.sub(' \d+ ', ' ', line)
            line = re.sub("^\d+\s|\s\d+\s|\s\d+$", " ", line)
            line = re.sub('\s+', ' ', line).strip()
            line = re.sub(' isbn [\w-]+ ', ' ', line)
        except:
            pass
        out_string = label + "\t" + line
        fout.write(out_string)
        fout.write("\n")
fout.close()
Is there any change I can make to gain a significant improvement over the current version?
UPDATE 1: After profiling with the approach @fearless_fool suggested, I found that regex3, regex9, and the URL removal are the least efficient steps.
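In case it helps to reproduce the numbers, this is roughly the kind of per-step timing harness I used (a minimal sketch, not @fearless_fool's exact suggestion; timed_sub and the bucket names are mine, not part of the original script):

import time
from collections import defaultdict

timings = defaultdict(float)  # running total of seconds spent per cleanup step

def timed_sub(name, regex, line):
    # Wrap one substitution and charge its cost to that step's bucket.
    start = time.time()
    result = regex.sub("", line)
    timings[name] += time.time() - start
    return result

# In the loop, each step becomes e.g.  line = timed_sub("regex3", regex3, line)
# and after the whole file is processed:
for name, seconds in sorted(timings.items()):
    print("%s: %.2fs" % (name, seconds))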
UPDATE 2: It's interesting to find that .* adds a lot of steps to these regex patterns. I tried replacing it with [^X]*, where X is a character I know never occurs in the string, and that improves matters about 20x on 1000-character lines. For example, regex1 is now regex1 = re.compile(r"(\{\{[^\}]*?\}\})", flags=re.IGNORECASE). .... What I don't know is how to do this negative matching with a two-character sequence. For example, I would like to change (\{\{[^\}]*?\}\}) to (\{\{[^\}\}]*?\}\}), which I now know is wrong, since everything inside [] is treated as a set of individual characters, not as a sequence.
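To make the two-character case concrete: what I want is "any character, as long as it does not start the sequence }}". My untested guess is a negative lookahead along these lines (a sketch only; I don't know whether it keeps the 20x speedup, since the lookahead runs at every character):

# Hypothetical variant of regex1: "." tempered by a lookahead so it
# never consumes the start of a literal "}}".
regex1 = re.compile(r"(\{\{(?:(?!\}\}).)*?\}\})", flags=re.IGNORECASE)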