After running some tests, I can say that str.translate()
is the best option.
Input data:
# Every punctuation character that the benchmarked functions strip from a document.
symbols = set("`~!@#$%^&*()_-+={[]}|\\:;\"<,>.?/")
# Table for str.translate(): each symbol's code point maps to None.
translate_table = dict.fromkeys(map(ord, symbols))
import re

# Character class matching every symbol that has to be removed.
# A raw string is required: in a non-raw literal "\\:" reaches the regex engine
# as \: (an escaped colon), so the backslash itself would never be matched.
# Here \\ is a literal backslash, and \- \[ \] are a literal hyphen and brackets.
regular_expression = r'[`~!@#$%^&*()_\-+={\[\]}|\\:;"<,>.?/]'
# Benchmark inputs of increasing length; each is passed unchanged to every function.
# Short text with a handful of symbols mixed in.
small_document = "Some**r@an]]\"dom t##xt"
# Medium-length text with symbols scattered between runs of letters.
normal_document = "TbsX^Kt$FZ%haZe+sLxu:Al\"xNAL\\Kix[mHp_gn]PrG`DqGd~GdNc;BoEq.SYD?Rp>ukq,UfO<XdTc=RUH}oifc&oP!CB*me@Qv{Qf-Li)gmXL/IQH#mne(Khaj|"
# Long text: mostly letters, with occasional symbols.
big_document = "QOfY+dymyoGBAxTAoIeM+jEWlaECUZEUXuMvprJOqFtQR*OiHtTFZkUNbYipSTTDPOVkIdGTcjWrQmbmthKBHBSEOZ)lQAIJOrVgmGGFdtqbuFfj<Dls<JWtKczAFMPYMemiJBJHdPeeul\\x>lGIBvUsxBokagvVovrrdxdKMtAKx>MEexYv>DGqPUXYaBQKwiSIUobrPQYjilhHMQunE;RiqOZPTnyOEgRrpxcuobvvmGkFpTqgMxYYhrmRRnauiqgvCmZ\"UauceaXsgAMSakxewzPrlIrYkVCVZaEGh]qiizYyzbkcHPF@qQsQMfHPDEbEnWtrCFoARUYAloOcctqmL@hegZbfhsHaJOxOxzQhZAVjVDgokosATfhKMT!WYyPWKcKAHKCzQGGJOCglYGZbftsuyntXZUKNqgGlsLJqgN,pUcOoA/tStXFXgpoSErgvw/OUMPWjJwt=bhMAIDayOZXJm=ifYYUuAvSIZjwnBfktNvEvZmvQso%HiNZEVqoDR%nQBtCkhjSfVfDuRSRsvp-sCunjDDUYSEVLICQdisxhEfqkUTkiPlLiUNNwrvO#WTDmweZyMeIbgNXkIsvaJeHYXV(HvRcGNZM(PPRIAyyLWivGiqMVBtwObqLfEEISyyjGNEdUU:ys`dXcVawkIEAjFXky`RUXNTm`LDM}mwTOcmsSo}haJXPnkwOhKLYwve}SWifzKq}grw}fMSQXXWguUQtlWpPZQymR^wBKEyolFlZnzEEmehSNenOqDOHWRit[Npm?R?DIPXAmQYYBbmJofxUzzWBsVCoPI?VmpXhoMxCfXyHEHowXzIJvExThiffLhBTtma_jk_NrbkPCGGypXvOuBqBxDYfC{bwIHoaqnJSKytxwWXBNnKG~PKuQklGblEwH~rJoGpKZmm~tTEFnPLdmzfrqJibMYIykzL$RZLPmsZjB$AAbZwFnByOydEOIfFvTaEQaSjbpeBZuUGY&ZfPQgLihmPYrhZxSwMzLrNF.WjFiDCLyXksdkLeMHVCfrdgCAotElQ|"
# Letters only: none of the symbols occurs, so nothing gets removed.
no_match_document = "XOtasggWqhtSLJpHEGoCmMRepFBlRfAGKTLPcEtKonFVsPgvWgAbvJVeMWILPgLapwAmTgXWVbxOJtUFmMygzIqYPqyAxzwElTFyYcGdtnNa"
Code:
def func1(doc):
    """Remove symbols with one ``str.replace`` call per symbol.

    Args:
        doc: The string to clean.

    Returns:
        ``doc`` with every character found in the module-level ``symbols``
        removed.
    """
    # Each symbol costs one pass over the (progressively shorter) string.
    for c in symbols:
        doc = doc.replace(c, "")
    return doc
def func2(doc):
    """Remove symbols with a single ``str.translate`` call.

    Args:
        doc: The string to clean.

    Returns:
        ``doc`` translated through the module-level ``translate_table``,
        which maps each symbol's code point to ``None``.
    """
    return doc.translate(translate_table)
def func3(doc):
    """Remove symbols with a regular-expression substitution.

    Relies on the ``re`` module and the module-level ``regular_expression``
    pattern being available in the enclosing module.

    Args:
        doc: The string to clean.

    Returns:
        ``doc`` with every match of ``regular_expression`` replaced by the
        empty string.
    """
    return re.sub(regular_expression, "", doc)
def func4(doc):
    """Remove symbols by filtering characters one at a time.

    Args:
        doc: The string to clean.

    Returns:
        A new string made of the characters of ``doc`` that are not in the
        module-level ``symbols``, in their original order.
    """
    return "".join(c for c in doc if c not in symbols)
Test results (total seconds for 100000 calls, as reported by timeit):
func1(small_document): 0.701037002
func1(normal_document): 1.1260866900000002
func1(big_document): 3.4234831459999997
func1(no_match_document): 0.7740780450000004
func2(small_document): 0.14135037500000003
func2(normal_document): 0.5368806810000004
func2(big_document): 0.8128472860000002
func2(no_match_document): 0.394245089
func3(small_document): 0.3157141610000007
func3(normal_document): 0.927359323000001
func3(big_document): 1.9310377590000005
func3(no_match_document): 0.18656399199999996
func4(small_document): 0.3034549070000008
func4(normal_document): 1.3695875739999988
func4(big_document): 10.115730064
func4(no_match_document): 1.2086623230000022
UPD.
The input data I've provided has been specially "prepared" so that only the methods themselves are tested.
To generate translate_table,
I used the following dict comprehension:
# Map the code point of every symbol to None so that str.translate() drops it.
translate_table = {ord(s): None for s in symbols}
Here is a link to a website for regex validation (it may be helpful).
In case you want to rerun the tests yourself, here is the code:
if __name__ == '__main__':
    import timeit

    # Time every function against every document, in the same order as the
    # published results: all documents for func1, then func2, and so on.
    function_names = ("func1", "func2", "func3", "func4")
    document_names = (
        "small_document",
        "normal_document",
        "big_document",
        "no_match_document",
    )

    for function_name in function_names:
        for document_name in document_names:
            statement = "{}({})".format(function_name, document_name)
            # timeit runs the statement in its own namespace, so the names it
            # needs are imported from this script in the setup code.
            setup = "from __main__ import {}, {}".format(function_name, document_name)
            elapsed = timeit.timeit(statement, setup=setup, number=100000)
            # Every label ends with ": " so all 16 output lines share one format.
            print(statement + ": ", elapsed)