I have read the documentation and written hundreds of regular expressions, but I have no idea how to match a sequence of Unicode letters.
# This matches a sequence of English (ASCII) letters.
re.compile(r'[a-zA-Z]+')
# This matches a sequence of Unicode letters plus [0-9_].
re.compile(r'\w+', re.UNICODE)
# How do I match a sequence of Unicode letters only (without [0-9_])?
# ('????' below is a placeholder for the pattern being asked about.)
re.compile(r'????', re.UNICODE)
How can I match only Unicode letters, excluding [0-9_]?
I tested your solutions:
import re
import timeit
def test1():
    """Return every run of Unicode letters found in the sample sentence.

    Lookahead variant: each matched character must be a word character
    that is neither a digit nor an underscore.

    Returns:
        A list of the matched letter runs, in order of appearance.
    """
    # A plain raw string is used instead of a ``ur''`` literal: the ``ur``
    # prefix is a syntax error on Python 3, and the pattern is pure ASCII.
    # The pattern is compiled inside the function on purpose, so the compile
    # call is part of what a timing of this function measures.
    regex = re.compile(r'(?:(?![\d_])\w)+', re.UNICODE)
    return regex.findall(u'Ala ma kota z czarną sierścią - 1halo - halo1.')
def test2():
    """Return every run of Unicode letters found in the sample sentence.

    Negated-class variant: a character matches when it is not a non-word
    character, not a digit and not an underscore.

    Returns:
        A list of the matched letter runs, in order of appearance.
    """
    # A plain raw string is used instead of a ``ur''`` literal: the ``ur``
    # prefix is a syntax error on Python 3, and the pattern is pure ASCII.
    # The pattern is compiled inside the function on purpose, so the compile
    # call is part of what a timing of this function measures.
    regex = re.compile(r'[^\W\d_]+', re.UNICODE)
    return regex.findall(u'Ala ma kota z czarną sierścią - 1halo - halo1.')
# Show the matches produced by each variant, then time each variant.
# print(...) with a single argument is valid on both Python 2 and Python 3
# and prints the same text as the Python 2 print statement did.
print(test1())
print(test2())
print(timeit.timeit(test1))
print(timeit.timeit(test2))
and the results and times are:
[u'Ala', u'ma', u'kota', u'z', u'czarn\u0105', u'sier\u015bci\u0105', u'halo', u'halo']
[u'Ala', u'ma', u'kota', u'z', u'czarn\u0105', u'sier\u015bci\u0105', u'halo', u'halo']
11.0143377108
7.42619199741