Im trying to perform regular match on data that came from excel to python array using openpyxl but the data came as unicode and "None" is allways given by python. The data in Hebrew and i whant to convert the strings from excel to strings that can be matched using regex.. what can be done?
import re
from openpyxl import load_workbook
file_name = 'excel.xlsx'
wb = load_workbook(file_name)
ws = wb[u'beta']
li = []
li2 = []
#readin the cells from excel into an array
for i in range(1,1500):
li2.append(ws["A"+str(i)].value)
for i in li2:
if i != None:
li.append(i)
#deliting the unwanted list for making memory
del li2
r = re.match("א",li[1])
r == None
>>> True
the wanted resault is r.string = "somthing..." and not r == None
Python 2.7.9 (default, Dec 10 2014, 12:24:55) [MSC v.1500 32 bit (Intel)] on win32
Type "copyright", "credits" or "license()" for more information.
>>> ================================ RESTART ================================
>>>
>>> li[1]
u"\u05d0\u05d1\u05d5 \u05d2'\u05d5\u05d5\u05d9\u05d9\u05e2\u05d3 (\u05e9\u05d1\u05d8)"
>>> print li[1]
אבו ג'ווייעד (שבט)
>>> r = re.match(u'א',li[1])
>>> r ==None
True
>>> r = re.match(ur'א',li[1])
>>> r = re.match(u'',li[1])
>>> r.string
u"\u05d0\u05d1\u05d5 \u05d2'\u05d5\u05d5\u05d9\u05d9\u05e2\u05d3 (\u05e9\u05d1\u05d8)"
>>> unicode('א')
Traceback (most recent call last):
File "<pyshell#7>", line 1, in <module>
unicode('א')
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe0 in position 0: ordinal not in range(128)
>>> u'א'
u'\xe0'
>>> u'א'.encode("utf8")
'\xc3\xa0'
>>> u"א"
u'\xe0'
>>>