I want to extract dates from OCR images using the dateparser
lib.
import dateparser
data = []
listOfPages = glob.glob(r"C:/Users/name/folder/test/*.tif")
for entry in listOfPages:
text1 = pytesseract.image_to_string(
Image.open(entry), lang="deu"
)
text = re.sub(r'\n',' ', text1)
date1 = re.compile(r'(Dresden(\.|,|\s+)?)(.*)', flags = re.DOTALL | re.MULTILINE)
date = date1.search(text)
if date:
dates = dateparser.parse(date.group(3), date_formats=['%d %m %Y'], languages=['de'], settings={'STRICT_PARSING': True})
else:
dates = None
if dates == None:
dates = dateparser.parse(date.group(3), date_formats=['%d %B %Y'], locale = 'de', settings={'STRICT_PARSING': True})
else:
dates = None
data.append([text, dates])
df0 = pd.DataFrame(data, columns =['raw_text', 'dates'])
print(df0)
Why am i getting error: NameError: name 'dates' is not defined
update: TypeError: Input type must be str