I have files in Biblical Hebrew that use older Unicode code point sequences, and I want to upgrade them to the newer dedicated (precomposed) Unicode code points. I'm working in Python.
For a specific example, in Unicode the following concatenated code points -- '\u05d1' + '\u05bc' -- give bet with dagesh בּ. However, there is a newer single code point -- '\ufb31' -- which specifically gives bet with dagesh, only with better placement of the dagesh.
I'm trying to loop through a file and, every time I find a deprecated version of a Hebrew character, replace it with the updated character. I tried using a dictionary, but this does not work with str.replace(). The following is the code:
## Maps each two-code-point sequence (base letter + combining mark) to the
## single precomposed code point that should replace it.  All targets lie in
## the U+FB1D-U+FB4F range.
##
## Entry order matters when the entries are applied one after another in
## dictionary order: "shin with dagesh" must be produced BEFORE the two rules
## that start from '\ufb49', otherwise shin + dagesh + shin/sin dot would stop
## at '\ufb49' + dot and never reach '\ufb2c' / '\ufb2d'.
hebrew_unicodes_dict = {
    '\u05d9' + '\u05b4': '\ufb1d',  ## yod with hiriq
    '\u05e9' + '\u05bc': '\ufb49',  ## shin with dagesh (feeds the '\ufb49' rules below)
    '\u05e9' + '\u05c1': '\ufb2a',  ## shin with shin dot
    '\u05e9' + '\u05c2': '\ufb2b',  ## shin with sin dot
    '\ufb49' + '\u05c1': '\ufb2c',  ## shin with dagesh and shin dot
    '\ufb49' + '\u05c2': '\ufb2d',  ## shin with dagesh and sin dot
    '\u05d0' + '\u05b7': '\ufb2e',  ## alef with patach
    '\u05d0' + '\u05b8': '\ufb2f',  ## alef with qamats
    '\u05d0' + '\u05bc': '\ufb30',  ## alef with mappiq
    '\u05d1' + '\u05bc': '\ufb31',  ## bet with dagesh
    '\u05d2' + '\u05bc': '\ufb32',  ## gimel with dagesh
    '\u05d3' + '\u05bc': '\ufb33',  ## dalet with dagesh
    '\u05d4' + '\u05bc': '\ufb34',  ## he with mappiq
    '\u05d5' + '\u05bc': '\ufb35',  ## vav with dagesh (shuruq)
    '\u05d6' + '\u05bc': '\ufb36',  ## zayin with dagesh
    '\u05d8' + '\u05bc': '\ufb38',  ## tet with dagesh
    '\u05d9' + '\u05bc': '\ufb39',  ## yod with dagesh
    '\u05da' + '\u05bc': '\ufb3a',  ## final kaf with dagesh
    '\u05db' + '\u05bc': '\ufb3b',  ## kaf with dagesh
    '\u05dc' + '\u05bc': '\ufb3c',  ## lamed with dagesh
    '\u05de' + '\u05bc': '\ufb3e',  ## mem with dagesh
    '\u05e0' + '\u05bc': '\ufb40',  ## nun with dagesh
    '\u05e1' + '\u05bc': '\ufb41',  ## samekh with dagesh
    '\u05e3' + '\u05bc': '\ufb43',  ## final pe with dagesh
    '\u05e4' + '\u05bc': '\ufb44',  ## pe with dagesh (was missing from the table)
    '\u05e6' + '\u05bc': '\ufb46',  ## tsadi with dagesh
    '\u05e7' + '\u05bc': '\ufb47',  ## qof with dagesh
    '\u05e8' + '\u05bc': '\ufb48',  ## resh with dagesh
    '\u05ea' + '\u05bc': '\ufb4a',  ## tav with dagesh
    '\u05d5' + '\u05b9': '\ufb4b',  ## holam vav
    '\u05d1' + '\u05bf': '\ufb4c',  ## vet with rafe
    '\u05db' + '\u05bf': '\ufb4d',  ## khaf with rafe
    '\u05e4' + '\u05bf': '\ufb4e',  ## fe with rafe
    ## NOTE(review): this fires on ANY alef directly followed by lamed, not
    ## only where a ligature is wanted -- confirm that is intended.
    '\u05d0' + '\u05dc': '\ufb4f',  ## alef lamed ligature
}

## Sanity check: report how many replacement rules are defined.
print('length of Hebrew_unicodes_dict is ' + str(len(hebrew_unicodes_dict)))
### This script replaces deprecated unicodes for Hebrew letters with the more updated unicodes.


def _upgrade_line(line, replacements):
    """Return *line* with every key of *replacements* replaced by its value.

    ``str.replace`` returns a new string (strings are immutable), so the
    result has to be re-assigned on every step.  The whole table is applied
    repeatedly until a pass changes nothing, so a rule whose key contains the
    output of another rule (e.g. '\\ufb49' + shin dot) still fires no matter
    where it sits in the table.  Each rule in this file's table replaces two
    code points with one, so every effective pass shortens the string and the
    loop terminates.
    """
    while True:
        upgraded = line
        for old_value, new_value in replacements.items():
            upgraded = upgraded.replace(old_value, new_value)
        if upgraded == line:
            return upgraded
        line = upgraded


## Open both files in one `with` so neither handle leaks if the other open
## fails.  The output is written as UTF-8 explicitly; relying on the platform
## default encoding can fail or corrupt Hebrew text.
with open('/users/alberthembd/Desktop/Ginsburg_Source/01_Genesis.txt', 'r', encoding='utf-8') as torah_file, \
        open('/Users/alberthembd/Documents/PythonDocuments/01_Genesis_fixed.txt', 'w', encoding='utf-8') as revised_torah_file:
    for line in torah_file:
        ## Keep the untouched line so the debug output can show whether
        ## anything was actually replaced.
        old_line = line
        print('line before' + line)
        new_line = _upgrade_line(line, hebrew_unicodes_dict)
        print('new_line == ' + new_line)
        ## str() is required: without it `+` binds tighter than `==` and the
        ## statement prints the result of comparing the wrong strings.
        print('new line == old_line ' + str(new_line == old_line))
        ## Write each input line exactly once, after all replacements.
        revised_torah_file.write(new_line)