Here's my solution for all code points denoted as U+0000 through U+10FFFF ("U+" followed by the code point value in hexadecimal, zero-padded to a minimum of four digits):
import re
def UniToChar(unicode_notation):
    """Return the character named by the first ``<U+XXXX>`` token in the text.

    A token is ``<U+`` followed by four or five hexadecimal digits and a
    closing ``>``; the digits are the code point value.

    Args:
        unicode_notation: Text containing at least one ``<U+XXXX>`` token.

    Returns:
        The one-character string for that code point.

    Raises:
        ValueError: If the text contains no well-formed token.
    """
    # Only 0-9 and A-F are hexadecimal digits; admitting g or h would let
    # through tokens that int(..., 16) cannot parse.
    match = re.search(r'<U\+([a-fA-F0-9]{4,5})>', unicode_notation)
    if match is None:
        raise ValueError(
            'no <U+XXXX> token found in {!r}'.format(unicode_notation))
    # Five hex digits top out at 0xFFFFF, which chr() always accepts.
    return chr(int(match.group(1), 16))
# Sample text mixing plain characters with <U+XXXX> escape tokens.
xx = '''
At Donald<U+2019>s <U+2016>Elect<U+2016> in <U+2017>2019<U+2017>
<U+00C0> la Donald<U+2019>s friend <U+1F986>. <U+1F929><U+1F92A><U+1F601>
'''

# Hexadecimal digits are 0-9 and A-F only, so g/h must not be accepted here.
token_pattern = re.compile(r'<U\+[a-fA-F0-9]{4,5}>')

for x in xx.split('\n'):
    # One left-to-right pass decodes every token exactly once; text produced
    # by a replacement is never scanned again.
    x = token_pattern.sub(lambda m: UniToChar(m.group(0)), x)
    # repr() keeps non-printable characters visible as escapes; slicing drops
    # only the two enclosing quotes, never quotes that belong to the text.
    print(repr(x)[1:-1])
Output: 71307293.py
At Donald’s ‖Elect‖ in ‗2019‗
À la Donald’s friend 🦆. 🤩🤪😁
In fact, the private-use range from U+100000 to U+10FFFD (Plane 16) isn't detected by the simplified regex above, because it accepts at most five hexadecimal digits. Improved code follows:
import re
def UniToChar(unicode_notation):
    """Return the character named by the first ``<U+XXXX>`` token in the text.

    A token is ``<U+`` followed by four to six hexadecimal digits and a
    closing ``>``; the digits are the code point value.

    Args:
        unicode_notation: Text containing at least one ``<U+XXXX>`` token.

    Returns:
        The one-character string for that code point, or U+FFFD
        (Replacement Character) when the value is greater than 0x10FFFD.

    Raises:
        ValueError: If the text contains no well-formed token.
    """
    # Only 0-9 and A-F are hexadecimal digits; admitting g or h would let
    # through tokens that int(..., 16) cannot parse.
    match = re.search(r'<U\+([a-fA-F0-9]{4,6})>', unicode_notation)
    if match is None:
        raise ValueError(
            'no <U+XXXX> token found in {!r}'.format(unicode_notation))
    code_point = int(match.group(1), 16)
    # Six hex digits reach 0xFFFFFF; circumvent the
    # "ValueError: chr() arg not in range(0x110000)" by substituting instead.
    if code_point <= 0x10FFFD:
        return chr(code_point)
    return chr(0xFFFD)  # Replacement Character
# Sample text mixing plain characters with <U+XXXX> escape tokens, including
# an unassigned code point and a value beyond the Unicode range.
xx = '''
At Donald<U+2019>s <U+2016>Elect<U+2016> in <U+2017>2019<U+2017>
<U+00C0> la Donald<U+2019>s friend <U+1F986>. <U+1F929><U+1F92A><U+1F601>
Unassigned: <U+05ff>; out of Unicode range: <U+110000>.
'''

# Hexadecimal digits are 0-9 and A-F only, so g/h must not be accepted here.
token_pattern = re.compile(r'<U\+[a-fA-F0-9]{4,6}>')

for x in xx.split('\n'):
    # One left-to-right pass decodes every token exactly once; text produced
    # by a replacement is never scanned again.
    x = token_pattern.sub(lambda m: UniToChar(m.group(0)), x)
    # repr() keeps non-printable characters visible as escapes; slicing drops
    # only the two enclosing quotes, never quotes that belong to the text.
    print(repr(x)[1:-1])
Output: 71307293.py
At Donald’s ‖Elect‖ in ‗2019‗
À la Donald’s friend 🦆. 🤩🤪😁
Unassigned: \u05ff; out of Unicode range: �.