Follow WindowsBestFit/readme.txt; in particular, the description of multibyte mapping records in the WCTABLE
section (the WCTABLE
tag marks the start of the Unicode UTF-16 (WideChar) to "MultiByte" bytes…).
The following, partially commented, Python 3 script (sorry, I don't speak VBA):
- reads (previously downloaded)
bestfit936.txt
file line by line,
- parses its
WCTABLE
section and builds an array of Unicode codepoints whose gb2312
codepoint (codepage 936) matches the rules of Simplified Chinese Identifiers and whose Unicode category is a letter ('Ll','Lu','Lo'
) (see variable init_chars_16
),
- sorts codepoints in variable
init_chars_16
and creates corresponding array of characters (variable init_chars_utf16
),
- groups codepoints in variable
init_chars_16
to longest consecutive chains (variable init_chars_groups
), and
- prints a few characters, their codepoints and corresponding consecutive ranges (supply argument
1
to print all characters). There are 15477 applicable codepoints in (unfortunately) 1977 consecutive ranges.
This is done only for CP936-initial-character;
however, the same could be applied for CP936-subsequent-character
as well (supply argument 2
, see also Usage below and output examples).
from itertools import groupby
from operator import itemgetter
import unicodedata
import sys
# Command-line handling:
#   '1' -> full CP936-initial-character listing
#   '2' -> full CP936-subsequent-character addendum
#   anything else (or no argument) -> short demo of the initial-character listing
# sys.argv entries are always strings, so only '1' and '2' have to be matched.
if len(sys.argv) > 1 and sys.argv[1] in ('1', '2'):
    init_chars_test = int(sys.argv[1])
    demo_chars_test = False
else:
    init_chars_test = 1
    demo_chars_test = True
def first_last( some_list ):
    """Format a run of integers as a hexadecimal range label.

    some_list -- non-empty sequence of integers (only the first and the last
                 item are inspected).

    Returns '0xNNNN' for a single item and '0xFIRST..0xLAST' for several
    items; values are zero-padded to at least four hexadecimal digits.
    An empty sequence raises IndexError.
    """
    if len(some_list) > 1:
        return '..'.join(['0x{:04x}'.format(some_list[0]),
                          '0x{:04x}'.format(some_list[-1])])
    return '0x{:04x}'.format(some_list[0])
# Select the Unicode general categories to accept and the CP936 (gb2312)
# double-byte codes to look for, depending on the requested mode.
# Every (first, last) pair below is an inclusive range of CP936 codes.
if init_chars_test == 1:
    unicode_category = ['Ll','Lu','Lo']   # letters
    print( str('DEMO ' if demo_chars_test else '')
           + 'CP936-initial-character:', unicode_category, '\n')
    cp936_ranges = [
        (0xA3C1, 0xA3DA),
        (0xA3E1, 0xA3FA),
        (0xA1A2, 0xA1AA),
        (0xA1AC, 0xA1AD),
        (0xA1B2, 0xA1E6),
        (0xA1E8, 0xA1EF),
        (0xA2B1, 0xA2FC),
        (0xA4A1, 0xFE4F),
    ]
else:
    unicode_category=['Ll','Lu','Lo','Nd','Pc'] # letters or numbers or underscore
    print( 'CP936-subsequent-character addendum:', unicode_category, '\n')
    cp936_ranges = [
        (0xA3DF, 0xA3DF),
        (0xA3B0, 0xA3B9),
    ]
# Expand the inclusive ranges into one sorted list of integer codes.
init_chars_CP936 = sorted(
    code
    for first, last in cp936_ranges
    for code in range(first, last + 1))
# Scan the best-fit file and collect, from its WCTABLE section, the Unicode
# code points whose CP936 code is in init_chars_CP936 and whose Unicode
# general category is listed in unicode_category.
wctable = False        # True while the current line belongs to the WCTABLE section
init_chars_16 = []     # Unicode UTF-16 codepoints (as integers)
init_chars_undef = []  # CP936 codes (as read from the file) rejected by the Unicode filter
i = 0                  # number of the line read last (debugging aid only)
# The membership test runs once per WCTABLE record; a set keeps it O(1)
# instead of scanning the whole list of selected codes every time.
cp936_wanted = set(init_chars_CP936)
# Tags that start any section other than WCTABLE (or end the code page).
other_tags = ('CODEPAGE','CPINFO','MBTABLE','DBCSRANGE','DBCSTABLE','ENDCODEPAGE')
with open(r'D:\Downloads\Unicode\bestfit936.txt',
          mode='r', encoding='gb2312', errors='backslashreplace') as ff:
    for i, rawline in enumerate(ff, 1):
        line = rawline.split('\t')
        # strip() so that a tag standing alone on its line (no tab, trailing
        # newline) is recognised as well
        if line[0].strip().upper() in other_tags:
            wctable = False
        if not wctable:
            # The WCTABLE tag marks the start of the Unicode UTF-16 (WideChar)
            # to "MultiByte" bytes; records are parsed from the next line on.
            wctable = rawline.startswith('WCTABLE')
            # for debugging purposes: if wctable: print(i, rawline)
            continue
        if len(line) <= 1:
            continue           # not a "unicode <TAB> multibyte" record
        # Fields that do not look like hexadecimal literals are mapped to 0.
        code_936 = int(line[1], 16) if line[1].lower().startswith('0x') else 0
        if code_936 not in cp936_wanted:
            continue
        code16be = int(line[0], 16) if line[0].lower().startswith('0x') else 0
        # µ vs. μ error
        # 0x00b5 0xa6cc ;μ # at line 24608: 'Micro Sign'
        # 0x03bc 0xa6cc ;μ # at line 24718: 'Greek Small Letter Mu'
        if ( code16be > 0x00ff   # exclude 0x00b5 and permit only selected categories
             and unicodedata.category(chr(code16be)) in unicode_category ):
            init_chars_16.append(code16be)
        else:
            init_chars_undef.append(line[1])
# De-duplicate and order the collected code points, then derive the matching
# characters and the labels of the longest runs of consecutive code points.
init_chars_16 = sorted(set(init_chars_16))
init_chars_utf16 = [chr(x) for x in init_chars_16]
# Groups of consecutive code points: within a run of consecutive values the
# difference (position - value) is constant, so it serves as the grouping key.
# https://stackoverflow.com/questions/2154249/
init_chars_groups = [
    first_last([code for _, code in run])
    for _, run in groupby(enumerate(init_chars_16), lambda ix: ix[0] - ix[1])
]
def finalprint(od, do, odg, dog, sep):
    """Print a slice of the collected characters, code points and ranges.

    od, do   -- slice bounds applied to init_chars_utf16 and init_chars_16
    odg, dog -- slice bounds applied to init_chars_groups
    sep      -- separator placed between the printed range labels

    Reads the module-level lists init_chars_utf16, init_chars_16 and
    init_chars_groups without modifying them (so no 'global' statement is
    needed). Each of the first three outputs is followed by an empty line.
    Returns None.
    """
    # characters
    print(''.join(init_chars_utf16[od:do]) + '\n')
    # their Unicode codepoints
    print(', '.join('0x{:04x}'.format(x) for x in init_chars_16[od:do]) + '\n')
    # groups of Unicode codepoints
    print(sep.join(init_chars_groups[odg:dog]) + '\n')
    # total number of groups and displayed range
    print(len(init_chars_groups), ':', odg, dog)
# Demo mode shows only a small, fixed window of the results; otherwise
# everything is printed, with '|' between the range labels.
if init_chars_test == 1 and demo_chars_test:
    finalprint(354, 380, 23, 30, ' ')
else:
    finalprint(0, None, 0, None, '|')
# Summary: mode, number of selected CP936 codes, number of rejected records,
# number of collected characters, number of consecutive ranges.
print( init_chars_test, len(init_chars_CP936), len(init_chars_undef),
       len(init_chars_utf16), len(init_chars_groups) )
# The usage help is shown only when the script ran in demo mode.
if demo_chars_test:
    script_name = sys.argv[0]
    print( '\nUsage:\n\t%s [ 1 | 2 ]\n' % script_name)
    print( 'Examples:\n\t%s // prints CP936-initial-character DEMO' % script_name)
    print( '\n\t%s 1 // prints CP936-initial-character' % script_name)
    print( '\n\t%s 2 // prints CP936-subsequent-character addendum' % script_name)
Output: .\SO\68766804.py
DEMO CP936-initial-character: ['Ll', 'Lu', 'Lo']
一丁七万丈三上下丌不与丐丑专且丕世丘丙业丛东丝丞丢两
0x4e00, 0x4e01, 0x4e03, 0x4e07, 0x4e08, 0x4e09, 0x4e0a, 0x4e0b, 0x4e0c, 0x4e0d, 0x4e0e, 0x4e10, 0x4e11, 0x4e13, 0x4e14, 0x4e15, 0x4e16, 0x4e18, 0x4e19, 0x4e1a, 0x4e1b, 0x4e1c, 0x4e1d, 0x4e1e, 0x4e22, 0x4e24
0x4e00..0x4e01 0x4e03 0x4e07..0x4e0e 0x4e10..0x4e11 0x4e13..0x4e16 0x4e18..0x4e1e 0x4e22
1977 : 23 30
1 23159 2085 15477 1977
Usage:
D:\bat\SO\68766804.py [ 1 | 2 ]
Examples:
D:\bat\SO\68766804.py // prints CP936-initial-character DEMO
D:\bat\SO\68766804.py 1 // prints CP936-initial-character
D:\bat\SO\68766804.py 2 // prints CP936-subsequent-character addendum
Output: .\SO\68766804.py 2
CP936-subsequent-character addendum: ['Ll', 'Lu', 'Lo', 'Nd', 'Pc']
0123456789_
0xff10, 0xff11, 0xff12, 0xff13, 0xff14, 0xff15, 0xff16, 0xff17, 0xff18, 0xff19, 0xff3f
0xff10..0xff19|0xff3f
2 : 0 None
2 11 0 11 2