The variables on the converter are not usable outside of the class, so make the public ones private as well. Change-Id: I6087dcdac761916359009548beb5210671a32ee5

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @author Philip
import os
import platform
import re
import shutil
import sys
import tarfile
import zipfile

pyversion = platform.python_version()
islinux = platform.system().lower() == 'linux'

if pyversion[:3] in ['2.6', '2.7']:
    import urllib as urllib_request
    import codecs
    open = codecs.open
    _unichr = unichr
    if sys.maxunicode < 0x10000:
        def unichr(i):
            if i < 0x10000:
                return _unichr(i)
            else:
                return _unichr(0xD7C0 + (i >> 10)) + _unichr(0xDC00 + (i & 0x3FF))
elif pyversion[:2] == '3.':
    import urllib.request as urllib_request
    unichr = chr

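# On narrow Python 2 builds (sys.maxunicode == 0xFFFF) the unichr() shim
# above returns a UTF-16 surrogate pair for supplementary code points,
# e.g. unichr(0x20000) == u'\ud840\udc00'.
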
def unichr2(*args):
    """ Return characters for code point fields like 'U+4E00',
    ignoring any '<source' suffix. """
    return [unichr(int(i.split('<')[0][2:], 16)) for i in args]


def unichr3(*args):
    """ Return characters for fields like 'U+4E00' or 'U+20000',
    reading up to five hex digits after 'U+'; empty fields are skipped. """
    return [unichr(int(i[2:7], 16)) for i in args if i[2:7]]

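# Illustrative examples (hypothetical inputs):
#   unichr2('U+4E00', 'U+4E8C<kMatthews')  ->  [u'一', u'二']
#   unichr3('U+20000', '')                 ->  [u'𠀀']
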
# DEFINE
UNIHAN_VER = '12.1.0'
SF_MIRROR = 'master'
SCIM_TABLES_VER = '0.5.14.1'
SCIM_PINYIN_VER = '0.5.92'
LIBTABE_VER = '0.2.3'
# END OF DEFINE

def download(url, dest):
    if os.path.isfile(dest):
        print('File %s is up to date.' % dest)
        return
    global islinux
    if islinux:
        # We use wget instead of urlretrieve under Linux because wget
        # can display details such as download progress.
        os.system('wget %s -O %s' % (url, dest))
    else:
        print('Downloading from [%s] ...' % url)
        urllib_request.urlretrieve(url, dest)
        print('Download complete.\n')
    return

def uncompress(fp, member, encoding='U8'):
    name = member.rsplit('/', 1)[-1]
    print('Extracting %s ...' % name)
    fp.extract(member)
    shutil.move(member, name)
    if '/' in member:
        shutil.rmtree(member.split('/', 1)[0])
    if pyversion[:1] in ['2']:
        fc = open(name, 'rb', encoding, 'ignore')
    else:
        fc = open(name, 'r', encoding=encoding, errors='ignore')
    return fc


unzip = lambda path, member, encoding='U8': \
    uncompress(zipfile.ZipFile(path), member, encoding)

untargz = lambda path, member, encoding='U8': \
    uncompress(tarfile.open(path, 'r:gz'), member, encoding)

def parserCore(fp, pos, beginmark=None, endmark=None):
    """ Collect the column at index pos from each data line of fp,
    optionally restricted to the region between beginmark and endmark.
    Comment lines and single-character entries are skipped. """
    if beginmark and endmark:
        start = False
    else:
        start = True
    mlist = set()
    for line in fp:
        if beginmark and line.startswith(beginmark):
            start = True
            continue
        elif endmark and line.startswith(endmark):
            break
        if start and not line.startswith('#'):
            elems = line.split()
            if len(elems) < 2:
                continue
            elif len(elems[0]) > 1 and len(elems[pos]) > 1:  # words only
                mlist.add(elems[pos])
    return mlist

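# Illustrative sketch of the input this walks over (simplified; real scim
# tables carry key, phrase and frequency columns):
#
#   BEGIN_TABLE
#   aaaa	阿阿	100
#   END_TABLE
#
# parserCore(fp, 1, 'BEGIN_TABLE', 'END_TABLE') would collect {u'阿阿'}.
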
def tablesParser(path, name):
    """ Read file from scim-tables and parse it. """
    global SCIM_TABLES_VER
    src = 'scim-tables-%s/tables/zh/%s' % (SCIM_TABLES_VER, name)
    fp = untargz(path, src, 'U8')
    return parserCore(fp, 1, 'BEGIN_TABLE', 'END_TABLE')


ezbigParser = lambda path: tablesParser(path, 'EZ-Big.txt.in')
wubiParser = lambda path: tablesParser(path, 'Wubi.txt.in')
zrmParser = lambda path: tablesParser(path, 'Ziranma.txt.in')

def phraseParser(path):
    """ Read phrase_lib.txt and parse it. """
    global SCIM_PINYIN_VER
    src = 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER
    fp = untargz(path, src, 'U8')
    return parserCore(fp, 0)


def tsiParser(path):
    """ Read tsi.src and parse it. """
    src = 'libtabe/tsi-src/tsi.src'
    fp = untargz(path, src, 'big5hkscs')
    return parserCore(fp, 0)

def unihanParser(path):
    """ Read Unihan_Variants.txt and parse it. """
    fp = unzip(path, 'Unihan_Variants.txt', 'U8')
    t2s = dict()
    s2t = dict()
    for line in fp:
        if line.startswith('#'):
            continue
        else:
            elems = line.split()
            if len(elems) < 3:
                continue
            vtype = elems.pop(1)
            elems = unichr2(*elems)
            if vtype == 'kTraditionalVariant':
                s2t[elems[0]] = elems[1:]
            elif vtype == 'kSimplifiedVariant':
                t2s[elems[0]] = elems[1:]
    fp.close()
    return (t2s, s2t)

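# Data lines in Unihan_Variants.txt look roughly like this (illustrative):
#
#   U+4E7E	kSimplifiedVariant	U+5E72
#   U+53F0	kTraditionalVariant	U+53F0 U+81FA
#
# After popping the property name, elems[0] is the source character and the
# rest are its variants.
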
def applyExcludes(mlist, path):
    """ Apply exclude rules from path to mlist. """
    if pyversion[:1] in ['2']:
        excludes = open(path, 'rb', 'U8').read().split()
    else:
        excludes = open(path, 'r', encoding='U8').read().split()
    excludes = [word.split('#')[0].strip() for word in excludes]
    excludes = '|'.join(excludes)
    excptn = re.compile('.*(?:%s).*' % excludes)
    diff = [mword for mword in mlist if excptn.search(mword)]
    mlist.difference_update(diff)
    return mlist

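# Illustrative behaviour (hypothetical data): with an exclude file containing
# the single pattern 阿斗, applyExcludes({u'阿斗', u'阿宏'}, path) drops
# u'阿斗' and returns {u'阿宏'}.
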
def charManualTable(path):
    """ Yield (char, variants) pairs from a manual table of '|'-separated
    code point fields; '#' starts a comment. """
    fp = open(path, 'r', encoding='U8')
    for line in fp:
        elems = line.split('#')[0].split('|')
        elems = unichr3(*elems)
        if len(elems) > 1:
            yield elems[0], elems[1:]

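# A manual line of the form (illustrative)
#
#   U+4E7E|U+5E72  # comment
#
# yields (u'乾', [u'干']).
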
def toManyRules(src_table):
    """ Collect the secondary targets (t[1:]) of one-to-many
    conversion rules. """
    tomany = set()
    if pyversion[:1] in ['2']:
        for (f, t) in src_table.iteritems():
            for i in range(1, len(t)):
                tomany.add(t[i])
    else:
        for (f, t) in src_table.items():
            for i in range(1, len(t)):
                tomany.add(t[i])
    return tomany

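# Illustrative example (hypothetical table): for the one-to-many rule
# u'发' -> [u'發', u'髮'], toManyRules({u'发': [u'發', u'髮']}) returns
# {u'髮'}: only the secondary targets are collected.
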
def removeRules(path, table):
    """ Remove no-convert rules listed in path from table. """
    fp = open(path, 'r', encoding='U8')
    texc = list()
    for line in fp:
        elems = line.split('=>')
        f = t = elems[0].strip()
        if len(elems) == 2:
            t = elems[1].strip()
        f = f.strip('"').strip("'")
        t = t.strip('"').strip("'")
        if f:
            try:
                table.pop(f)
            except KeyError:
                pass
        if t:
            texc.append(t)
    texcptn = re.compile('^(?:%s)$' % '|'.join(texc))
    if pyversion[:1] in ['2']:
        for (tmp_f, tmp_t) in table.copy().iteritems():
            if texcptn.match(tmp_t):
                table.pop(tmp_f)
    else:
        for (tmp_f, tmp_t) in table.copy().items():
            if texcptn.match(tmp_t):
                table.pop(tmp_f)
    return table

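# Rule lines, as parsed above, are either a bare entry or 'source => target'
# (quotes optional). Illustrative:
#
#   "乾"           drops the rule for 乾 and any rule whose target is 乾
#   "幹" => "干"   drops the rule for 幹 and any rule whose target is 干
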
def customRules(path):
    """ Read a tab-separated two-column rule file into a dict;
    '#' starts a comment. """
    fp = open(path, 'r', encoding='U8')
    ret = dict()
    for line in fp:
        line = line.rstrip('\r\n')
        if '#' in line:
            line = line.split('#')[0].rstrip()
        elems = line.split('\t')
        if len(elems) > 1:
            ret[elems[0]] = elems[1]
    return ret

def dictToSortedList(src_table, pos):
    return sorted(src_table.items(), key=lambda m: (m[pos], m[1 - pos]))

def translate(text, conv_table):
    """ Convert text with conv_table, preferring the longest match at
    each position. """
    i = 0
    while i < len(text):
        # Try the longest possible substring starting at i first.
        for j in range(len(text) - i, 0, -1):
            f = text[i:][:j]
            t = conv_table.get(f)
            if t:
                text = text[:i] + t + text[i:][j:]
                i += len(t) - 1
                break
        i += 1
    return text

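# Illustrative example (hypothetical table): the longest match wins, so the
# two-character rule beats its single-character component:
#
#   translate(u'乾燥', {u'乾': u'干', u'乾燥': u'干燥'})  ->  u'干燥'
#   translate(u'乾坤', {u'乾': u'干', u'乾燥': u'干燥'})  ->  u'干坤'
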
def manualWordsTable(path, conv_table, reconv_table):
    """ Build a word-to-word table from a manual phrase list: when a phrase
    would be altered by reconv_table, map both the phrase and its conv_table
    counterpart back to the phrase itself. """
    fp = open(path, 'r', encoding='U8')
    reconv_table = reconv_table.copy()
    out_table = {}
    wordlist = [line.split('#')[0].strip() for line in fp]
    wordlist = list(set(wordlist))
    wordlist.sort(key=lambda w: (len(w), w), reverse=True)
    while wordlist:
        word = wordlist.pop()
        new_word = translate(word, conv_table)
        rcv_word = translate(word, reconv_table)
        if word != rcv_word:
            reconv_table[word] = out_table[word] = word
            reconv_table[new_word] = out_table[new_word] = word
    return out_table

def defaultWordsTable(src_wordlist, src_tomany, char_conv_table,
                      char_reconv_table):
    """ Derive word-to-word rules from an input-method word list: add a rule
    for a word when per-character conversion would not round-trip, or when
    the word contains a one-to-many character and converting back does not
    recover it. """
    wordlist = list(src_wordlist)
    wordlist.sort(key=lambda w: (len(w), w), reverse=True)
    word_conv_table = {}
    word_reconv_table = {}
    conv_table = char_conv_table.copy()
    reconv_table = char_reconv_table.copy()
    tomanyptn = re.compile('(?:%s)' % '|'.join(src_tomany))
    while wordlist:
        # Words are popped shortest-first; rules learned from shorter words
        # are folded into the tables before each longer batch.
        conv_table.update(word_conv_table)
        reconv_table.update(word_reconv_table)
        word = wordlist.pop()
        new_word_len = word_len = len(word)
        while new_word_len == word_len:
            test_word = translate(word, reconv_table)
            new_word = translate(word, conv_table)
            if not reconv_table.get(new_word) and \
               (test_word != word or
                (tomanyptn.search(word) and
                 word != translate(new_word, reconv_table))):
                word_conv_table[word] = new_word
                word_reconv_table[new_word] = word
            try:
                word = wordlist.pop()
            except IndexError:
                break
            new_word_len = len(word)
    return word_reconv_table

def PHPArray(table):
    lines = ['\t\t\'%s\' => \'%s\',' % (f, t) for (f, t) in table if f and t]
    return '\n'.join(lines)

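# Illustrative output: PHPArray([(u'乾', u'干')]) yields the tab-indented
# line "\t\t'乾' => '干'," ready for embedding in the class body below.
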
def main():
    # Get Unihan.zip:
    url = 'https://www.unicode.org/Public/%s/ucd/Unihan.zip' % UNIHAN_VER
    han_dest = 'Unihan-%s.zip' % UNIHAN_VER
    download(url, han_dest)

    sfurlbase = 'https://%s.dl.sourceforge.net/sourceforge/' % SF_MIRROR

    # Get scim-tables-$(SCIM_TABLES_VER).tar.gz:
    url = sfurlbase + 'scim/scim-tables-%s.tar.gz' % SCIM_TABLES_VER
    tbe_dest = 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER
    download(url, tbe_dest)

    # Get scim-pinyin-$(SCIM_PINYIN_VER).tar.gz:
    url = sfurlbase + 'scim/scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
    pyn_dest = 'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
    download(url, pyn_dest)

    # Get libtabe-$(LIBTABE_VER).tgz:
    url = sfurlbase + 'libtabe/libtabe-%s.tgz' % LIBTABE_VER
    lbt_dest = 'libtabe-%s.tgz' % LIBTABE_VER
    download(url, lbt_dest)

    # Unihan_Variants.txt
    (t2s_1tomany, s2t_1tomany) = unihanParser(han_dest)

    t2s_1tomany.update(charManualTable('symme_supp.manual'))
    t2s_1tomany.update(charManualTable('trad2simp.manual'))
    s2t_1tomany.update((t[0], [f]) for (f, t) in charManualTable('symme_supp.manual'))
    s2t_1tomany.update(charManualTable('simp2trad.manual'))

    if pyversion[:1] in ['2']:
        t2s_1to1 = dict([(f, t[0]) for (f, t) in t2s_1tomany.iteritems()])
        s2t_1to1 = dict([(f, t[0]) for (f, t) in s2t_1tomany.iteritems()])
    else:
        t2s_1to1 = dict([(f, t[0]) for (f, t) in t2s_1tomany.items()])
        s2t_1to1 = dict([(f, t[0]) for (f, t) in s2t_1tomany.items()])

    s_tomany = toManyRules(t2s_1tomany)
    t_tomany = toManyRules(s2t_1tomany)

    # noconvert rules
    t2s_1to1 = removeRules('trad2simp_noconvert.manual', t2s_1to1)
    s2t_1to1 = removeRules('simp2trad_noconvert.manual', s2t_1to1)

    # the superset for word-to-word conversion
    t2s_1to1_supp = t2s_1to1.copy()
    s2t_1to1_supp = s2t_1to1.copy()
    t2s_1to1_supp.update(customRules('trad2simp_supp_set.manual'))
    s2t_1to1_supp.update(customRules('simp2trad_supp_set.manual'))

    # word to word manual rules
    t2s_word2word_manual = manualWordsTable('simpphrases.manual',
                                            s2t_1to1_supp, t2s_1to1_supp)
    t2s_word2word_manual.update(customRules('toSimp.manual'))
    s2t_word2word_manual = manualWordsTable('tradphrases.manual',
                                            t2s_1to1_supp, s2t_1to1_supp)
    s2t_word2word_manual.update(customRules('toTrad.manual'))

    # word to word rules from input methods
    t_wordlist = set()
    s_wordlist = set()
    t_wordlist.update(ezbigParser(tbe_dest),
                      tsiParser(lbt_dest))
    s_wordlist.update(wubiParser(tbe_dest),
                      zrmParser(tbe_dest),
                      phraseParser(pyn_dest))

    # exclude
    s_wordlist = applyExcludes(s_wordlist, 'simpphrases_exclude.manual')
    t_wordlist = applyExcludes(t_wordlist, 'tradphrases_exclude.manual')

    s2t_supp = s2t_1to1_supp.copy()
    s2t_supp.update(s2t_word2word_manual)
    t2s_supp = t2s_1to1_supp.copy()
    t2s_supp.update(t2s_word2word_manual)

    # build the word-to-word tables from the word lists
    t2s_word2word = defaultWordsTable(s_wordlist, s_tomany,
                                      s2t_1to1_supp, t2s_supp)
    t2s_word2word.update(t2s_word2word_manual)
    s2t_word2word = defaultWordsTable(t_wordlist, t_tomany,
                                      t2s_1to1_supp, s2t_supp)
    s2t_word2word.update(s2t_word2word_manual)

    # Final tables

    # sorted list toHans
    if pyversion[:1] in ['2']:
        t2s_1to1 = dict([(f, t) for (f, t) in t2s_1to1.iteritems() if f != t])
    else:
        t2s_1to1 = dict([(f, t) for (f, t) in t2s_1to1.items() if f != t])
    toHans = dictToSortedList(t2s_1to1, 0) + dictToSortedList(t2s_word2word, 1)
    # sorted list toHant
    if pyversion[:1] in ['2']:
        s2t_1to1 = dict([(f, t) for (f, t) in s2t_1to1.iteritems() if f != t])
    else:
        s2t_1to1 = dict([(f, t) for (f, t) in s2t_1to1.items() if f != t])
    toHant = dictToSortedList(s2t_1to1, 0) + dictToSortedList(s2t_word2word, 1)
    # sorted list toCN
    toCN = dictToSortedList(customRules('toCN.manual'), 1)
    # sorted list toHK
    toHK = dictToSortedList(customRules('toHK.manual'), 1)
    # sorted list toTW
    toTW = dictToSortedList(customRules('toTW.manual'), 1)

    # Get PHP Array
    php = '''<?php
/**
 * Simplified / Traditional Chinese conversion tables
 *
 * Automatically generated using code and data in maintenance/language/zhtable/
 * Do not modify directly!
 *
 * @file
 */

namespace MediaWiki\Languages\Data;

class ZhConversion {
	public const ZH_TO_HANT = [\n'''

    php += PHPArray(toHant) \
        + '\n\t];\n\n\tpublic const ZH_TO_HANS = [\n' \
        + PHPArray(toHans) \
        + '\n\t];\n\n\tpublic const ZH_TO_TW = [\n' \
        + PHPArray(toTW) \
        + '\n\t];\n\n\tpublic const ZH_TO_HK = [\n' \
        + PHPArray(toHK) \
        + '\n\t];\n\n\tpublic const ZH_TO_CN = [\n' \
        + PHPArray(toCN) \
        + '\n\t];\n}\n'

    if pyversion[:1] in ['2']:
        f = open(os.path.join('..', '..', '..', 'includes', 'languages',
                              'data', 'ZhConversion.php'), 'wb', encoding='utf8')
    else:
        f = open(os.path.join('..', '..', '..', 'includes', 'languages',
                              'data', 'ZhConversion.php'), 'w', buffering=4096,
                 encoding='utf8')
    print('Writing ZhConversion.php ... ')
    f.write(php)
    f.close()

    # Remove temporary files
    print('Deleting temporary files ... ')
    os.remove('EZ-Big.txt.in')
    os.remove('phrase_lib.txt')
    os.remove('tsi.src')
    os.remove('Unihan_Variants.txt')
    os.remove('Wubi.txt.in')
    os.remove('Ziranma.txt.in')


if __name__ == '__main__':
    main()