uawdijnntqw1x1x1
IP : 216.73.216.155
Hostname : vm5018.vps.agava.net
Kernel : Linux vm5018.vps.agava.net 3.10.0-1127.8.2.vz7.151.14 #1 SMP Tue Jun 9 12:58:54 MSK 2020 x86_64
Disable Function : None :)
OS : Linux
PATH:
/
var
/
www
/
iplanru
/
data
/
www
/
www.i-plan.ru
/
libraries
/
phpmorphy
/
bin
/
.
/
extract_ancodes_map.php
/
/
#!/usr/bin/php
<?php
/**
 * Builds the ancode map for a phpMorphy morphology data file.
 *
 * Usage: extract_ancodes_map.php MORPH_DATA_FILE LANGUAGE OUT_DIR
 *
 * Every ancode stored in MORPH_DATA_FILE is turned into an upper-cased
 * "PART_OF_SPEECH GRAMMEM,GRAMMEM,..." signature and looked up in the gramtab
 * of LANGUAGE. The resulting array (ancode id => ancode string, converted to
 * the encoding reported by the data file) is serialized into
 * OUT_DIR/morph_data_ancodes_map.<locale>.bin.
 *
 * Ancodes whose signature is missing from the gramtab get a randomly chosen
 * two-letter lowercase code that is not used by any other ancode.
 */

if(2 == (ini_get('mbstring.func_overload') & 2)) {
    die("don`t overload string functions in mbstring extension, see mbstring.func_overload option");
}

// $argv[0] is the script name, so three positional arguments mean $argc >= 4.
if($argc < 4) {
    echo "Usage " . $argv[0] . " MORPH_DATA_FILE LANGUAGE OUT_DIR";
    exit;
}

require_once(dirname(__FILE__) . '/../src/common.php');
require_once(dirname(__FILE__) . '/../utils/dict_stuff/mrd/gramtab.php');
require_once(dirname(__FILE__) . '/../utils/dict_stuff/mrd/rml.php');
require_once(dirname(__FILE__) . '/../utils/dict_stuff/mrd/mwz.php');

$graminfo_file = $argv[1];
$language = $argv[2];
$out_dir = $argv[3];

try {
    $factory = new phpMorphy_Storage_Factory();
    $graminfo = phpMorphy_GramInfo::create(
        $factory->open(PHPMORPHY_STORAGE_FILE, $graminfo_file, false),
        false
    );

    $out_file = $out_dir . '/morph_data_ancodes_map.' . strtolower($graminfo->getLocale()) . '.bin';

    $gramtab_map = get_gramtab_map($language);
    // Set of ancodes already in use (ancode => anything), for O(1) collision checks.
    $valid_ancodes = array_flip(array_values($gramtab_map));
    $ancodes_map = array();

    foreach(get_all_ancodes($graminfo) as $id => $value) {
        if(isset($gramtab_map[$value])) {
            $ancodes_map[$id] = $gramtab_map[$value];
            continue;
        }

        // TODO: typically ancodes don`t contain digits, so we can generate mapping to char + digit ancodes
        do {
            $new_ancode =
                chr(mt_rand(ord('a'), ord('z'))) .
                chr(mt_rand(ord('a'), ord('z')));
        } while(isset($valid_ancodes[$new_ancode]));

        // Reserve the generated code so a later unknown ancode cannot receive the same one.
        $valid_ancodes[$new_ancode] = true;

        echo "'$value' not found in gramtab, assume $new_ancode" . PHP_EOL;
        $ancodes_map[$id] = $new_ancode;
    }

    // Ancodes are handled as utf-8 above; store them in the data file's own encoding.
    $target_encoding = $graminfo->getEncoding();

    foreach($ancodes_map as $id => $ancode) {
        $converted = iconv('utf-8', $target_encoding, $ancode);

        if(false === $converted) {
            throw new Exception("Can`t convert ancode '$ancode' from utf-8 to $target_encoding");
        }

        $ancodes_map[$id] = $converted;
    }

    if(false === file_put_contents($out_file, serialize($ancodes_map))) {
        throw new Exception("Can`t write ancodes map to '$out_file'");
    }
} catch (Exception $e) {
    echo $e;
    exit(1);
}

/**
 * Builds a textual signature for every ancode stored in the data file.
 *
 * @param object $graminfo object providing readAllPartOfSpeech(), readAllGrammems(),
 *                         readAllAncodes() and getEncoding()
 * @return array ancode id => upper-cased utf-8 string "POS GRAMMEM,GRAMMEM,..."
 *               with the grammems sorted
 * @throws Exception when an ancode references an unknown part of speech or grammem id
 */
function get_all_ancodes($graminfo) {
    $encoding = $graminfo->getEncoding();

    $poses = array();
    foreach($graminfo->readAllPartOfSpeech() as $id => $pos) {
        $poses[$id] = $pos['name'];
    }

    $grammems = array();
    foreach($graminfo->readAllGrammems() as $id => $grammem) {
        $grammems[$id] = $grammem['name'];
    }

    $result = array();

    foreach($graminfo->readAllAncodes() as $id => $ancode) {
        $pos_id = $ancode['pos_id'];

        if(!isset($poses[$pos_id])) {
            throw new Exception("Unknown pos id '" . $pos_id . "'");
        }

        $pos = iconv($encoding, 'utf-8', $poses[$pos_id]);

        $gram = array();
        foreach($ancode['grammem_ids'] as $grammem) {
            if(!isset($grammems[$grammem])) {
                throw new Exception("Unknown grammem id '$grammem'");
            }

            $gram[] = iconv($encoding, 'utf-8', $grammems[$grammem]);
        }

        // Sorted so the signature does not depend on the stored grammem order.
        sort($gram);

        // The text is utf-8 at this point; pass the encoding explicitly instead of
        // relying on mb_internal_encoding, matching get_gramtab_map().
        $result[$id] = mb_strtoupper($pos . ' ' . implode(',', $gram), 'utf-8');
    }

    return $result;
}

/**
 * Builds a lookup from an ancode signature to the ancode itself, using the
 * gramtab file configured for the given language.
 *
 * @param string $language language name passed to the Rml/Mwz/GramTab helpers
 * @return array upper-cased "POS GRAMMEM,GRAMMEM,..." => ancode
 * @throws Exception when the duplicate check below detects a repeated signature
 */
function get_gramtab_map($language) {
    $rml = new phpMorphy_Rml_IniFile();
    $gramtab_file = $rml->getGramTabPath($language);

    $gramtab = new phpMorphy_GramTab_File(
        $gramtab_file,
        phpMorphy_Mwz_File::getEncodingForLang($language),
        new phpMorphy_GramTab_GramInfoFactory($language)
    );

    $gramtab_map = array();

    foreach($gramtab as $ancode => $obj) {
        $grammems = $obj->getGrammems();
        sort($grammems);

        $key = $obj->getPartOfSpeech() . ' ' . implode(',', $grammems);

        // NOTE(review): this check uses the key before upper-casing while entries are
        // stored upper-cased, so it only fires for signatures that are already upper
        // case; otherwise a later duplicate silently overwrites the earlier ancode.
        // Left as is because making it effective may reject existing gramtabs - confirm.
        if(isset($gramtab_map[$key])) {
            throw new Exception("Duplicate ancode contents for $ancode => $key");
        }

        $key = mb_strtoupper($key, 'utf-8');
        $gramtab_map[$key] = $ancode;
    }

    return $gramtab_map;
}
/var/www/iplanru/data/www/www.i-plan.ru/libraries/phpmorphy/bin/./extract_ancodes_map.php