Файловый менеджер - Редактировать - /var/www/iplanru/data/www/i-plan.ru/libraries/phpmorphy/utils/dict_stuff/hunspell/reader.php
Назад
<?php
require(dirname(__FILE__) . '/../../libs/iterators.php');

// Requires the mbstring extension and assumes that the internal encoding is UTF-8!
// This is a limited implementation: many Hunspell features (affix aliases, etc.)
// are not implemented.

/**
 * Exception type thrown by the Hunspell reader classes in this file.
 */
class phpMorphy_Hunspell_Exception extends Exception { }

/**
 * A single affix rule: a condition ($find), a part to strip from the word
 * ($remove), a part to attach ($append) and an optional morphological
 * description ($morph).
 */
abstract class phpMorphy_Hunspell_Affix {
    protected
        $remove_len,
        $remove,
        $append,
        $find,
        $find_len,
        $morph,
        $reg,
        $is_simple,
        $is_empty
        ;

    /**
     * @param string      $find   condition the word must satisfy ('.' matches any word)
     * @param string      $remove characters to strip from the word ('' for none)
     * @param string      $append characters to attach to the word ('' for none)
     * @param string|null $morph  optional morphological description
     */
    public function __construct($find, $remove, $append, $morph = null) {
        $this->remove_len = mb_strlen((string)$remove);
        $this->remove = $remove;
        $this->append = $append;
        $this->morph = $morph;

        $this->find = $find;
        $this->find_len = mb_strlen($find);

        // Pre-compute the matching strategy once, isMatch() is called per word.
        $this->is_simple = $this->isSimple($find);
        $this->is_empty = $this->isEmpty($find);
        $this->reg = $this->getRegExp($find);
    }

    /** @return int length (in characters) of the part this affix strips */
    public function getRemoveLength() {
        return $this->remove_len;
    }

    /** @return bool true when a morphological description was supplied */
    public function isMorphDescription() {
        return isset($this->morph);
    }

    /** @return string|null the morphological description, if any */
    public function getMorphDescription() {
        return $this->morph;
    }

    /**
     * Tests whether this affix is applicable to $word.
     *
     * @param string $word
     * @return bool
     */
    public function isMatch($word) {
        // The '.' condition applies to every word.
        if($this->is_empty) {
            return true;
        }

        // Conditions without '[' or '.' are compared as plain strings.
        if($this->is_simple) {
            return $this->simpleMatch($word);
        }

        return preg_match($this->reg, $word) > 0;
    }

    /**
     * @param string $find
     * @return bool true when the condition contains neither '[' nor '.'
     */
    protected function isSimple($find) {
        return strpos($find, '[') === false && strpos($find, '.') === false;
    }

    /**
     * @param string $find
     * @return bool true when the condition is exactly '.'
     */
    protected function isEmpty($find) {
        return $find === '.';
    }

    /**
     * Applies the affix to $word.
     *
     * @param string $word
     * @return string|false the generated word, or false when the affix does not match
     */
    abstract public function generateWord($word);

    /** Plain string comparison of the condition against $word. */
    abstract protected function simpleMatch($word);

    /** Builds the regular expression used for non-simple conditions. */
    abstract protected function getRegExp($find);
}

/**
 * Affix rule applied at the beginning of a word.
 */
class phpMorphy_Hunspell_Prefix extends phpMorphy_Hunspell_Affix {
    /** Anchors the condition at the start of the word (case-insensitive, UTF-8). */
    protected function getRegExp($find) {
        return "~^{$find}~iu";
    }

    /**
     * @param string $word
     * @return string|false $word with the prefix applied, or false when the condition fails
     */
    public function generateWord($word) {
        if(!$this->isMatch($word)) {
            return false;
        }

        if($this->remove_len && mb_strlen($word) >= $this->remove_len) {
            $word = mb_substr($word, $this->remove_len);
        }

        return "{$this->append}$word";
    }

    /** Strict comparison: loose '==' treats numeric-looking strings ("1e1", "010") as equal. */
    protected function simpleMatch($word) {
        return mb_substr($word, 0, $this->find_len) === (string)$this->find;
    }
}

/**
 * Affix rule applied at the end of a word.
 */
class phpMorphy_Hunspell_Suffix extends phpMorphy_Hunspell_Affix {
    /** Anchors the condition at the end of the word (case-insensitive, UTF-8). */
    protected function getRegExp($find) {
        return "~{$find}$~iu";
    }

    /**
     * @param string $word
     * @return string|false $word with the suffix applied, or false when the condition fails
     * @throws phpMorphy_Hunspell_Exception when the word matches the condition
     *         but does not end with the part that must be removed
     */
    public function generateWord($word) {
        if(!$this->isMatch($word)) {
            return false;
        }

        if($this->remove_len && mb_strlen($word) >= $this->remove_len) {
            $tail = mb_substr($word, -$this->remove_len);

            if($tail !== (string)$this->remove) {
                // Report the inconsistency to the caller instead of dumping
                // debug output and terminating the whole process.
                throw new phpMorphy_Hunspell_Exception(
                    "Try to remove $tail from $word, but '{$this->remove}' expected"
                );
            }

            $word = mb_substr($word, 0, -$this->remove_len);
        }

        return "$word{$this->append}";
    }

    /** Strict comparison: loose '==' treats numeric-looking strings ("1e1", "010") as equal. */
    protected function simpleMatch($word) {
        return mb_substr($word, -$this->find_len) === (string)$this->find;
    }
}

/**
 * A named affix flag: the set of affix rules declared under one SFX/PFX header.
 */
abstract class phpMorphy_Hunspell_AffixFlag {
    protected
        $name,
        $cross_product,
        $affixes = array();

    /**
     * @param string $name  flag name
     * @param bool   $cross whether the flag takes part in prefix/suffix cross products
     */
    protected function __construct($name, $cross) {
        $this->name = $name;
        $this->cross_product = $cross;
    }

    /**
     * Factory: 'SFX' creates a suffix flag, any other type creates a prefix flag.
     *
     * @param string $type
     * @param string $name
     * @param bool   $cross
     * @return phpMorphy_Hunspell_AffixFlag
     */
    public static function create($type, $name, $cross) {
        $affix_class = $type == 'SFX'
            ? 'phpMorphy_Hunspell_SuffixFlag'
            : 'phpMorphy_Hunspell_PrefixFlag';

        return new $affix_class($name, $cross);
    }

    /** @return string */
    public function getName() {
        return $this->name;
    }

    /** @return bool */
    public function isCrossProduct() {
        return $this->cross_product;
    }

    /**
     * Applies every affix of this flag to $word and appends each generated
     * form to $words (and its morphological description to $morphs, when given).
     *
     * @param string      $word
     * @param array       $words     receives the generated word forms
     * @param string|null $wordMorph morphological description of $word
     * @param array|null  $morphs    receives descriptions, parallel to $words
     * @return int the largest remove length among the affixes that matched
     */
    public function generateWords($word, &$words, $wordMorph = null, &$morphs = null) {
        $maxRemoveLength = 0;

        foreach($this->affixes as $affix) {
            $new_word = $affix->generateWord($word);

            if(false === $new_word) {
                continue;
            }

            $words[] = $new_word;

            if(isset($morphs)) {
                $morphs[] = $wordMorph . $affix->getMorphDescription();
            }

            $maxRemoveLength = max($maxRemoveLength, $affix->getRemoveLength());
        }

        return $maxRemoveLength;
    }

    /**
     * Registers one affix rule under this flag.
     */
    public function addAffix($find, $remove, $append, $morph = null) {
        $this->affixes[] = $this->createAffix($find, $remove, $append, $morph);
    }

    /** Creates the concrete affix object (prefix or suffix). */
    abstract protected function createAffix($find, $remove, $append, $morph);

    /** @return bool true for suffix flags, false for prefix flags */
    abstract public function isSuffix();
}

/**
 * Affix flag whose rules are suffixes.
 */
class phpMorphy_Hunspell_SuffixFlag extends phpMorphy_Hunspell_AffixFlag {
    protected function createAffix($find, $remove, $append, $morph) {
        return new phpMorphy_Hunspell_Suffix($find, $remove, $append, $morph);
    }

    public function isSuffix() {
        return true;
    }
}

/**
 * Affix flag whose rules are prefixes.
 */
class phpMorphy_Hunspell_PrefixFlag extends phpMorphy_Hunspell_AffixFlag {
    protected function createAffix($find, $remove, $append, $morph) {
        return new phpMorphy_Hunspell_Prefix($find, $remove, $append, $morph);
    }

    public function isSuffix() {
        return false;
    }
}

/**
 * Iterates over the non-empty lines of an affix file and yields each line as
 * an array of whitespace-separated tokens.
 */
class phpMorphy_Hunspell_AffixFile_Reader extends phpMorphy_Iterator_Transform {
    /**
     * @param string      $fileName
     * @param string|null $defaultEncoding encoding handed to the iconv iterator
     */
    public function __construct($fileName, $defaultEncoding) {
        // Build the iterator chain exactly once; building it twice opened
        // the file twice and threw the first handle away.
        parent::__construct($this->createIterators($fileName));

        $this->setEncoding($defaultEncoding);
    }

    /** Forwards the encoding to the inner (iconv) iterator. */
    public function setEncoding($enc) {
        $this->getInnerIterator()->setEncoding($enc);
    }

    protected function createIterators($fileName) {
        return new phpMorphy_Iterator_Iconv(
            new phpMorphy_Iterator_NotEmptyLines(
                $this->createFileIterator($fileName)
            )
        );
    }

    protected function createFileIterator($fileName) {
        return new SplFileObject($fileName);
    }

    /**
     * Splits a line into tokens on any run of whitespace, so that fields
     * separated by a single TAB are split as well.
     *
     * @return string[]
     */
    protected function transformItem($item, $key) {
        return preg_split('~\s+~', trim($item));
    }
}

/**
 * Parsed affix (.aff) file: affix flags plus plain options.
 */
class phpMorphy_Hunspell_AffixFile {
    protected
        $flags = array(),
        $options = array();

    /**
     * @param string $fileName
     * @param array  $options initial options; a 'SET' entry is used as the
     *                        default file encoding
     * @throws phpMorphy_Hunspell_Exception when the file cannot be parsed
     */
    public function __construct($fileName, $options = array()) {
        $this->options = $options;

        $this->parseFile($fileName);
    }

    /** @return bool */
    public function isFlagExists($name) {
        return array_key_exists($name, $this->flags);
    }

    /**
     * @param string $name
     * @return phpMorphy_Hunspell_AffixFlag
     * @throws phpMorphy_Hunspell_Exception when the flag is unknown
     */
    public function getFlag($name) {
        if(!$this->isFlagExists($name)) {
            throw new phpMorphy_Hunspell_Exception("Unknown $name flag");
        }

        return $this->flags[$name];
    }

    /** @return array all options collected so far */
    public function getOptions() {
        return $this->options;
    }

    /** @return bool */
    public function isOptionExists($name) {
        return array_key_exists($name, $this->options);
    }

    /**
     * @param string $name
     * @return mixed a string for single-valued options, otherwise an array of tokens
     * @throws phpMorphy_Hunspell_Exception when the option is unknown
     */
    public function getOption($name) {
        if(!$this->isOptionExists($name)) {
            throw new phpMorphy_Hunspell_Exception("Unknown $name option");
        }

        return $this->options[$name];
    }

    /**
     * @return mixed value of the 'SET' option
     * @throws phpMorphy_Hunspell_Exception when 'SET' is not defined
     */
    public function getEncoding() {
        try {
            return $this->getOption('SET');
        } catch(Exception $e) {
            throw new phpMorphy_Hunspell_Exception("Can`t return encoding, because SET option not exists");
        }
    }

    /**
     * Reads the affix file line by line, switching the reader encoding as
     * soon as a 'SET' option has been seen.
     *
     * @throws phpMorphy_Hunspell_Exception wrapping any error with file name and line key
     */
    protected function parseFile($fileName) {
        $default_enc = $this->isOptionExists('SET') ? $this->getOption('SET') : null;

        $reader = $this->createAffixReader($fileName, $default_enc);
        $reader->rewind();

        try {
            while($reader->valid()) {
                $tokens = $reader->current();

                $this->processLine($tokens, $reader);

                $reader->next();

                // HACK: $this->options['SET'] is read directly for performance
                if(!isset($default_enc) && isset($this->options['SET'])) {
                    $default_enc = $this->getOption('SET');
                    $reader->setEncoding($default_enc);
                }
            }
        } catch(Exception $e) {
            throw new phpMorphy_Hunspell_Exception(
                "Can`t parse $fileName affix file, error at " . $reader->key() . " line: " . $e->getMessage()
            );
        }
    }

    protected function createAffixReader($fileName, $defaultEncoding) {
        return new phpMorphy_Hunspell_AffixFile_Reader($fileName, $defaultEncoding);
    }

    /**
     * Dispatches one tokenized line: SFX/PFX headers start an affix block,
     * everything else is stored as an option.
     *
     * @throws phpMorphy_Hunspell_Exception on a malformed header or unsupported option
     */
    protected function processLine($tokens, Iterator $reader) {
        $type = $tokens[0];

        if($type == 'SFX' || $type == 'PFX') {
            if(count($tokens) < 4) {
                throw new phpMorphy_Hunspell_Exception("Invalid affix header");
            }

            // Header layout: TYPE FLAG CROSS_PRODUCT COUNT
            $this->readAffixBlock($reader, $type, $tokens[1], $tokens[3], $tokens[2]);
        } else {
            array_shift($tokens);
            $this->handleOption($type, $tokens);
        }
    }

    /**
     * Reads the $count affix lines that follow an SFX/PFX header and stores
     * the resulting flag under $flagName.
     *
     * @param Iterator $reader       positioned on the header line
     * @param string   $type         'SFX' or 'PFX'
     * @param string   $flagName
     * @param mixed    $count        number of affix lines announced by the header
     * @param string   $crossProduct 'Y' enables the cross product
     * @throws phpMorphy_Hunspell_Exception on premature end of file, a malformed
     *         line or an affix continuation ('/' in the append part)
     */
    protected function readAffixBlock(Iterator $reader, $type, $flagName, $count, $crossProduct) {
        $affix_flag = $this->createAffixFlag($type, $flagName, $crossProduct == 'Y');

        // The count arrives as a token (string); cast it once so that the loop
        // bound does not depend on version-specific string/int comparison rules.
        $count = (int)$count;

        for($i = 0; $i < $count; $i++) {
            $reader->next();

            if(!$reader->valid()) {
                throw new phpMorphy_Hunspell_Exception(
                    "Unexpected file end while reading '" . $flagName . "' flag, " . ($count - $i) . " items needed"
                );
            }

            $tokens = $reader->current();

            // Line layout: TYPE FLAG REMOVE APPEND FIND [MORPH]
            if(count($tokens) < 5 || $tokens[0] != $type || $tokens[1] != $flagName) {
                throw new phpMorphy_Hunspell_Exception("Invalid line type given, proper affix expected");
            }

            // '0' stands for "nothing" in the REMOVE and APPEND columns.
            $append = $tokens[3] == '0' ? '' : $tokens[3];
            $remove = $tokens[2] == '0' ? '' : $tokens[2];

            if(strpos($append, '/') !== false) {
                throw new phpMorphy_Hunspell_Exception("Affix continuation not supported");
            }

            $affix_flag->addAffix(
                $tokens[4],
                $remove,
                $append,
                isset($tokens[5]) ? $tokens[5] : null
            );
        }

        $this->flags[$flagName] = $affix_flag;
    }

    /**
     * NOTE(review): readAffixBlock() already passes a boolean here; the loose
     * comparison with 'Y' maps true to true and false to false, so the second
     * comparison is redundant but harmless. Kept so that callers passing the
     * raw 'Y'/'N' token keep working.
     */
    protected function createAffixFlag($type, $flagName, $crossProduct) {
        return phpMorphy_Hunspell_AffixFlag::create(
            $type,
            $flagName,
            $crossProduct == 'Y'
        );
    }

    /**
     * Stores an option; a single-token value is stored as a scalar, anything
     * else as the token array. A repeated option overwrites the earlier value.
     *
     * @throws phpMorphy_Hunspell_Exception when the option is not supported
     */
    protected function handleOption($type, $options) {
        if(!$this->isAllowedOption($type, $options)) {
            throw new phpMorphy_Hunspell_Exception("Sorry, option '$type' not supported now");
        }

        if(count($options) == 1) {
            $options = $options[0];
        }

        $this->options[$type] = $options;
    }

    /** @return bool false for the options this implementation cannot handle */
    protected function isAllowedOption($type, $options) {
        return !in_array(
            $type,
            array(
                'FLAG', // alternative flag formats are not supported
                'AF',
                'AM'
            ),
            true
        );
    }
}

/**
 * Iterates over the non-empty lines of a dictionary (.dic) file and yields
 * each line as array('word' => ..., 'flags' => ..., 'morph' => ...).
 */
class phpMorphy_Hunspell_DictFile_Reader extends phpMorphy_Iterator_Transform {
    public function __construct($fileName, $encoding) {
        parent::__construct($this->createIterators($fileName, $encoding));
    }

    protected function createIterators($fileName, $encoding) {
        return new phpMorphy_Iterator_Iconv(
            new phpMorphy_Iterator_NotEmptyLines($this->createFileIterator($fileName)),
            $encoding
        );
    }

    protected function createFileIterator($fileName) {
        return new SplFileObject($fileName);
    }

    /**
     * Parses "word[/flags][<TAB>morph]".
     *
     * @return array with keys 'word' (string), 'flags' (string[]) and 'morph' (string)
     */
    protected function transformItem($item, $key) {
        $line = trim($item);
        $word = '';
        $flags = '';
        $morph = '';

        // Everything after the first TAB is the morphological description.
        if(false !== ($pos = mb_strpos($line, "\t"))) {
            $morph = trim(mb_substr($line, $pos + 1));
            $line = rtrim(mb_substr($line, 0, $pos));
        }

        // Everything after the first '/' is the flag list.
        if(false !== ($pos = mb_strpos($line, '/'))) {
            $word = rtrim(mb_substr($line, 0, $pos));
            $flags = ltrim(mb_substr($line, $pos + 1));
        } else {
            $word = $line;
        }

        return array(
            'word' => $word,
            'flags' => $this->parseFlags($flags),
            'morph' => $morph
        );
    }

    /**
     * Splits the flag list into single-character flags.
     *
     * TODO: long (two characters) and numeric flag formats are not supported,
     * only the basic one-character syntax.
     *
     * @param string $flags
     * @return string[]
     */
    protected function parseFlags($flags) {
        if(!strlen($flags)) {
            return array();
        }

        // Split per UTF-8 character rather than per byte, so that a non-ASCII
        // flag stays in one piece and can match the flag name from the affix file.
        $chars = preg_split('//u', $flags, -1, PREG_SPLIT_NO_EMPTY);

        // preg_split() fails on invalid UTF-8; fall back to the byte-wise split.
        return false === $chars ? str_split($flags) : $chars;
    }
}

/**
 * Dictionary (.dic) file bound to its affix file; expands every entry into
 * its word forms.
 */
class phpMorphy_Hunspell_DictFile {
    protected
        $file_name,
        $affix,
        $encoding
        ;

    /**
     * @param string                      $fileName
     * @param phpMorphy_Hunspell_AffixFile $affixFile
     * @param string|null                 $encoding dictionary encoding; taken from the
     *                                              affix file when null
     * @throws phpMorphy_Hunspell_Exception when no encoding is given and the
     *         affix file does not define one
     */
    public function __construct($fileName, phpMorphy_Hunspell_AffixFile $affixFile, $encoding = null) {
        $this->file_name = $fileName;
        $this->affix = $affixFile;

        if($encoding === null) {
            try {
                $encoding = $affixFile->getEncoding();
            } catch(Exception $e) {
                throw new phpMorphy_Hunspell_Exception("You must explicit specifiy encoding, because affix file dosn`t contain encoding");
            }
        }

        $this->encoding = $encoding;
    }

    protected function createDictReader() {
        return new phpMorphy_Hunspell_DictFile_Reader($this->file_name, $this->encoding);
    }

    /**
     * Walks the dictionary and invokes $callback(word, lemma, words, morphs)
     * for every entry. Iteration stops when the callback returns false.
     *
     * @param callable $callback
     */
    public function export($callback) {
        $reader = $this->createDictReader();
        $reader->rewind();

        // Skip the first line when it consists of digits only.
        if($reader->valid()) {
            $tokens = $reader->current();

            if(preg_match('~^[0-9]+$~', $tokens['word'])) {
                $reader->next();
            }
        }

        while($reader->valid()) {
            $result = $reader->current();
            $reader->next();

            $all_words = $this->generateWordForms($result['word'], $result['morph'], $result['flags']);

            $continue = call_user_func(
                $callback,
                $result['word'],
                $all_words['lemma'],
                $all_words['words'],
                $all_words['morphs']
            );

            if(false === $continue) {
                break;
            }
        }
    }

    /**
     * Expands $base with every known flag from $flagsList.
     *
     * @param string   $base      dictionary word
     * @param string   $baseMorph its morphological description
     * @param string[] $flagsList flag names; unknown flags are ignored
     * @return array with keys 'words' (all forms, $base first), 'morphs'
     *               (descriptions parallel to 'words') and 'lemma' ($base with
     *               the longest removed prefix and suffix cut off)
     */
    protected function generateWordForms($base, $baseMorph, $flagsList) {
        $prefix_flags = array();
        $suffix_flags = array();

        foreach($flagsList as $flag) {
            if(!$this->affix->isFlagExists($flag)) {
                continue;
            }

            $flag_obj = $this->affix->getFlag($flag);

            if($flag_obj->isSuffix()) {
                $suffix_flags[$flag] = $flag_obj;
            } else {
                $prefix_flags[$flag] = $flag_obj;
            }
        }

        $words = array($base);
        $morphs = array($baseMorph);

        // process prefixes
        $max_prefix_removed = $this->generateWordsForAffixes($base, $prefix_flags, $words, $baseMorph, $morphs);
        // process suffixes
        $max_suffix_removed = $this->generateWordsForAffixes($base, $suffix_flags, $words, $baseMorph, $morphs);

        // A zero negative length would yield an empty string, hence the branch.
        if($max_suffix_removed) {
            $lemma = mb_substr($base, $max_prefix_removed, -$max_suffix_removed);
        } else {
            $lemma = mb_substr($base, $max_prefix_removed);
        }

        // process cross product: every suffix flag is applied to every form
        // generated by a prefix flag, provided both flags allow it
        if(count($prefix_flags) && count($suffix_flags)) {
            foreach($prefix_flags as $prefix) {
                if(!$prefix->isCrossProduct()) {
                    continue;
                }

                $prefixed_bases = array();
                $prefixed_morphs = array();

                $prefix->generateWords($base, $prefixed_bases, $baseMorph, $prefixed_morphs);

                if(!count($prefixed_bases)) {
                    continue;
                }

                foreach($suffix_flags as $suffix) {
                    if(!$suffix->isCrossProduct()) {
                        continue;
                    }

                    $i = 0;
                    foreach($prefixed_bases as $prefixed_base) {
                        $suffix->generateWords($prefixed_base, $words, $prefixed_morphs[$i], $morphs);
                        $i++;
                    }
                }
            }
        }

        return array(
            'words' => $words,
            'morphs' => $morphs,
            'lemma' => $lemma
        );
    }

    /**
     * Applies every flag in $affixes to $base, collecting forms in $words/$morphs.
     *
     * @return int the largest remove length reported by the flags
     */
    protected function generateWordsForAffixes($base, $affixes, &$words, $wordMorph, &$morphs) {
        $max_removed = 0;

        foreach($affixes as $affix) {
            $removed_length = $affix->generateWords($base, $words, $wordMorph, $morphs);
            $max_removed = max($removed_length, $max_removed);
        }

        return $max_removed;
    }
}
| ver. 1.4 |
Github
|
.
| PHP 5.4.45-1~dotdeb+6.1 | Генерация страницы: 0 |
proxy
|
phpinfo
|
Настройка