| Current Path : /var/www/iplanru/data/www/i-plan.ru/libraries/phpmorphy/src/ |
| Current File : /var/www/iplanru/data/www/i-plan.ru/libraries/phpmorphy/src/morphiers.php |
<?php
/**
* This file is part of phpMorphy library
*
* Copyright c 2007-2008 Kamaev Vladimir <heromantor@users.sourceforge.net>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/
require_once(PHPMORPHY_DIR . '/gramtab.php');
require_once(PHPMORPHY_DIR . '/unicode.php');
// ----------------------------
// Morphier interface
// ----------------------------
interface phpMorphy_Morphier_Interface {
function getAnnot($word);
function getBaseForm($word);
function getAllForms($word);
function getPseudoRoot($word);
function getPartOfSpeech($word);
function getWordDescriptor($word);
function getAllFormsWithAncodes($word);
function getAncode($word);
function getGrammarInfoMergeForms($word);
function getGrammarInfo($word);
}
class phpMorphy_Morphier_Empty implements phpMorphy_Morphier_Interface {
function getAnnot($word) { return false; }
function getBaseForm($word) { return false; }
function getAllForms($word) { return false; }
function getAllFormsWithGramInfo($word) { return false; }
function getPseudoRoot($word) { return false; }
function getPartOfSpeech($word) { return false; }
function getWordDescriptor($word) { return false; }
function getAllFormsWithAncodes($word) { return false; }
function getAncode($word) { return false; }
function getGrammarInfoMergeForms($word) { return false; }
function getGrammarInfo($word) { return false; }
function castFormByGramInfo($word, $partOfSpeech, $grammems, $returnWords = false, $callback = null) { return false; }
}
// ----------------------------
// Annot decoder
// ----------------------------
interface phpMorphy_AnnotDecoder_Interface {
function decode($annotsRaw, $withBase);
};
abstract class phpMorphy_AnnotDecoder_Base implements phpMorphy_AnnotDecoder_Interface {
const INVALID_ANCODE_ID = 0xFFFF;
protected
$ends,
$unpack_str,
$block_size;
function __construct($ends) {
$this->ends = $ends;
$this->unpack_str = $this->getUnpackString();
$this->block_size = $this->getUnpackBlockSize();
}
abstract protected function getUnpackString();
abstract protected function getUnpackBlockSize();
function decode($annotRaw, $withBase) {
if(empty($annotRaw)) {
throw new phpMorphy_Exception("Empty annot given");
}
$unpack_str = $this->unpack_str;
$unpack_size = $this->block_size;
$result = unpack("Vcount/$unpack_str", $annotRaw);
if(false === $result) {
throw new phpMorphy_Exception("Invalid annot string '$annotRaw'");
}
if($result['common_ancode'] == self::INVALID_ANCODE_ID) {
$result['common_ancode'] = null;
}
$count = $result['count'];
$result = array($result);
if($count > 1) {
for($i = 0; $i < $count - 1; $i++) {
$res = unpack($unpack_str, $GLOBALS['__phpmorphy_substr']($annotRaw, 4 + ($i + 1) * $unpack_size, $unpack_size));
if($res['common_ancode'] == self::INVALID_ANCODE_ID) {
$res['common_ancode'] = null;
}
$result[] = $res;
}
}
if($withBase) {
$items = explode($this->ends, $GLOBALS['__phpmorphy_substr']($annotRaw, 4 + $count * $unpack_size));
for($i = 0; $i < $count; $i++) {
$result[$i]['base_prefix'] = $items[$i * 2];
$result[$i]['base_suffix'] = $items[$i * 2 + 1];
}
}
return $result;
}
}
class phpMorphy_AnnotDecoder_Common extends phpMorphy_AnnotDecoder_Base {
protected function getUnpackString() {
return 'Voffset/vcplen/vplen/vflen/vcommon_ancode/vforms_count/vpacked_forms_count/vaffixes_size/vform_no/vpos_id';
// return 'Voffset/vcplen/vplen/vflen/vcommon_ancode/vforms_count/vpacked_forms_count/vaffixes_size/vpos_id';
}
protected function getUnpackBlockSize() {
return 22;
}
}
class phpMorphy_AnnotDecoder_Predict extends phpMorphy_AnnotDecoder_Common {
protected function getUnpackString() {
// return 'Voffset/vcplen/vplen/vflen/vcommon_ancode/vforms_count/vpacked_forms_count/vaffixes_size/vform_no/vpos_id/vfreq';
return parent::getUnpackString() . '/vfreq';
}
protected function getUnpackBlockSize() {
return parent::getUnpackBlockSize() + 2;
}
}
class phpMorphy_AnnotDecoder_Factory {
protected static $instances = array();
protected
$cache_common,
$cache_predict,
$eos;
protected function __construct($eos) {
$this->eos = $eos;
}
static function create($eos) {
if(!isset(self::$instances[$eos])) {
self::$instances[$eos] = new phpMorphy_AnnotDecoder_Factory($eos);
}
return self::$instances[$eos];
}
function getCommonDecoder() {
if(!isset($this->cache_common)) {
$this->cache_common = $this->instantinate('common');
}
return $this->cache_common;
}
function getPredictDecoder() {
if(!isset($this->cache_predict)) {
$this->cache_predict = $this->instantinate('predict');
}
return $this->cache_predict;
}
protected function instantinate($type) {
$clazz = 'phpMorphy_AnnotDecoder_' . ucfirst($GLOBALS['__phpmorphy_strtolower']($type));
return new $clazz($this->eos);
}
}
interface phpMorphy_AncodesResolver_Interface {
function resolve($ancodeId);
function unresolve($ancode);
}
class phpMorphy_AncodesResolver_Proxy implements phpMorphy_AncodesResolver_Interface {
protected
$args, $class;
//$__obj;
function __construct($class, $ctorArgs) {
$this->class = $class;
$this->args = $ctorArgs;
}
function unresolve($ancode) {
return $this->__obj->unresolve($ancode);
}
function resolve($ancodeId) {
return $this->__obj->resolve($ancodeId);
}
static function instantinate($class, $args) {
$ref = new ReflectionClass($class);
return $ref->newInstanceArgs($args);
}
function __get($propName) {
if($propName === '__obj') {
$this->__obj = $this->instantinate($this->class, $this->args);
unset($this->args);
unset($this->class);
return $this->__obj;
}
throw new phpMorphy_Exception("Unknown '$propName' property");
}
}
class phpMorphy_AncodesResolver_ToText implements phpMorphy_AncodesResolver_Interface {
protected $gramtab;
function __construct(phpMorphy_GramTab_Interface $gramtab) {
$this->gramtab = $gramtab;
}
function resolve($ancodeId) {
if(!isset($ancodeId)) {
return null;
}
return $this->gramtab->ancodeToString($ancodeId);
}
function unresolve($ancode) {
return $this->gramtab->stringToAncode($ancode);
//throw new phpMorphy_Exception("Can`t convert grammar info in text into ancode id");
}
}
class phpMorphy_AncodesResolver_ToDialingAncodes implements phpMorphy_AncodesResolver_Interface {
protected
$ancodes_map,
$reverse_map;
function __construct(phpMorphy_Storage $ancodesMap) {
if(false === ($this->ancodes_map = unserialize($ancodesMap->read(0, $ancodesMap->getFileSize())))) {
throw new phpMorphy_Exception("Can`t open phpMorphy => Dialing ancodes map");
}
$this->reverse_map = array_flip($this->ancodes_map);
}
function unresolve($ancode) {
if(!isset($ancode)) {
return null;
}
if(!isset($this->reverse_map[$ancode])) {
throw new phpMorphy_Exception("Unknwon ancode found '$ancode'");
}
return $this->reverse_map[$ancode];
}
function resolve($ancodeId) {
if(!isset($ancodeId)) {
return null;
}
if(!isset($this->ancodes_map[$ancodeId])) {
throw new phpMorphy_Exception("Unknwon ancode id found '$ancodeId'");
}
return $this->ancodes_map[$ancodeId];
}
}
class phpMorphy_AncodesResolver_AsIs implements phpMorphy_AncodesResolver_Interface {
// This ctor for ReflectionClass::newInstanceArgs($args) with $args = array()
function __construct() {
}
function resolve($ancodeId) {
return $ancodeId;
}
function unresolve($ancode) {
return $ancode;
}
}
// ----------------------------
// Helper
// ----------------------------
class phpMorphy_Morphier_Helper {
protected
$graminfo,
$annot_decoder,
$char_size,
$ends,
$gramtab,
$ancodes_resolver,
$gramtab_consts_included = false,
$resolve_pos;
function __construct(
phpMorphy_GramInfo_Interace $graminfo,
phpMorphy_GramTab_Interface $gramtab,
phpMorphy_AncodesResolver_Interface $ancodesResolver,
$resolvePartOfSpeech
) {
$this->graminfo = $graminfo;
$this->gramtab = $gramtab;
$this->resolve_pos = (bool)$resolvePartOfSpeech;
$this->ancodes_resolver = $ancodesResolver;
$this->char_size = $graminfo->getCharSize();
$this->ends = $graminfo->getEnds();
}
function setAnnotDecoder(phpMorphy_AnnotDecoder_Interface $annotDecoder) {
$this->annot_decoder = $annotDecoder;
}
// getters
function getEndOfString() {
return $this->ends;
}
function getCharSize() {
return $this->char_size;
}
function hasAnnotDecoder() {
return isset($this->annot_decoder);
}
function getAnnotDecoder() {
return $this->annot_decoder;
}
function getAncodesResolver() {
return $this->ancodes_resolver;
}
function getGramInfo() {
return $this->graminfo;
}
function getGramTab() {
return $this->gramtab;
}
function isResolvePartOfSpeech() {
return $this->resolve_pos;
}
// other
function resolvePartOfSpeech($posId) {
return $this->gramtab->resolvePartOfSpeechId($posId);
}
function getGrammems($ancodeId) {
return $this->gramtab->getGrammems($ancodeId);
}
function getGrammemsAndPartOfSpeech($ancodeId) {
return array(
$this->gramtab->getPartOfSpeech($ancodeId),
$this->gramtab->getGrammems($ancodeId)
);
}
function extractPartOfSpeech($annot) {
if($this->resolve_pos) {
return $this->resolvePartOfSpeech($annot['pos_id']);
} else {
return $annot['pos_id'];
}
}
protected function includeGramTabConsts() {
if($this->isResolvePartOfSpeech()) {
$this->gramtab->includeConsts();
}
$this->gramtab_consts_included = true;
}
// getters
function getWordDescriptor($word, $annots) {
if(!$this->gramtab_consts_included) {
$this->includeGramTabConsts();
}
return new phpMorphy_WordDescriptor_Collection($word, $annots, $this);
}
protected function getBaseAndPrefix($word, $cplen, $plen, $flen) {
if($flen) {
$base = $GLOBALS['__phpmorphy_substr']($word, $cplen + $plen, -$flen);
} else {
if($cplen || $plen) {
$base = $GLOBALS['__phpmorphy_substr']($word, $cplen + $plen);
} else {
$base = $word;
}
}
$prefix = $cplen ? $GLOBALS['__phpmorphy_substr']($word, 0, $cplen) : '';
return array($base, $prefix);
}
function getPartOfSpeech($word, $annots) {
if(false === $annots) {
return false;
}
$result = array();
foreach($this->decodeAnnot($annots, false) as $annot) {
$result[$this->extractPartOfSpeech($annot)] = 1;
}
return array_keys($result);
}
function getBaseForm($word, $annots) {
if(false === $annots) {
return false;
}
$annots = $this->decodeAnnot($annots, true);
return $this->composeBaseForms($word, $annots);
}
function getPseudoRoot($word, $annots) {
if(false === $annots) {
return false;
}
$annots = $this->decodeAnnot($annots, false);
$result = array();
foreach($annots as $annot) {
list($base) = $this->getBaseAndPrefix(
$word,
$annot['cplen'],
$annot['plen'],
$annot['flen']
);
$result[$base] = 1;
}
return array_keys($result);
}
function getAllForms($word, $annots) {
if(false === $annots) {
return false;
}
$annots = $this->decodeAnnot($annots, false);
return $this->composeForms($word, $annots);
}
function castFormByGramInfo($word, $annots, $partOfSpeech, $grammems, $returnWords = false, $callback = null) {
if(false === $annots) {
return false;
}
if(isset($callback) && !is_callable($callback)) {
throw new phpMorphy_Exception("Invalid callback given");
}
$result = array();
$grammems = (array)$grammems;
$partOfSpeech = isset($partOfSpeech) ? (string)$partOfSpeech : null;
foreach($this->decodeAnnot($annots, false) as $annot) {
$all_ancodes = $this->graminfo->readAncodes($annot);
$flexias = $this->graminfo->readFlexiaData($annot);
$common_ancode = $annot['common_ancode'];
$common_grammems = isset($common_ancode) ? $this->gramtab->getGrammems($common_ancode) : array();
list($base, $prefix) = $this->getBaseAndPrefix(
$word,
$annot['cplen'],
$annot['plen'],
$annot['flen']
);
// i use strange $form_no handling for perfomance issue (no function call overhead)
$i = 0;
$form_no = 0;
foreach($all_ancodes as $form_ancodes) {
foreach($form_ancodes as $ancode) {
$form_pos = $this->gramtab->getPartOfSpeech($ancode);
$form_grammems = array_merge($this->gramtab->getGrammems($ancode), $common_grammems);
$form = $prefix . $flexias[$i] . $base . $flexias[$i + 1];
if(isset($callback)) {
if(!call_user_func($callback, $form, $form_pos, $form_grammems, $form_no)) {
$form_no++;
continue;
}
} else {
if(isset($partOfSpeech) && $form_pos !== $partOfSpeech) {
$form_no++;
continue;
}
if(count(array_diff($grammems, $form_grammems)) > 0) {
$form_no++;
continue;
}
}
if($returnWords) {
$result[$form] = 1;
} else {
$result[] = array(
'form' => $form,
'form_no' => $form_no,
'pos' => $form_pos,
'grammems' => $form_grammems
);
}
$form_no++;
}
$i += 2;
}
}
return $returnWords ? array_keys($result) : $result;
}
function getAncode($annots) {
if(false === $annots) {
return false;
}
$result = array();
foreach($this->decodeAnnot($annots, false) as $annot) {
$all_ancodes = $this->graminfo->readAncodes($annot);
$result[] = array(
'common' => $this->ancodes_resolver->resolve($annot['common_ancode']),
'all' => array_map(
array($this->ancodes_resolver, 'resolve'),
$all_ancodes[$annot['form_no']]
)
);
}
return $this->array_unique($result);
}
protected static function array_unique($array) {
static $need_own;
if(!isset($need_own)) {
$need_own = -1 === version_compare(PHP_VERSION, '5.2.9');
}
if($need_own) {
$result = array();
foreach(array_keys(array_unique(array_map('serialize', $array))) as $key) {
$result[$key] = $array[$key];
}
return $result;
} else {
return array_unique($array, SORT_REGULAR);
}
}
function getGrammarInfoMergeForms($annots) {
if(false === $annots) {
return false;
}
$result = array();
foreach($this->decodeAnnot($annots, false) as $annot) {
$all_ancodes = $this->graminfo->readAncodes($annot);
$common_ancode = $annot['common_ancode'];
$grammems = isset($common_ancode) ? $this->gramtab->getGrammems($common_ancode) : array();
$forms_count = 0;
$form_no = $annot['form_no'];
foreach($all_ancodes[$form_no] as $ancode) {
$grammems = array_merge($grammems, $this->gramtab->getGrammems($ancode));
$forms_count++;
}
$grammems = array_unique($grammems);
sort($grammems);
$result[] = array(
// part of speech identical across all joined forms
'pos' => $this->gramtab->getPartOfSpeech($ancode),
'grammems' => $grammems,
'forms_count' => $forms_count,
'form_no_low' => $form_no,
'form_no_high' => $form_no + $forms_count,
);
}
return $this->array_unique($result);
}
function getGrammarInfo($annots) {
if(false === $annots) {
return false;
}
$result = array();
foreach($this->decodeAnnot($annots, false) as $annot) {
$all_ancodes = $this->graminfo->readAncodes($annot);
$common_ancode = $annot['common_ancode'];
$common_grammems = isset($common_ancode) ? $this->gramtab->getGrammems($common_ancode) : array();
$info = array();
$form_no = $annot['form_no'];
foreach($all_ancodes[$form_no] as $ancode) {
$grammems = //array_unique(
array_merge($common_grammems, $this->gramtab->getGrammems($ancode));
//);
sort($grammems);
$info_item = array(
'pos' => $this->gramtab->getPartOfSpeech($ancode),
'grammems' => $grammems,
'form_no' => $form_no,
);
$info[] = $info_item;
}
$unique_info = $this->array_unique($info);
sort($unique_info);
$result[] = $unique_info;
}
return $this->array_unique($result);
}
function getAllFormsWithResolvedAncodes($word, $annots, $resolveType = 'no_resolve') {
if(false === $annots) {
return false;
}
$annots = $this->decodeAnnot($annots, false);
return $this->composeFormsWithResolvedAncodes($word, $annots);
}
function getAllFormsWithAncodes($word, $annots, &$foundFormNo = array()) {
if(false === $annots) {
return false;
}
$annots = $this->decodeAnnot($annots, false);
return $this->composeFormsWithAncodes($word, $annots, $foundFormNo);
}
function getAllAncodes($word, $annots) {
if(false === $annots) {
return false;
}
$result = array();
foreach($annots as $annot) {
$result[] = $this->graminfo->readAncodes($annot);
}
return $result;
}
protected function composeBaseForms($word, $annots) {
$result = array();
foreach($annots as $annot) {
if($annot['form_no'] > 0) {
list($base, $prefix) = $this->getBaseAndPrefix(
$word,
$annot['cplen'],
$annot['plen'],
$annot['flen']
);
$result[$prefix . $annot['base_prefix'] . $base . $annot['base_suffix']] = 1;
} else {
$result[$word] = 1;
}
}
return array_keys($result);
}
protected function composeForms($word, $annots) {
$result = array();
foreach($annots as $annot) {
list($base, $prefix) = $this->getBaseAndPrefix(
$word,
$annot['cplen'],
$annot['plen'],
$annot['flen']
);
// read flexia
$flexias = $this->graminfo->readFlexiaData($annot);
for($i = 0, $c = count($flexias); $i < $c; $i += 2) {
$result[$prefix . $flexias[$i] . $base . $flexias[$i + 1]] = 1;
}
}
return array_keys($result);
}
protected function composeFormsWithResolvedAncodes($word, $annots) {
$result = array();
foreach($annots as $annotIdx => $annot) {
list($base, $prefix) = $this->getBaseAndPrefix(
$word,
$annot['cplen'],
$annot['plen'],
$annot['flen']
);
$words = array();
$ancodes = array();
$common_ancode = $annot['common_ancode'];
// read flexia
$flexias = $this->graminfo->readFlexiaData($annot);
$all_ancodes = $this->graminfo->readAncodes($annot);
for($i = 0, $c = count($flexias); $i < $c; $i += 2) {
$form = $prefix . $flexias[$i] . $base . $flexias[$i + 1];
$current_ancodes = $all_ancodes[$i / 2];
foreach($current_ancodes as $ancode) {
$words[] = $form;
$ancodes[] = $this->ancodes_resolver->resolve($ancode);
}
}
$result[] = array(
'forms' => $words,
'common' => $this->ancodes_resolver->resolve($common_ancode),
'all' => $ancodes,
);
}
return $result;
}
protected function composeFormsWithAncodes($word, $annots, &$foundFormNo) {
$result = array();
foreach($annots as $annotIdx => $annot) {
list($base, $prefix) = $this->getBaseAndPrefix(
$word,
$annot['cplen'],
$annot['plen'],
$annot['flen']
);
// read flexia
$flexias = $this->graminfo->readFlexiaData($annot);
$ancodes = $this->graminfo->readAncodes($annot);
$found_form_no = $annot['form_no'];
$foundFormNo = !is_array($foundFormNo) ? array() : $foundFormNo;
for($i = 0, $c = count($flexias); $i < $c; $i += 2) {
$form_no = $i / 2;
$word = $prefix . $flexias[$i] . $base . $flexias[$i + 1];
if($found_form_no == $form_no) {
$count = count($result);
$foundFormNo[$annotIdx]['low'] = $count;
$foundFormNo[$annotIdx]['high'] = $count + count($ancodes[$form_no]) - 1;
}
foreach($ancodes[$form_no] as $ancode) {
$result[] = array($word, $ancode);
}
}
}
return $result;
}
function decodeAnnot($annotsRaw, $withBase) {
if(is_array($annotsRaw)) {
return $annotsRaw;
} else {
return $this->annot_decoder->decode($annotsRaw, $withBase);
}
}
}
// ----------------------------
// WordDescriptor
// ----------------------------
// TODO: extend ArrayObject?
class phpMorphy_WordDescriptor_Collection implements Countable, IteratorAggregate, ArrayAccess {
protected
$word,
$descriptors = array(),
$helper;
function __construct($word, $annots, phpMorphy_Morphier_Helper $helper) {
$this->word = (string)$word;
$this->annots = false === $annots ? false : $helper->decodeAnnot($annots, true);
$this->helper = $helper;
if(false !== $this->annots) {
foreach($this->annots as $annot) {
$this->descriptors[] = $this->createDescriptor($word, $annot, $helper);
}
}
}
protected function createDescriptor($word, $annot, phpMorphy_Morphier_Helper $helper) {
return new phpMorphy_WordDescriptor($word, $annot, $helper);
}
function getDescriptor($index) {
if(!$this->offsetExists($index)) {
throw new phpMorphy_Exception("Invalid index '$index' specified");
}
return $this->descriptors[$index];
}
function getByPartOfSpeech($poses) {
$result = array();
settype($poses, 'array');
foreach($this as $desc) {
if($desc->hasPartOfSpeech($poses)) {
$result[] = $desc;
}
}
// return count($result) ? $result : false;
return $result;
}
function offsetExists($off) {
return isset($this->descriptors[$off]);
}
function offsetUnset($off) {
throw new phpMorphy_Exception(__CLASS__ . " is not mutable");
}
function offsetSet($off, $value) {
throw new phpMorphy_Exception(__CLASS__ . " is not mutable");
}
function offsetGet($off) {
return $this->getDescriptor($off);
}
function count() {
return count($this->descriptors);
}
function getIterator() {
return new ArrayIterator($this->descriptors);
}
}
class phpMorphy_WordForm {
protected
$word,
$form_no,
$pos_id,
$grammems
;
function __construct($word, $form_no, $pos_id, $grammems) {
$this->word = (string)$word;
$this->form_no = (int)$form_no;
$this->pos_id = $pos_id;
sort($grammems);
$this->grammems = $grammems;
}
function getPartOfSpeech() {
return $this->pos_id;
}
function getGrammems() {
return $this->grammems;
}
function hasGrammems($grammems) {
$grammems = (array)$grammems;
$grammes_count = count($grammems);
return $grammes_count && count(array_intersect($grammems, $this->grammems)) == $grammes_count;
}
static function compareGrammems($a, $b) {
return count($a) == count($b) && count(array_diff($a, $b)) == 0;
}
function getWord() {
return $this->word;
}
function getFormNo() {
return $this->form_no;
}
}
class phpMorphy_WordDescriptor implements Countable, ArrayAccess, IteratorAggregate {
protected
$word,
$annot,
$helper,
$cached_forms,
$cached_base,
$cached_pseudo_root,
$all_forms,
$found_form_no,
$common_ancode_grammems;
function __construct($word, $annot, phpMorphy_Morphier_Helper $helper) {
$this->word = (string)$word;
$this->annot = array($annot);
$this->helper = $helper;
}
function getPseudoRoot() {
if(!isset($this->cached_pseudo_root)) {
list($this->cached_pseudo_root) = $this->helper->getPseudoRoot($this->word, $this->annot);
}
return $this->cached_pseudo_root;
}
function getBaseForm() {
if(!isset($this->cached_base)) {
list($this->cached_base) = $this->helper->getBaseForm($this->word, $this->annot);
}
return $this->cached_base;
}
function getAllForms() {
if(!isset($this->cached_forms)) {
$this->cached_forms = $this->helper->getAllForms($this->word, $this->annot);
}
return $this->cached_forms;
}
function getWordForm($index) {
$this->readAllForms();
if(!$this->offsetExists($index)) {
throw new phpMorphy_Exception("Invalid index '$index' given");
}
return $this->all_forms[$index];
}
protected function createWordForm($word, $form_no, $ancode) {
if(!isset($this->common_ancode_grammems)) {
$common_ancode = $this->annot[0]['common_ancode'];
$this->common_ancode_grammems = isset($common_ancode) ?
$this->helper->getGrammems($common_ancode) :
array();
}
list($pos_id, $all_grammems) = $this->helper->getGrammemsAndPartOfSpeech($ancode);
return new phpMorphy_WordForm($word, $form_no, $pos_id, array_merge($this->common_ancode_grammems, $all_grammems));
}
protected function readAllForms() {
if(!isset($this->all_forms)) {
$result = array();
$form_no = 0;
$found_form_no = array();
foreach($this->helper->getAllFormsWithAncodes($this->word, $this->annot, $found_form_no) as $form) {
$word = $form[0];
$result[] = $this->createWordForm($word, $form_no, $form[1]);
$form_no++;
}
$this->found_form_no = $found_form_no[0];
$this->all_forms = $result;
}
return $this->all_forms;
}
protected function getFoundFormNoLow() {
$this->readAllForms();
return $this->found_form_no['low'];
}
protected function getFoundFormNoHigh() {
$this->readAllForms();
return $this->found_form_no['high'];
}
function getFoundWordForm() {
$result = array();
for($i = $this->getFoundFormNoLow(), $c = $this->getFoundFormNoHigh() + 1; $i < $c; $i++) {
$result[] = $this->getWordForm($i);
}
return $result;
}
function hasGrammems($grammems) {
settype($grammems, 'array');
foreach($this as $wf) {
if($wf->hasGrammems($grammems)) {
return true;
}
}
return false;
}
function getWordFormsByGrammems($grammems) {
settype($grammems, 'array');
$result = array();
foreach($this as $wf) {
if($wf->hasGrammems($grammems)) {
$result[] = $wf;
}
}
return $result;
// return count($result) ? $result : false;
}
function hasPartOfSpeech($poses) {
settype($poses, 'array');
foreach($this as $wf) {
if(in_array($wf->getPartOfSpeech(), $poses, true)) {
return true;
}
}
return false;
}
function getWordFormsByPartOfSpeech($poses) {
settype($poses, 'array');
$result = array();
foreach($this as $wf) {
if(in_array($wf->getPartOfSpeech(), $poses, true)) {
$result[] = $wf;
}
}
return $result;
// return count($result) ? $result : false;
}
function count() {
return count($this->readAllForms());
}
function offsetExists($off) {
$this->readAllForms();
return isset($this->all_forms[$off]);
}
function offsetSet($off, $value) {
throw new phpMorphy_Exception(__CLASS__ . " is not mutable");
}
function offsetUnset($off) {
throw new phpMorphy_Exception(__CLASS__ . " is not mutable");
}
function offsetGet($off) {
return $this->getWordForm($off);
}
function getIterator() {
$this->readAllForms();
return new ArrayIterator($this->all_forms);
}
}
// ----------------------------
// Finders
// ----------------------------
interface phpMorphy_Morphier_Finder_Interface {
function findWord($word);
function decodeAnnot($raw, $withBase);
function getAnnotDecoder();
}
abstract class phpMorphy_Morphier_Finder_Base implements phpMorphy_Morphier_Finder_Interface {
protected
$annot_decoder,
$prev_word,
$prev_result = false;
function __construct(phpMorphy_AnnotDecoder_Interface $annotDecoder) {
$this->annot_decoder = $annotDecoder;
}
function findWord($word) {
if($this->prev_word === $word) {
return $this->prev_result;
}
$result = $this->doFindWord($word);
$this->prev_word = $word;
$this->prev_result = $result;
return $result;
}
function getAnnotDecoder() {
return $this->annot_decoder;
}
function decodeAnnot($raw, $withBase) {
return $this->annot_decoder->decode($raw, $withBase);
}
abstract protected function doFindWord($word);
}
class phpMorphy_Morphier_Finder_Common extends phpMorphy_Morphier_Finder_Base {
protected
$fsa,
$root;
function __construct(phpMorphy_Fsa_Interface $fsa, phpMorphy_AnnotDecoder_Interface $annotDecoder) {
parent::__construct($annotDecoder);
$this->fsa = $fsa;
$this->root = $this->fsa->getRootTrans();
}
function getFsa() {
return $this->fsa;
}
protected function doFindWord($word) {
$result = $this->fsa->walk($this->root, $word);
if(!$result['result'] || null === $result['annot']) {
return false;
}
return $result['annot'];
}
}
class phpMorphy_Morphier_Finder_Predict_Suffix extends phpMorphy_Morphier_Finder_Common {
protected
$min_suf_len,
$unicode;
function __construct(phpMorphy_Fsa_Interface $fsa, phpMorphy_AnnotDecoder_Interface $annotDecoder, $encoding, $minimalSuffixLength = 4) {
parent::__construct($fsa, $annotDecoder);
$this->min_suf_len = (int)$minimalSuffixLength;
$this->unicode = phpMorphy_UnicodeHelper::create($encoding);
}
protected function doFindWord($word) {
$word_len = $this->unicode->strlen($word);
if(!$word_len) {
return false;
}
for($i = 1, $c = $word_len - $this->min_suf_len; $i < $c; $i++) {
$word = $GLOBALS['__phpmorphy_substr']($word, $this->unicode->firstCharSize($word));
if(false !== ($result = parent::doFindWord($word))) {
break;
}
}
if($i < $c) {
//$known_len = $word_len - $i;
$unknown_len = $i;
return $result;
/*
return $this->fixAnnots(
$this->decodeAnnot($result, true),
$unknown_len
);
*/
} else {
return false;
}
}
protected function fixAnnots($annots, $len) {
for($i = 0, $c = count($annots); $i < $c; $i++) {
$annots[$i]['cplen'] = $len;
}
return $annots;
}
}
class phpMorphy_Morphier_PredictCollector extends phpMorphy_Fsa_WordsCollector {
protected
$used_poses = array(),
$annot_decoder,
$collected = 0;
function __construct($limit, phpMorphy_AnnotDecoder_Interface $annotDecoder) {
parent::__construct($limit);
$this->annot_decoder = $annotDecoder;
}
function collect($path, $annotRaw) {
if($this->collected > $this->limit) {
return false;
}
$used_poses =& $this->used_poses;
$annots = $this->decodeAnnot($annotRaw);
for($i = 0, $c = count($annots); $i < $c; $i++) {
$annot = $annots[$i];
$annot['cplen'] = $annot['plen'] = 0;
$pos_id = $annot['pos_id'];
if(isset($used_poses[$pos_id])) {
$result_idx = $used_poses[$pos_id];
if($annot['freq'] > $this->items[$result_idx]['freq']) {
$this->items[$result_idx] = $annot;
}
} else {
$used_poses[$pos_id] = count($this->items);
$this->items[] = $annot;
}
}
$this->collected++;
return true;
}
function clear() {
parent::clear();
$this->collected = 0;
$this->used_poses = array();
}
function decodeAnnot($annotRaw) {
return $this->annot_decoder->decode($annotRaw, true);
}
}
class phpMorphy_Morphier_Finder_Predict_Databse extends phpMorphy_Morphier_Finder_Common {
protected
$collector,
$unicode,
$graminfo,
$min_postfix_match;
function __construct(
phpMorphy_Fsa_Interface $fsa,
phpMorphy_AnnotDecoder_Interface $annotDecoder,
$encoding,
phpMorphy_GramInfo_Interace $graminfo,
$minPostfixMatch = 2,
$collectLimit = 32
) {
parent::__construct($fsa, $annotDecoder);
$this->graminfo = $graminfo;
$this->min_postfix_match = $minPostfixMatch;
$this->collector = $this->createCollector($collectLimit, $this->getAnnotDecoder());
$this->unicode = phpMorphy_UnicodeHelper::create($encoding);
}
protected function createAnnotDecoder() {
return phpmorphy_annot_decoder_new('predict');
}
protected function doFindWord($word) {
$rev_word = $this->unicode->strrev($word);
$result = $this->fsa->walk($this->root, $rev_word);
if($result['result'] && null !== $result['annot']) {
$annots = $result['annot'];
} else {
$match_len = $this->unicode->strlen($this->unicode->fixTrailing($GLOBALS['__phpmorphy_substr']($rev_word, 0, $result['walked'])));
if(null === ($annots = $this->determineAnnots($result['last_trans'], $match_len))) {
return false;
}
}
if(!is_array($annots)) {
$annots = $this->collector->decodeAnnot($annots);
}
return $this->fixAnnots($word, $annots);
}
protected function determineAnnots($trans, $matchLen) {
$annots = $this->fsa->getAnnot($trans);
if(null == $annots && $matchLen >= $this->min_postfix_match) {
$this->collector->clear();
$this->fsa->collect(
$trans,
$this->collector->getCallback()
);
$annots = $this->collector->getItems();
}
return $annots;
}
protected function fixAnnots($word, $annots) {
$result = array();
// remove all prefixes?
for($i = 0, $c = count($annots); $i < $c; $i++) {
$annot = $annots[$i];
$annot['cplen'] = $annot['plen'] = 0;
$flexias = $this->graminfo->readFlexiaData($annot, false);
$prefix = $flexias[$annot['form_no'] * 2];
$suffix = $flexias[$annot['form_no'] * 2 + 1];
$plen = $GLOBALS['__phpmorphy_strlen']($prefix);
$slen = $GLOBALS['__phpmorphy_strlen']($suffix);
if(
(!$plen || $GLOBALS['__phpmorphy_substr']($word, 0, $GLOBALS['__phpmorphy_strlen']($prefix)) === $prefix) &&
(!$slen || $GLOBALS['__phpmorphy_substr']($word, -$GLOBALS['__phpmorphy_strlen']($suffix)) === $suffix)
) {
$result[] = $annot;
}
}
return count($result) ? $result : false;
}
protected function createCollector($limit) {
return new phpMorphy_Morphier_PredictCollector($limit, $this->getAnnotDecoder());
}
}
// ----------------------------
// Morphiers
// ----------------------------
abstract class phpMorphy_Morphier_Base implements phpMorphy_Morphier_Interface {
protected
/**
* @var phpMorphy_Morphier_Finder_Interface
*/
$finder,
/**
* @var phpMorphy_Morphier_Helper
*/
$helper;
function __construct(phpMorphy_Morphier_Finder_Interface $finder, phpMorphy_Morphier_Helper $helper) {
$this->finder = $finder;
$this->helper = clone $helper;
$this->helper->setAnnotDecoder($finder->getAnnotDecoder());
}
/**
* @return phpMorphy_Morphier_Finder_Interface
*/
function getFinder() {
return $this->finder;
}
/**
* @return phpMorphy_Morphier_Helper
*/
function getHelper() {
return $this->helper;
}
function getAnnot($word) {
if(false === ($annots = $this->finder->findWord($word))) {
return false;
}
return $this->helper->decodeAnnot($annots, true);
}
function getWordDescriptor($word) {
if(false === ($annots = $this->finder->findWord($word))) {
return false;
}
return $this->helper->getWordDescriptor($word, $annots);
}
function getAllFormsWithAncodes($word) {
if(false === ($annots = $this->finder->findWord($word))) {
return false;
}
return $this->helper->getAllFormsWithResolvedAncodes($word, $annots);
}
function getPartOfSpeech($word) {
if(false === ($annots = $this->finder->findWord($word))) {
return false;
}
return $this->helper->getPartOfSpeech($word, $annots);
}
function getBaseForm($word) {
if(false === ($annots = $this->finder->findWord($word))) {
return false;
}
return $this->helper->getBaseForm($word, $annots);
}
function getPseudoRoot($word) {
if(false === ($annots = $this->finder->findWord($word))) {
return false;
}
return $this->helper->getPseudoRoot($word, $annots);
}
function getAllForms($word) {
if(false === ($annots = $this->finder->findWord($word))) {
return false;
}
return $this->helper->getAllForms($word, $annots);
}
function getAncode($word) {
if(false === ($annots = $this->finder->findWord($word))) {
return false;
}
return $this->helper->getAncode($annots);
}
function getGrammarInfo($word) {
if(false === ($annots = $this->finder->findWord($word))) {
return false;
}
return $this->helper->getGrammarInfo($annots);
}
function getGrammarInfoMergeForms($word) {
if(false === ($annots = $this->finder->findWord($word))) {
return false;
}
return $this->helper->getGrammarInfoMergeForms($annots);
}
function castFormByGramInfo($word, $partOfSpeech, $grammems, $returnOnlyWord = false, $callback = null) {
if(false === ($annots = $this->finder->findWord($word))) {
return false;
}
return $this->helper->castFormByGramInfo($word, $annots);
}
function castFormByPattern($word, $patternWord, $returnOnlyWord = false, $callback = null) {
if(false === ($orig_annots = $this->finder->findWord($word))) {
return false;
}
if(false === ($pattern_annots = $this->finder->findWord($patternWord))) {
return false;
}
return $this->helper->castFormByPattern(
$word, $orig_annots,
$patternWord, $pattern_annots,
$returnOnlyWord,
$callback
);
}
};
class phpMorphy_Morphier_Common extends phpMorphy_Morphier_Base {
function __construct(phpMorphy_Fsa_Interface $fsa, phpMorphy_Morphier_Helper $helper) {
parent::__construct(
new phpMorphy_Morphier_Finder_Common(
$fsa,
$this->createAnnotDecoder($helper)
),
$helper
);
}
protected function createAnnotDecoder(phpMorphy_Morphier_Helper $helper) {
return phpMorphy_AnnotDecoder_Factory::create($helper->getGramInfo()->getEnds())->getCommonDecoder();
}
};
class phpMorphy_Morphier_Predict_Suffix extends phpMorphy_Morphier_Base {
function __construct(phpMorphy_Fsa_Interface $fsa, phpMorphy_Morphier_Helper $helper) {
parent::__construct(
new phpMorphy_Morphier_Finder_Predict_Suffix(
$fsa,
$this->createAnnotDecoder($helper),
$helper->getGramInfo()->getEncoding(),
4
),
$helper
);
}
protected function createAnnotDecoder(phpMorphy_Morphier_Helper $helper) {
return phpMorphy_AnnotDecoder_Factory::create($helper->getGramInfo()->getEnds())->getCommonDecoder();
}
}
class phpMorphy_Morphier_Predict_Database extends phpMorphy_Morphier_Base {
function __construct(phpMorphy_Fsa_Interface $fsa, phpMorphy_Morphier_Helper $helper) {
parent::__construct(
new phpMorphy_Morphier_Finder_Predict_Databse(
$fsa,
$this->createAnnotDecoder($helper),
$helper->getGramInfo()->getEncoding(),
$helper->getGramInfo(),
2,
32
),
$helper
);
}
protected function createAnnotDecoder(phpMorphy_Morphier_Helper $helper) {
return phpMorphy_AnnotDecoder_Factory::create($helper->getGramInfo()->getEnds())->getPredictDecoder();
}
}
class phpMorphy_Morphier_Bulk implements phpMorphy_Morphier_Interface {
protected
$fsa,
$root_trans,
$helper,
$notfound = array(),
$graminfo;
function __construct(phpMorphy_Fsa_Interface $fsa, phpMorphy_Morphier_Helper $helper) {
$this->fsa = $fsa;
$this->root_trans = $fsa->getRootTrans();
$this->helper = clone $helper;
$this->helper->setAnnotDecoder($this->createAnnotDecoder($helper));
$this->graminfo = $helper->getGramInfo();
}
function getFsa() {
return $this->fsa;
}
function getHelper() {
return $this->helper;
}
function getGraminfo() {
return $this->graminfo;
}
function getNotFoundWords() {
return $this->notfound;
}
protected function createAnnotDecoder(phpMorphy_Morphier_Helper $helper) {
return new phpMorphy_AnnotDecoder_Common($helper->getGramInfo()->getEnds());
}
function getAnnot($word) {
$result = array();
foreach($this->findWord($word) as $annot => $words) {
$annot = $this->helper->decodeAnnot($annot, true);
foreach($words as $word) {
$result[$word][] = $annot;
}
}
return $result;
}
function getBaseForm($words) {
$annots = $this->findWord($words);
return $this->composeForms($annots, true, false, false);
}
function getAllForms($words) {
$annots = $this->findWord($words);
return $this->composeForms($annots, false, false, false);
}
function getPseudoRoot($words) {
$annots = $this->findWord($words);
return $this->composeForms($annots, false, true, false);
}
function getPartOfSpeech($words) {
$annots = $this->findWord($words);
return $this->composeForms($annots, false, false, true);
}
protected function processAnnotsWithHelper($words, $method, $callWithWord = false) {
$result = array();
foreach($this->findWord($words) as $annot_raw => $words) {
if($GLOBALS['__phpmorphy_strlen']($annot_raw) == 0) continue;
if($callWithWord) {
foreach($words as $word) {
$result[$word] = $this->helper->$method($word, $annot_raw);
}
} else {
$result_for_annot = $this->helper->$method($annot_raw);
foreach($words as $word) {
$result[$word] = $result_for_annot;
}
}
}
return $result;
}
function getAncode($words) {
return $this->processAnnotsWithHelper($words, 'getAncode');
}
function getGrammarInfoMergeForms($words) {
return $this->processAnnotsWithHelper($words, 'getGrammarInfoMergeForms');
}
function getGrammarInfo($words) {
return $this->processAnnotsWithHelper($words, 'getGrammarInfo');
}
function getAllFormsWithAncodes($words) {
return $this->processAnnotsWithHelper($words, 'getAllFormsWithResolvedAncodes', true);
}
function getWordDescriptor($word) {
return $this->processAnnotsWithHelper($words, 'getWordDescriptor', true);
}
protected function findWord($words) {
$unknown_words_annot = '';
$this->notfound = array();
list($labels, $finals, $dests) = $this->buildPatriciaTrie($words);
$annots = array();
$unknown_words_annot = '';
$stack = array(0, '', $this->root_trans);
$stack_idx = 0;
$fsa = $this->fsa;
// TODO: Improve this
while($stack_idx >= 0) {
$n = $stack[$stack_idx];
$path = $stack[$stack_idx + 1] . $labels[$n];
$trans = $stack[$stack_idx + 2];
$stack_idx -= 3; // TODO: Remove items from stack? (performance!!!)
$is_final = $finals[$n] > 0;
$result = false;
if(false !== $trans && $n > 0) {
$label = $labels[$n];
$result = $fsa->walk($trans, $label, $is_final);
if($GLOBALS['__phpmorphy_strlen']($label) == $result['walked']) {
$trans = $result['word_trans'];
} else {
$trans = false;
}
}
if($is_final) {
if(false !== $trans && isset($result['annot'])) {
$annots[$result['annot']][] = $path;
} else {
//$annots[$unknown_words_annot][] = $path;
$this->notfound[] = $path;
}
}
if(false !== $dests[$n]) {
foreach($dests[$n] as $dest) {
$stack_idx += 3;
$stack[$stack_idx] = $dest;
$stack[$stack_idx + 1] = $path;
$stack[$stack_idx + 2] = $trans;
}
}
}
return $annots;
}
protected function composeForms($annotsRaw, $onlyBase, $pseudoRoot, $partOfSpeech) {
$result = array();
// process found annotations
foreach($annotsRaw as $annot_raw => $words) {
if($GLOBALS['__phpmorphy_strlen']($annot_raw) == 0) continue;
foreach($this->helper->decodeAnnot($annot_raw, $onlyBase) as $annot) {
if(!($onlyBase || $pseudoRoot)) {
$flexias = $this->graminfo->readFlexiaData($annot);
}
$cplen = $annot['cplen'];
$plen = $annot['plen'];
$flen = $annot['flen'];
if($partOfSpeech) {
$pos_id = $this->helper->extractPartOfSpeech($annot);
}
foreach($words as $word) {
if($flen) {
$base = $GLOBALS['__phpmorphy_substr']($word, $cplen + $plen, -$flen);
} else {
if($cplen || $plen) {
$base = $GLOBALS['__phpmorphy_substr']($word, $cplen + $plen);
} else {
$base = $word;
}
}
$prefix = $cplen ? $GLOBALS['__phpmorphy_substr']($word, 0, $cplen) : '';
if($pseudoRoot) {
$result[$word][$base] = 1;
} else if($onlyBase) {
$form = $prefix . $annot['base_prefix'] . $base . $annot['base_suffix'];
$result[$word][$form] = 1;
} else if($partOfSpeech) {
$result[$word][$pos_id] = 1;
} else {
for($i = 0, $c = count($flexias); $i < $c; $i += 2) {
$form = $prefix . $flexias[$i] . $base . $flexias[$i + 1];
$result[$word][$form] = 1;
}
}
}
}
}
for($keys = array_keys($result), $i = 0, $c = count($result); $i < $c; $i++) {
$key = $keys[$i];
$result[$key] = array_keys($result[$key]);
}
return $result;
}
protected function buildPatriciaTrie($words) {
if(!is_array($words)) {
throw new phpMorphy_Exception("Words must be array");
}
sort($words);
$stack = array();
$prev_word = '';
$prev_word_len = 0;
$prev_lcp = 0;
$state_labels = array();
$state_finals = array();
$state_dests = array();
$state_labels[] = '';
$state_finals = '0';
$state_dests[] = array();
$node = 0;
foreach($words as $word) {
if($word == $prev_word) {
continue;
}
$word_len = $GLOBALS['__phpmorphy_strlen']($word);
// find longest common prefix
for($lcp = 0, $c = min($prev_word_len, $word_len); $lcp < $c && $word[$lcp] == $prev_word[$lcp]; $lcp++);
if($lcp == 0) {
$stack = array();
$new_state_id = count($state_labels);
$state_labels[] = $word;
$state_finals .= '1';
$state_dests[] = false;
$state_dests[0][] = $new_state_id;
$node = $new_state_id;
} else {
$need_split = true;
$trim_size = 0; // for split
if($lcp == $prev_lcp) {
$need_split = false;
$node = $stack[count($stack) - 1];
} elseif($lcp > $prev_lcp) {
if($lcp == $prev_word_len) {
$need_split = false;
} else {
$need_split = true;
$trim_size = $lcp - $prev_lcp;
}
$stack[] = $node;
} else {
$trim_size = $GLOBALS['__phpmorphy_strlen']($prev_word) - $lcp;
for($stack_size = count($stack) - 1; ;--$stack_size) {
$trim_size -= $GLOBALS['__phpmorphy_strlen']($state_labels[$node]);
if($trim_size <= 0) {
break;
}
if(count($stack) < 1) {
throw new phpMorphy_Exception('Infinite loop posible');
}
$node = array_pop($stack);
}
$need_split = $trim_size < 0;
$trim_size = abs($trim_size);
if($need_split) {
$stack[] = $node;
} else {
$node = $stack[$stack_size];
}
}
if($need_split) {
$node_key = $state_labels[$node];
// split
$new_node_id_1 = count($state_labels);
$new_node_id_2 = $new_node_id_1 + 1;
// new_node_1
$state_labels[] = $GLOBALS['__phpmorphy_substr']($node_key, $trim_size);
$state_finals .= $state_finals[$node];
$state_dests[] = $state_dests[$node];
// adjust old node
$state_labels[$node] = $GLOBALS['__phpmorphy_substr']($node_key, 0, $trim_size);
$state_finals[$node] = '0';
$state_dests[$node] = array($new_node_id_1);
// append new node, new_node_2
$state_labels[] = $GLOBALS['__phpmorphy_substr']($word, $lcp);
$state_finals .= '1';
$state_dests[] = false;
$state_dests[$node][] = $new_node_id_2;
$node = $new_node_id_2;
} else {
$new_node_id = count($state_labels);
$state_labels[] = $GLOBALS['__phpmorphy_substr']($word, $lcp);
$state_finals .= '1';
$state_dests[] = false;
if(false !== $state_dests[$node]) {
$state_dests[$node][] = $new_node_id;
} else {
$state_dests[$node] = array($new_node_id);
}
$node = $new_node_id;
}
}
$prev_word = $word;
$prev_word_len = $word_len;
$prev_lcp = $lcp;
}
return array($state_labels, $state_finals, $state_dests);
}
}