__author__ = 'wanghuafeng'
#coding:utf-8
import os
import sys
import codecs
from collections import deque
# Directory containing this module; used to locate the bundled data files.
try:
    PATH = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # __file__ is not defined in every execution context (e.g. an
    # interactive session), so fall back to the current working directory.
    PATH = os.getcwd()
class SlicerBase(object):
def __init__(self, options=None):
if not options:
options = {}
self.options = options
if not self.options.has_key('vocab_file') or not self.options['vocab_file']:
self.options['vocab_file'] = os.path.join(PATH, 'data', 'Cizu_and_singleword_komoxo95K.txt')
self.total_base_word_set = self._load_base_wordlist()
def _load_base_wordlist(self):
with codecs.open(self.options['vocab_file'], encoding='utf-8') as f:
total_base_word_set = set([item.split('\t')[0] for item in f.readlines()])
return total_base_word_set
def to_unicode(self, sentence):
sentence = sentence.strip()
if isinstance(sentence, str):
try:
Python 切词算法(正向切割、反向切割)
最新推荐文章于 2025-05-27 11:56:30 发布