# -*- coding:utf-8 -*-
from collections import defaultdict
import sys
import logging
import jieba
import json
from six import iteritems, iterkeys, itervalues, string_types
# Python 3 has no `unicode` builtin; alias it so the py2-style decoding in doc2bow still works
if sys.version_info[0] >= 3:
    unicode = str
logger = logging.getLogger(__name__)
class Dictionary(object):
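    """Mapping between tokens and their integer ids.

    Essentially a trimmed copy of gensim's corpora.Dictionary, keeping only the
    pieces used by the demo below (add_documents / doc2bow).
    """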
def __init__(self, documents=None, prune_at=2000000):
        self.token2id = {}  # token -> token id
        self.id2token = {}  # token id -> token (reverse mapping; not populated in this trimmed version)
        self.dfs = {}  # token id -> in how many documents the token appeared
        self.num_docs = 0  # number of documents processed
        self.num_pos = 0  # total number of corpus positions (i.e. processed tokens)
        self.num_nnz = 0  # total number of non-zero entries in the bag-of-words matrix
if documents is not None:
self.add_documents(documents, prune_at=prune_at)
def __len__(self):
return len(self.token2id)
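    def __str__(self):
        # Modeled on gensim's Dictionary.__str__, so the log messages in
        # add_documents() show something readable instead of a bare object repr.
        some_keys = list(self.token2id)[:5]
        return "Dictionary(%i unique tokens: %s%s)" % (
            len(self), some_keys, '...' if len(self) > 5 else ''
        )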
def add_documents(self, documents, prune_at=2000000):
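        """Build the dictionary from `documents`, where each document is a list
        of tokens. Every 10k documents the vocabulary is pruned back to at most
        `prune_at` unique tokens to keep memory use bounded."""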
for docno, document in enumerate(documents):
# log progress & run a regular check for pruning, once every 10k docs
if docno % 10000 == 0:
if prune_at is not None and len(self) > prune_at:
self.filter_extremes(no_below=0, no_above=1.0, keep_n=prune_at)
logger.info("adding document #%i to %s", docno, self)
# update Dictionary with the document
self.doc2bow(document, allow_update=True) # ignore the result, here we only care about updating token ids
logger.info(
"built %s from %i documents (total %i corpus positions)",
self, self.num_docs, self.num_pos
)
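    def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000):
        # add_documents() prunes via this method, which the trimmed class was
        # otherwise missing. This is a minimal sketch modeled on gensim's
        # Dictionary.filter_extremes (no logging, no id2token rebuild): keep
        # tokens contained in at least `no_below` documents and at most
        # `no_above` (a fraction) of all documents, then keep only the `keep_n`
        # most frequent of those and compact the ids.
        no_above_abs = int(no_above * self.num_docs)
        good_ids = [
            tokenid for tokenid, df in iteritems(self.dfs)
            if no_below <= df <= no_above_abs
        ]
        if keep_n is not None:
            good_ids = sorted(good_ids, key=self.dfs.get, reverse=True)[:keep_n]
        good_ids = set(good_ids)
        old_token2id, old_dfs = self.token2id, self.dfs
        kept_tokens = sorted(t for t, i in iteritems(old_token2id) if i in good_ids)
        # re-assign compact, gap-free ids; doc2bow assumes new ids == len(token2id)
        self.token2id = {token: new_id for new_id, token in enumerate(kept_tokens)}
        self.dfs = {new_id: old_dfs[old_token2id[token]] for token, new_id in iteritems(self.token2id)}
        self.id2token = {}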
def doc2bow(self, document, allow_update=False, return_missing=False):
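        """Convert `document` (a list of tokens) into a sparse bag-of-words
        vector: a list of (token_id, token_count) 2-tuples sorted by token id.
        With `allow_update`, unseen tokens get new ids and the document
        frequency statistics are updated; with `return_missing`, also return
        the tokens that were not already in the dictionary, with their counts."""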
if isinstance(document, string_types):
raise TypeError("doc2bow expects an array of unicode tokens on input, not a single string")
# Construct (word, frequency) mapping.
counter = defaultdict(int)
for w in document:
counter[w if isinstance(w, unicode) else unicode(w, 'utf-8')] += 1
        print(json.dumps(counter, ensure_ascii=False))  # debug: per-document token counts
token2id = self.token2id
if allow_update or return_missing:
missing = sorted(x for x in iteritems(counter) if x[0] not in token2id)
if allow_update:
for w, _ in missing:
# new id = number of ids made so far;
# NOTE this assumes there are no gaps in the id sequence!
token2id[w] = len(token2id)
result = {token2id[w]: freq for w, freq in iteritems(counter) if w in token2id}
if allow_update:
self.num_docs += 1
self.num_pos += sum(itervalues(counter))
self.num_nnz += len(result)
# increase document count for each unique token that appeared in the document
dfs = self.dfs
for tokenid in iterkeys(result):
dfs[tokenid] = dfs.get(tokenid, 0) + 1
# return tokenids, in ascending id order
result = sorted(iteritems(result))
if return_missing:
return result, dict(missing)
else:
return result
if __name__ == '__main__':
    doc0 = "我不喜欢上海"  # "I don't like Shanghai"
    doc1 = "上海是一个好地方"  # "Shanghai is a nice place"
    doc2 = "北京是一个好地方"  # "Beijing is a nice place"
    doc3 = "上海好吃的在哪里"  # "Where is the good food in Shanghai?"
    doc4 = "上海好玩的在哪里"  # "Where are the fun places in Shanghai?"
    doc5 = "上海是好地方"  # "Shanghai is a nice place"
    doc6 = "上海路和上海人"  # "Shanghai Road and Shanghai people"
    doc7 = "喜欢小吃"  # "(I) like snacks"
    doc_test = "我喜欢上海的小吃"  # "I like Shanghai's snacks"
    all_doc = [doc0, doc1, doc2, doc3, doc4, doc5, doc6, doc7]
    # segment each document into a list of tokens with jieba
    all_doc_list = [list(jieba.cut(doc)) for doc in all_doc]
    doc_test_list = list(jieba.cut(doc_test))
dictionary = Dictionary(all_doc_list)
    # example usage (commented out): inspect the tokenized docs, the learned
    # vocabulary, and the bag-of-words vectors produced by doc2bow
    # print(all_doc_list)
    # print(json.dumps(dictionary.token2id, ensure_ascii=False))
    # corpus = [dictionary.doc2bow(doc) for doc in all_doc_list]
    # doc_test_vec = dictionary.doc2bow(doc_test_list)