# -*- coding:utf-8 -*-
from collections import defaultdict
import sys
import logging
import jieba
import json
from six import iteritems, iterkeys, itervalues, string_types
# Python 3 has no `unicode` builtin; alias it so the py2-style decoding in doc2bow still works
if sys.version_info[0] >= 3:
    unicode = str
logger = logging.getLogger(__name__)
class Dictionary(object):
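    """Mapping between tokens and their integer ids.

    Essentially a trimmed copy of gensim's corpora.Dictionary, keeping only the
    pieces used by the demo below (add_documents / doc2bow).
    """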
def __init__(self, documents=None, prune_at=2000000):
        self.token2id = {}  # token -> token id
        self.id2token = {}  # token id -> token (reverse mapping; not populated in this trimmed version)
        self.dfs = {}  # token id -> in how many documents the token appeared
        self.num_docs = 0  # number of documents processed
        self.num_pos = 0  # total number of corpus positions (i.e. processed tokens)
        self.num_nnz = 0  # total number of non-zero entries in the bag-of-words matrix
if documents is not None:
self.add_documents(documents, prune_at=prune_at)
def __len__(self):
return len(self.token2id)
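    def __str__(self):
        # Modeled on gensim's Dictionary.__str__, so the log messages in
        # add_documents() show something readable instead of a bare object repr.
        some_keys = list(self.token2id)[:5]
        return "Dictionary(%i unique tokens: %s%s)" % (
            len(self), some_keys, '...' if len(self) > 5 else ''
        )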
def add_documents(self, documents, prune_at=2000000):
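        """Build the dictionary from `documents`, where each document is a list
        of tokens. Every 10k documents the vocabulary is pruned back to at most
        `prune_at` unique tokens to keep memory use bounded."""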
for docno, document in enumerate(documents):
# log progress & run a regular check for pruning, once every 10k docs
if docno % 10000 == 0:
if prune_at is not None and len(self) > prune_at:
self.filter_extremes(no_below=0, no_above=1.0, keep_n=prune_at)
logger.info("adding document #%i to %s", docno, self)
# update Dictionary with the document
self.doc2bow(document, allow_update=True) # ignore the result, here we only care about updating token ids
logger.info(
"built %s from %i documents (total %i corpus positions)",
self, self.num_docs, self.num_pos
)
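    def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000):
        # add_documents() prunes via this method, which the trimmed class was
        # otherwise missing. This is a minimal sketch modeled on gensim's
        # Dictionary.filter_extremes (no logging, no id2token rebuild): keep
        # tokens contained in at least `no_below` documents and at most
        # `no_above` (a fraction) of all documents, then keep only the `keep_n`
        # most frequent of those and compact the ids.
        no_above_abs = int(no_above * self.num_docs)
        good_ids = [
            tokenid for tokenid, df in iteritems(self.dfs)
            if no_below <= df <= no_above_abs
        ]
        if keep_n is not None:
            good_ids = sorted(good_ids, key=self.dfs.get, reverse=True)[:keep_n]
        good_ids = set(good_ids)
        old_token2id, old_dfs = self.token2id, self.dfs
        kept_tokens = sorted(t for t, i in iteritems(old_token2id) if i in good_ids)
        # re-assign compact, gap-free ids; doc2bow assumes new ids == len(token2id)
        self.token2id = {token: new_id for new_id, token in enumerate(kept_tokens)}
        self.dfs = {new_id: old_dfs[old_token2id[token]] for token, new_id in iteritems(self.token2id)}
        self.id2token = {}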
def doc2bow(self, document, allow_update=False, return_missing=False):
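        """Convert `document` (a list of tokens) into a sparse bag-of-words
        vector: a list of (token_id, token_count) 2-tuples sorted by token id.
        With `allow_update`, unseen tokens get new ids and the document
        frequency statistics are updated; with `return_missing`, also return
        the tokens that were not already in the dictionary, with their counts."""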
if isinstance(document, string_types):
raise TypeError("doc2bow expects an array of unicode tokens on input, not a single string")
# Construct (word, frequency) mapping.
counter = defaultdict(int)
for w in document:
counter[w if isinstance(w, unicode) else unicode(w, 'utf-8')] += 1
        print(json.dumps(counter, ensure_ascii=False))  # debug: per-document token counts
token2id = self.token2id
if allow_update or return_missing:
missing = sorted(x for x in iteritems(counter) if x[0] not in token2id)
if allow_update:
for w, _ in missing:
# new id = number of ids made so far;
# NOTE this assumes there are no gaps in the id sequence!
token2id[w] = len(token2id)
result = {token2id[w]: freq for w, freq in iteritems(counter) if w in token2id}
if allow_update:
self.num_docs += 1
self.num_pos += sum(itervalues(counter))
self.num_nnz += len(result)
# increase document count for each unique token that appeared in the document
dfs = self.dfs
for tokenid in iterkeys(result):
dfs[tokenid] = dfs.get(tokenid, 0) + 1
# return tokenids, in ascending id order
result = sorted(iteritems(result))
if return_missing:
return result, dict(missing)
else:
return result
if __name__ == '__main__':
    doc0 = "我不喜欢上海"  # "I don't like Shanghai"
    doc1 = "上海是一个好地方"  # "Shanghai is a nice place"
    doc2 = "北京是一个好地方"  # "Beijing is a nice place"
    doc3 = "上海好吃的在哪里"  # "Where is the good food in Shanghai?"
    doc4 = "上海好玩的在哪里"  # "Where are the fun places in Shanghai?"
    doc5 = "上海是好地方"  # "Shanghai is a nice place"
    doc6 = "上海路和上海人"  # "Shanghai Road and Shanghai people"
    doc7 = "喜欢小吃"  # "(I) like snacks"
    doc_test = "我喜欢上海的小吃"  # "I like Shanghai's snacks"
    all_doc = [doc0, doc1, doc2, doc3, doc4, doc5, doc6, doc7]
    # segment each document into a list of tokens with jieba
    all_doc_list = [list(jieba.cut(doc)) for doc in all_doc]
    doc_test_list = list(jieba.cut(doc_test))
dictionary = Dictionary(all_doc_list)
    # example usage (commented out): inspect the tokenized docs, the learned
    # vocabulary, and the bag-of-words vectors produced by doc2bow
    # print(all_doc_list)
    # print(json.dumps(dictionary.token2id, ensure_ascii=False))
    # corpus = [dictionary.doc2bow(doc) for doc in all_doc_list]
    # doc_test_vec = dictionary.doc2bow(doc_test_list)