Python 实现倒排索引

git: https://github.com/muyuxuebao/reverseIndex



# -*- coding:utf-8 -*-

import re


def token_stream(line):
    """Split a line of text into word tokens.

    Returns the list of every maximal run of regex word characters found in
    ``line``, in order of appearance; the list is empty when there are none.
    """
    word_pattern = re.compile(r'\w+', re.I)
    return word_pattern.findall(line)


def mapper(lineNum, list):
    """Emit one ``1`` per token occurrence, grouped by "<lineNum>:<token>".

    Args:
        lineNum: Line number the tokens came from; stringified into the key.
        list: Iterable of token strings from that line. The name shadows the
            builtin but is kept so existing keyword callers keep working.

    Returns:
        dict mapping "<lineNum>:<token>" to a list holding one ``1`` for each
        occurrence of that token in ``list``.
    """
    dic = {}
    prefix = str(lineNum) + ':'
    for item in list:
        # setdefault replaces the dict.has_key() check, which no longer
        # exists in Python 3, and drops the redundant re-assignment.
        dic.setdefault(prefix + item, []).append(1)

    return dic


def reducer(dic):
    """Regroup "<lineNum>:<word>" entries into per-word posting lists.

    Args:
        dic: dict whose keys have the form "<lineNum>:<word>"; each value is
            stringified into the posting.

    Returns:
        dict mapping each word to a list of "<lineNum>:<value>" strings, in
        the iteration order of ``dic``.

    Raises:
        ValueError: if a key contains no ":" separator.
    """
    rdic = {}
    for key, value in dic.items():
        # Split only on the first colon so a word that itself contains ":"
        # stays intact instead of breaking the two-way unpacking.
        lineNum, word = key.split(":", 1)
        # setdefault replaces dict.has_key(), which was removed in Python 3.
        rdic.setdefault(word, []).append(lineNum + ':' + str(value))

    return rdic


def combiner(dic):
    """Collapse each key's list of numbers into its total.

    Returns a new dict with the same keys as ``dic``, where every value is
    the sum of the corresponding list (0 for an empty list).
    """
    return {key: sum(values) for key, values in dic.items()}


def shuffle(dic):
    """Return the (key, value) pairs of ``dic`` as a list sorted by key.

    Args:
        dic: dict whose keys are mutually comparable.

    Returns:
        list of (key, value) tuples in ascending key order.
    """
    # items() works on Python 2 and 3; iteritems() exists only on Python 2.
    return sorted(dic.items(), key=lambda pair: pair[0])


def get_reverse_index(filepath):
    """Build an inverted index for the text file at ``filepath``.

    Each line is tokenized with token_stream(), its tokens are counted with
    mapper() and combiner(), and the accumulated per-line counts are
    regrouped by word with reducer().

    Args:
        filepath: Path of the text file to index.

    Returns:
        The dict produced by reducer(): each word maps to a list of
        "<lineNum>:<count>" strings, with line numbers starting at 1.

    Raises:
        Whatever open() raises when the file cannot be opened.
    """
    rdic_p = {}
    # The context manager closes the file even if processing a line raises;
    # previously the handle was never closed.
    with open(filepath, 'r') as source:
        for lineNum, line in enumerate(source, 1):
            tokens = token_stream(line)
            # Keys are "<lineNum>:<word>", so counts from different lines
            # never overwrite each other in the merged dict.
            rdic_p.update(combiner(mapper(lineNum, tokens)))

    return reducer(rdic_p)


if __name__ == '__main__':
    # Interactive entry point: index a file chosen by the user, then print
    # the postings of one word, or -1 when the word is not in the index.
    try:
        read_input = raw_input  # Python 2
    except NameError:
        read_input = input  # Python 3 renamed raw_input() to input()

    filepath = read_input("please input filepath of file :")
    dic = get_reverse_index(filepath)

    search_word = read_input("please input the word you want to search :")

    # `in` replaces dict.has_key(), which was removed in Python 3; a
    # parenthesized single-argument print behaves the same on both versions.
    if search_word in dic:
        print(dic.get(search_word))
    else:
        print(-1)

# if __name__ == '__main__':
if __name__ == '__main2__':
    # Disabled scratch demo (this guard never matches): sort a dict's items
    # by value and print the resulting list of (key, value) pairs.
    dic = {'a': 31, 'bc': 5, 'c': 3, 'asd': 4, 'aa': 74, 'd': 0}
    # items() and parenthesized print run on Python 2 and 3 alike; the result
    # no longer shadows the builtin `dict`.
    pairs_by_value = sorted(dic.items(), key=lambda x: x[1])
    print(pairs_by_value)

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值