传说中的倒排索引

本文介绍了一种简单的倒排索引实现方式,通过遍历所有关键词及其在文档中的位置信息来快速检索。支持多关键词搜索及求交集操作。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

倒排索引 这名字看起来当真很吓人,不过从原理说起来倒是很简单:

穷举所有关键词,每个关键词对应着出现过这个词的文件名,以及这个词在那个文件中出现的个数、位置等信息,这样搜索某个关键词的时候就可以很快得到对应的搜索结果;而搜索多个关键词的话也就是对解集求交集或是并集的问题了。

 

比如说,我设计的格式大概是这样:

 

Version X

keywordI {

  filenameI

    howManyOccurrences placeI placeII placeIII …

  filenameII

    howManyOccurrences placeI placeII placeIII …

  …

}

keywordII {

  …

}

自认为这样的设计还是可以的:

首先,可读性不错;

其次,Linux的文件名太变态了,除了 / 和空字符(NUL)之外什么字符都可以出现,这意味着 \abs..\"\s+\"*\:\#\%$%$ 是一个完全合法的Linux文件名——恐怕只有用换行符来分隔文件名比较靠谱;

再次,想对整个索引记录造成不可恢复的致命损伤不是那么容易;

再次,通过查找 ^} 可以很快地定位一条记录,并不需要将整个数据库读入内存;

再次,不爽了格式可以再改,改掉Version之后的数字就行了。

 

然后写了点玩具代码,供各位喷口水:

#include <algorithm>
#include <cctype>
#include <cstddef>
#include <fstream>
#include <iostream>
#include <iterator>
#include <map>
#include <set>
#include <sstream>
#include <string>
#include <utility>
using namespace std;

// When true, the functions in this file that test it write progress messages to cerr.
bool verbose=false;

// Posting for one keyword inside one file: an occurrence counter plus the
// set of line numbers associated with the keyword in that file.
struct FileRecord
{
    int                   occurence;  // occurrence counter; starts at zero
    std::set<std::size_t> lines;      // distinct line numbers, kept sorted by std::set

    // Creates an empty record: zero occurrences and no lines.
    FileRecord():
        occurence(0),
        lines()
    { }
    // No user-written copy constructor: member-wise copying is exactly what
    // the compiler generates, and leaving it implicit keeps moves available.
};
struct KeywordRecord
{
    map <
        string,                 // file name
        FileRecord
    > fileRecords;
    void swap(KeywordRecord & kr) {
        this->fileRecords.swap(kr.fileRecords);
    }

    KeywordRecord() {}
    KeywordRecord(KeywordRecord const& kr) :
        fileRecords(kr.fileRecords)
    { }
};

// The inverted index itself: key word -> record of the files containing it.
typedef map< string, KeywordRecord > Map;

// Line-buffer size in characters (8 * 1024).
const int BUFFSIZE = 8*1024;

// My getword lib {{{
// Returns true when `ch` is an ASCII letter (a-z or A-Z), false otherwise.
inline bool acceptAlpha(char ch)
{
    const bool isLower = ('a' <= ch) && (ch <= 'z');
    const bool isUpper = ('A' <= ch) && (ch <= 'Z');
    return isLower || isUpper;
}
// Extracts the next token from the NUL-terminated text at `front`.
//
// A token is the longest run of characters for which `accept` gives the same
// truth value as for the first character, so accepted and rejected runs are
// returned alternately.
//
//   front   in/out: current read position; advanced past the token.
//   buff    out: receives the token, NUL-terminated. The caller must supply
//           room for the longest possible run plus the terminator.
//   accept  predicate callable with a char; its result is converted to bool.
//
// Returns false, leaving `buff` untouched, when `front` is already at the
// terminating NUL; true otherwise.
template <typename T>
bool getword(char*& front, char* buff, T accept)
{
    if (*front==0)
        return false;
    // Normalise to bool on both sides: predicates such as isalpha() return
    // arbitrary non-zero ints, which must not be compared against `true`.
    const bool kind = accept(*front) ? true : false;
    while (*front) {
        const bool current = accept(*front) ? true : false;
        if (current != kind)
            break;
        *buff++ = *front++;
    }
    *buff=0;
    return true;
}
// }}}

// Converts the ASCII upper-case letters of the NUL-terminated string `str`
// to lower case in place. All other characters are left unchanged (a blind
// `|= 0x20` would also rewrite characters such as '_', '@' or '[').
inline void toLower ( char* str )
{
    for(; *str; ++str) {
        if (*str >= 'A' && *str <= 'Z')
            *str = static_cast<char>(*str + ('a' - 'A'));
    }
}
// Converts the ASCII upper-case letters of `str` to lower case in place.
// All other characters are left unchanged (a blind `|= 0x20` would also
// rewrite characters such as '_', '@' or '[').
inline void toLower ( std::string & str )
{
    for( std::string::size_type i=0; i<str.size(); ++i ) {
        const char ch = str[i];
        if (ch >= 'A' && ch <= 'Z')
            str[i] = static_cast<char>(ch + ('a' - 'A'));
    }
}

void scan ( string const & fname, Map & m )
{
    if ( verbose ) cerr<<"Scanning "< <<"  ";
    ifstream f(fname.c_str());
    if (!f)
        return; // throw ...
    char buf[BUFFSIZE], word[BUFFSIZE];
    for (   size_t linenum=1;
            f.getline( buf, BUFFSIZE, '/n' );
            ++linenum )
    {
        if ( f.fail() && !f.eof() ) { // too long
            --linenum;
            f.clear();
        }
        for ( char* p=buf; getword( p, word, acceptAlpha ); ) {
            if ( !acceptAlpha(word[0]) || word[1]==0 || word[2]==0 ) // no use
                continue;
            toLower(word);
            FileRecord & f=m[word].fileRecords[fname];
            ++f.occurence;
            f.lines.insert(linenum);
        }
    }
    if ( verbose ) cerr<<"...  done"<
}

void save ( string const & fname, Map const& m )
{
    if ( verbose ) cerr<<"Saving index into "< <<"  ";
    ofstream f(fname.c_str());
    if (!f)
        return; // throw ...
    f << "version 0/n/n";
    for ( Map::const_iterator citr = m.begin(); citr!=m.end(); ++citr ) {
        f << citr->first << " {/n"; // key word
        for (   map ::const_iterator
                    record = citr->second.fileRecords.begin();
                record != citr->second.fileRecords.end();
                ++record ) {
            f   << "  "  <<  record->first  << "/n    " // file name
                << record->second.occurence << " ";
            for ( set ::const_iterator line = record->second.lines.begin();
                  line!=record->second.lines.end();
                  ++line ) {
                f << *line << " ";
            }
            f   << endl;
        }
        f << "}/n/n";
    }
    if ( verbose ) cerr<<"...  done"<
}

// 玩具,没有容错功能

void load ( string const & fname, Map & m )
{
    if ( verbose ) cerr<<"Loading "< <<"  ";
    ifstream f(fname.c_str());
    if (!f)
        return; // throw ...
    string buf;
    getline(f,buf);
    // TODO: check version
    getline(f,buf); // empty line
    while(f>>buf) { // key word
        string word=buf;
        f>>buf;     // '{'
        f.get();    // '/n'
        for(;;) {
            for(;isspace(f.peek());)
                f.ignore();
            if(!getline(f, buf)) { // file name
                return;
            }
            FileRecord& record=m[word].fileRecords[buf];
            bool        eol=false;
            f>>record.occurence;   // can't be zero
            do {
                int linenum=0;
                f>>linenum;
                for( char ch; isspace(ch=f.peek()) ; f.ignore())
                    if (ch=='/n') {
                        eol=true;
                        break;
                    }
                record.lines.insert(linenum);
            } while (!eol);
            for(; isspace(f.peek()); f.ignore() )
                ;
            if (f.peek()=='}') {
                f.ignore();
                for(; isspace(f.peek()); f.ignore() )
                    ;
                break;
            }
        }
    }
    if ( verbose ) cerr<<"...  done"<
}

// 玩具型的测试程序: {{{

// 求交集——对于文件记录的交集怎么样才算合理这一点还不明确……
KeywordRecord intersection(KeywordRecord const& a, KeywordRecord const& b)
{
    KeywordRecord r;
    for (map ::const_iterator ia=a.fileRecords.begin(), ib=b.fileRecords.begin();
         ia!=a.fileRecords.end() && ib!=b.fileRecords.end();
        ) {
        int cmp = ia->first.compare(ib->first);
        if ( cmp < 0 )
            ++ia;
        else if ( cmp > 0 )
            ++ib;
        else {
            FileRecord f;
            std::set_intersection(  ia->second.lines.begin(), ia->second.lines.end(),
                                    ib->second.lines.begin(), ib->second.lines.end(),
                                    std::inserter( f.lines, f.lines.begin() ) );
            f.occurence = f.lines.size();
            r.fileRecords[ia->first]=f;
            ++ia;
            ++ib;
        }
    }
    return r;
}

// 写得太激情了,风格不好,请见谅

void isearch(Map const& m) // interactive search
{
    for(;;) {
        cout<<"Key words: ";
        string in;
        if(! getline(cin,in) )
            break;
        stringstream ss(in);
        set keywords;
        KeywordRecord ans;
        for ( ; ss>>in; keywords.insert(in) )
            toLower(in);
        Map::const_iterator idx=m.find(* keywords.begin());
        if ( idx==m.end() )
            goto none;
        ans = idx->second;
        for (   set ::const_iterator citr=++keywords.begin();
                citr!=keywords.end();
                ++citr) {
            if ( verbose ) cerr<<"Searching keyword "<<*citr<
            idx=m.find(*citr);
            if  ( idx==m.end() )
                goto none;
            KeywordRecord const& kr = idx->second;
            KeywordRecord tmp;
            tmp.swap(ans);
            ans = intersection (tmp, kr); // 相当于求 keyword[0] AND keyword[1] AND keyword[2] …
        }
        if ( ans.fileRecords.size()==0 ) {
none:
            cout<<"None found"<
            continue;
        }
        for (   map ::const_iterator i = ans.fileRecords.begin();
                i != ans.fileRecords.end();
                ++i
            ) {
            cout<<">>>> Found in: "< first<
            cout<<">>>>     Occured "< second.occurence<<" times"<
            cout<<">>>>       In Lines : ";
            for (   set ::const_iterator line = i->second.lines.begin();
                    line !=i->second.lines.end();
                    ++line )
                    cout<<*line<<" ";
            cout<<"/n/n";
        }
    }
}
// }}}

int main()
{
    verbose=true;
    Map m;
//    scan("invIndex.cpp", m);
//    scan("types.hpp", m);
//    scan("LinerEquation/gaussian.h",m);
//    scan("LinerEquation/gaussianJordan.cpp",m);
//    scan("LinerEquation/gaussianBackSubstitution.cpp",m);
//    save("invIndex.index", m);
    load("invIndex.index",m);
    isearch( m );

//    Map m2;
//    load("invIndex.index", m2);
//    save("invIndex2.index", m2);
    return 0;
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值