倒排索引 这名字看起来当真很吓人,不过从原理说起来倒是很简单:
穷举所有关键词,每个关键词对应着出现过这个词的文件名,以及这个词在那个文件中出现的个数、位置等信息,这样搜索某个关键词的时候就可以很快得到对应的搜索结果;而搜索多个关键词的话也就是对解集求交集或是并集的问题了。
比如说,我设计的格式大概是这样:
Version X
keywordI {
filenameI
howManyOccurrences placeI placeII placeIII …
filenameII
howManyOccurrences placeI placeII placeIII …
…
}
keywordII {
…
}
…
自认为这样的设计还是可以的:
首先,可读性不错;
其次,Linux的文件名太变态了,除了 / 什么字符都可以出现,这意味着 \abs..\"\s+\"*\:\#\%$%$ 是一个完全合法的Linux文件名——恐怕只有用换行符来分隔文件名比较靠谱;
再次,想对整个索引记录造成不可恢复的致命损伤不是那么容易;
再次,通过查找 ^} 可以很快地定位一条记录,并不需要将整个数据库读入内存;
再次,不爽了格式可以再改,改掉Version之后的数字就行了。
然后写了点玩具代码,供各位喷口水:
#include <algorithm>
#include <cctype>
#include <cstddef>
#include <fstream>
#include <iostream>
#include <iterator>
#include <map>
#include <set>
#include <sstream>
#include <string>
// Pull the standard library into the global namespace; the code in this file
// refers to string, map, set, cerr, ... without the std:: prefix.
using namespace std;
// When true, scan(), save(), load() and isearch() print progress messages
// to cerr. main() switches it on.
bool verbose=false;
// Statistics for one key word inside one file.
//
// The compiler-generated copy constructor and assignment operator already
// copy both members, so none are declared here.
struct FileRecord
{
    int occurence;        // how many times the word was seen in the file
    std::set<int> lines;  // line numbers on which the word was seen

    // A fresh record starts with no hits and no lines.
    FileRecord():
        occurence(0),
        lines()
    { }
};
struct KeywordRecord
{
map <
string, // file name
FileRecord
> fileRecords;
void swap(KeywordRecord & kr) {
this->fileRecords.swap(kr.fileRecords);
}
KeywordRecord() {}
KeywordRecord(KeywordRecord const& kr) :
fileRecords(kr.fileRecords)
{ }
};
// The whole inverted index: maps each key word to the KeywordRecord that
// lists the files in which the word occurs.
typedef map<
string , // key word
KeywordRecord // record
>
Map;
// Size, in chars, of the fixed line buffer and word buffer used by scan().
const int BUFFSIZE = 8*1024;
// My getword lib {{{
// Returns true when ch is an ASCII letter ('a'..'z' or 'A'..'Z').
inline bool acceptAlpha(char ch)
{
    const bool isLower = ('a' <= ch) && (ch <= 'z');
    const bool isUpper = ('A' <= ch) && (ch <= 'Z');
    return isLower || isUpper;
}
// Splits a NUL-terminated string into maximal runs of characters that the
// predicate classifies the same way.
//
// front   in/out cursor into the text; on return it points just past the
//         run that was copied.
// buff    receives the run, NUL-terminated. No bounds check is done: the
//         caller must supply room for the longest possible run plus one.
// accept  callable taking a char; its result is interpreted as a boolean.
//
// Returns false (and leaves buff untouched) when front already points at the
// terminating NUL, true otherwise. Both accepted and rejected runs are
// returned, so callers test buff[0] to tell words from separators.
template <typename T>
bool getword(char*& front, char* buff, T accept)
{
    if (*front == 0)
        return false;

    // Normalise the predicate result to bool so that predicates returning an
    // arbitrary non-zero int (as the <cctype> functions do) compare equal.
    const bool wanted = accept(*front) ? true : false;
    while (*front && wanted == (accept(*front) ? true : false)) {
        *buff = *front;
        ++buff;
        ++front;
    }
    *buff = 0;
    return true;
}
// }}}
// Converts the ASCII upper-case letters of a NUL-terminated string to lower
// case, in place. Every other byte is left untouched, so punctuation such as
// '_' or '[' and the bytes of multi-byte encodings are not altered.
inline void toLower ( char* str )
{
    for (; *str; ++str) {
        if (*str >= 'A' && *str <= 'Z')
            *str = static_cast<char>(*str - 'A' + 'a');
    }
}
// Converts the ASCII upper-case letters of str to lower case, in place.
// Every other byte is left untouched, so punctuation such as '_' or '[' and
// the bytes of multi-byte encodings are not altered.
inline void toLower ( std::string & str )
{
    for (std::size_t i = 0; i < str.size(); ++i) {
        if (str[i] >= 'A' && str[i] <= 'Z')
            str[i] = static_cast<char>(str[i] - 'A' + 'a');
    }
}
void scan ( string const & fname, Map & m )
{
if ( verbose ) cerr<<"Scanning "<
<<" ";
ifstream f(fname.c_str());
if (!f)
return; // throw ...
char buf[BUFFSIZE], word[BUFFSIZE];
for ( size_t linenum=1;
f.getline( buf, BUFFSIZE, '/n' );
++linenum )
{
if ( f.fail() && !f.eof() ) { // too long
--linenum;
f.clear();
}
for ( char* p=buf; getword( p, word, acceptAlpha ); ) {
if ( !acceptAlpha(word[0]) || word[1]==0 || word[2]==0 ) // no use
continue;
toLower(word);
FileRecord & f=m[word].fileRecords[fname];
++f.occurence;
f.lines.insert(linenum);
}
}
if ( verbose ) cerr<<"... done"<
}
void save ( string const & fname, Map const& m )
{
if ( verbose ) cerr<<"Saving index into "<
<<" ";
ofstream f(fname.c_str());
if (!f)
return; // throw ...
f << "version 0/n/n";
for ( Map::const_iterator citr = m.begin(); citr!=m.end(); ++citr ) {
f << citr->first << " {/n"; // key word
for ( map
::const_iterator
record = citr->second.fileRecords.begin();
record != citr->second.fileRecords.end();
++record ) {
f << " " << record->first << "/n " // file name
<< record->second.occurence << " ";
for ( set
::const_iterator line = record->second.lines.begin();
line!=record->second.lines.end();
++line ) {
f << *line << " ";
}
f << endl;
}
f << "}/n/n";
}
if ( verbose ) cerr<<"... done"<
}
// 玩具,没有容错功能
void load ( string const & fname, Map & m )
{
if ( verbose ) cerr<<"Loading "<
<<" ";
ifstream f(fname.c_str());
if (!f)
return; // throw ...
string buf;
getline(f,buf);
// TODO: check version
getline(f,buf); // empty line
while(f>>buf) { // key word
string word=buf;
f>>buf; // '{'
f.get(); // '/n'
for(;;) {
for(;isspace(f.peek());)
f.ignore();
if(!getline(f, buf)) { // file name
return;
}
FileRecord& record=m[word].fileRecords[buf];
bool eol=false;
f>>record.occurence; // can't be zero
do {
int linenum=0;
f>>linenum;
for( char ch; isspace(ch=f.peek()) ; f.ignore())
if (ch=='/n') {
eol=true;
break;
}
record.lines.insert(linenum);
} while (!eol);
for(; isspace(f.peek()); f.ignore() )
;
if (f.peek()=='}') {
f.ignore();
for(; isspace(f.peek()); f.ignore() )
;
break;
}
}
}
if ( verbose ) cerr<<"... done"<
}
// 玩具型的测试程序: {{{
// Intersection of two keyword records.
//
// The result has one entry for every file name present in both a and b. The
// entry's line set is the intersection of the two line sets and its
// occurrence count is the size of that set, so a file in which the two words
// never share a line still appears, with a count of zero. (What the most
// sensible intersection of two file records is remains an open question.)
KeywordRecord intersection(KeywordRecord const& a, KeywordRecord const& b)
{
    typedef std::map<std::string, FileRecord>::const_iterator FileIter;

    KeywordRecord r;
    FileIter ia = a.fileRecords.begin();
    FileIter ib = b.fileRecords.begin();

    // Both maps are ordered by file name, so one merge-style pass finds the
    // names they have in common.
    while (ia != a.fileRecords.end() && ib != b.fileRecords.end()) {
        const int cmp = ia->first.compare(ib->first);
        if ( cmp < 0 ) {
            ++ia;
        } else if ( cmp > 0 ) {
            ++ib;
        } else {
            // Build the entry in place instead of copying a temporary.
            FileRecord & f = r.fileRecords[ia->first];
            std::set_intersection( ia->second.lines.begin(), ia->second.lines.end(),
                                   ib->second.lines.begin(), ib->second.lines.end(),
                                   std::inserter( f.lines, f.lines.begin() ) );
            f.occurence = static_cast<int>(f.lines.size());
            ++ia;
            ++ib;
        }
    }
    return r;
}
// Interactive search loop.
//
// Repeatedly prompts on cout for a line of key words, lower-cases them and
// reports every file (with hit count and line numbers) that is left after
// intersecting the records of all the key words, i.e.
// keyword[0] AND keyword[1] AND keyword[2] ...
// Prints "None found" when a key word is not in the index or the
// intersection is empty. A blank input line just shows the prompt again.
// The loop ends when cin reaches end of input.
void isearch(Map const& m) // interactive search
{
    typedef std::set<std::string>::const_iterator WordIter;
    typedef std::map<std::string, FileRecord>::const_iterator FileIter;
    typedef std::set<int>::const_iterator LineIter; // element type of FileRecord::lines

    for (;;) {
        std::cout << "Key words: ";
        std::string in;
        if ( !std::getline(std::cin, in) )
            break;

        std::stringstream ss(in);
        std::set<std::string> keywords;
        while ( ss >> in ) {
            toLower(in);
            keywords.insert(in);
        }
        // Nothing to look up; dereferencing keywords.begin() would be invalid.
        if ( keywords.empty() )
            continue;

        KeywordRecord ans;
        bool allKnown = true;
        for ( WordIter kw = keywords.begin(); kw != keywords.end(); ++kw ) {
            const bool first = ( kw == keywords.begin() );
            if ( verbose && !first )
                std::cerr << "Searching keyword " << *kw << std::endl;

            Map::const_iterator idx = m.find(*kw);
            if ( idx == m.end() ) {
                allKnown = false;
                break;
            }
            if ( first )
                ans = idx->second;
            else
                ans = intersection(ans, idx->second);
        }

        if ( !allKnown || ans.fileRecords.empty() ) {
            std::cout << "None found" << std::endl;
            continue;
        }

        for ( FileIter i = ans.fileRecords.begin();
              i != ans.fileRecords.end();
              ++i ) {
            std::cout << ">>>> Found in: " << i->first << std::endl;
            std::cout << ">>>> Occured " << i->second.occurence << " times" << std::endl;
            std::cout << ">>>> In Lines : ";
            for ( LineIter line = i->second.lines.begin();
                  line != i->second.lines.end();
                  ++line )
                std::cout << *line << " ";
            std::cout << "\n\n";
        }
    }
}
// }}}
int main()
{
verbose=true;
Map m;
// scan("invIndex.cpp", m);
// scan("types.hpp", m);
// scan("LinerEquation/gaussian.h",m);
// scan("LinerEquation/gaussianJordan.cpp",m);
// scan("LinerEquation/gaussianBackSubstitution.cpp",m);
// save("invIndex.index", m);
load("invIndex.index",m);
isearch( m );
// Map m2;
// load("invIndex.index", m2);
// save("invIndex2.index", m2);
return 0;
}