c++实现的简易倒排索引

憧憬少

已于 2024-03-09 08:10:56 修改

阅读量3.8k

点赞数 13

分类专栏：学习笔记文章标签： c++

于 2019-04-04 22:05:08 首次发布

本文链接：https://blog.youkuaiyun.com/qq_40909410/article/details/89036261

版权

学习笔记专栏收录该内容

7 篇文章

订阅专栏

前言

智能信息检索这门课程有个上机作业，题目是“实现倒排索引”。

用到了以前没有学的STL中的vector。

个人博客本文传送门

勿抄袭代码，代码仅供参考。转载注明出处

倒排索引简介

为了从文档集（collection）中检索出想要的结果，首先要将文档集中的每个词项（term）建立索引，以确定词项所在的文档（document）的id，从而返回根据关键字查询的结果。

倒排索引的格式大概是下图这样（代码成果图）：

[外链图片转存失败(img-bzJnDRFn-1568113956032)(https://HaneChiri.github.io/blog_images/article/simple_inverted_index.png)]

每一个词项后面跟着它在文档集中出现的次数，以及出现过的文档的id所组成的一个序列。

例如第一条：

词项	词频	倒排记录表
API	6	4，5，6

就代表API这个词在文档集（六个文件）中出现了六次，这六次分布在文档4、文档5和文档6。

搜索引擎大致就是这个原理，建立好了索引之后，只需要把你搜索的关键词对应的posting求交集然后把对应的文档显示出来就可以了。

数据结构设计

文档（Document）

文档其实在这里就是文件，对于每个文档，都有一个文档名，以及相对应的文档ID，它们得绑定好，否则会混乱。因此将它们放在一个结构体里面。

struct Document
{
	string docName;//文档名
	int docID;//文档id
};

索引项（IndexItem）

同样的，每一个记录的词项、词频和记录表也是绑定的，所以也打包起来。文档id的数目不定，又不想自己写链表或者动态数组怕出错，因此采用了STL（标准模板库）里面的动态数组vector（向量容器）。

struct IndexItem
{
	string term;//词项
	int frequence;//词频
	vector<int> posting;//记录表
};

索引类（CIndex）

代码应该不难看懂。

class CIndex
{
	vector<IndexItem> indexList;//索引表
	vector<Document> collection;//文档集
public:
	CIndex();
	//利用文件名数组初始化文档集
	CIndex(string p_collection[], int n);
	//显示文档集内所有文档的文件名
	void showCollection();
	//显示当前倒排索引表
	void showIndexList();
	//索引单篇文档
	int indexDocument(FILE*fp, int docID);
	//索引文档集
	int indexCollection();
	//排序索引表
	int sortIndex();
	//索引表合并同类项
	int mergeIndex();
	~CIndex();
};

大致思路

扫描一篇文档，将这篇文档对应的文档ID加入对应词项的posting
对文档集中每一篇文档重复第一步，获取所有词项及其对应的posting加入索引表，此时每个词项的posting中只有一个文档ID，并且有很多重复的词项记录；
排序索引表；
将重复的项的posting合并，并且增加词频，删除重复项。

2019-4-4补充：想到一个新思路——直接按照ID从小到大扫描一遍整个文档集，每扫描一个词项，就在词典中查找这个词项，增加词频，然后把现在正在处理的文档的ID加入到posting，最后再排个序即可。

代码实现

有参构造函数

初始化文档集

//@name <CIndex::CIndex>
//@brief <初始化文档集>
//@param <string p_collection[]:文档文件名数组><int n:数组长度>
CIndex::CIndex(string p_collection[], int n)
{
	Document nextDoc;
	for (int i = 0; i < n; i++)
	{
		nextDoc.docName = p_collection[i];
		nextDoc.docID = i+1;//编号从1开始
		collection.push_back(nextDoc);
	}
}

索引单篇文档

大致思路是，一个个字符读取进来，如果是字母就一直读完整个单词，并把这个单词作为词项加入表中。

//@name <CIndex::indexDocument>
//@brief <索引单篇文档>
//@param <FILE * fp:已打开的文件指针><int docID:此文件的编号>
//@return <扫描到的词项数量>
int CIndex::indexDocument(FILE * fp, int docID)
{
	char ch;//扫描用的变量
	IndexItem indexItem;//打包用的变量
	int num = 0;//扫描到的词项数量

	while (!feof(fp))
	{//一次循环获取一个单词
		//找到第一个字母
		do
		{
			ch = fgetc(fp);
			if (feof(fp)) break;//防止空文件导致的无限循环
		} while (!isalpha(ch));
		if (feof(fp)) break;//防止因文件后面的空行而索引空字符串
		//读取单词，给索引项赋值
		while (isalpha(ch))
		{
			indexItem.term += ch;
			ch = fgetc(fp);
		}
		
		indexItem.frequence = 1;
		indexItem.posting.push_back(docID);//将本文件的文档ID加入posting
		//把索引项加入词典
		indexList.push_back(indexItem);
		num++;
		//清空索引项，准备下一次
		indexItem.term="";
		indexItem.posting.clear();
	}
	return num;
}

索引文档集

索引文档弄好之后，索引整个文档集不过是加个循环而已

//@name <CIndex::indexCollection>
//@brief <索引文档集>
//@return <词项总数目>
int CIndex::indexCollection()
{
	int num = 0;
	//打开对应的文件并索引
	for (int i = 0; i < collection.size(); i++)
	{
		//打开文件
		FILE* fp = fopen(collection[i].docName.c_str(), "r");
		//索引单篇文档
		num+=indexDocument(fp, collection[i].docID);
		//关闭文件
		fclose(fp);
	}
	return num;
}

排序索引表

直接使用<algorithm>头文件里面的sort()函数进行排序，自定义比较函数cmp()

bool cmp(IndexItem a, IndexItem b)
{
	return a.term<b.term;//词项按照从小到大排序
}
int CIndex::sortIndex()
{
	sort(indexList.begin(), indexList.end(), cmp);
	return 0;
}

去重

//@name <CIndex::mergeIndex>
//@brief <索引表合并同类项>

int CIndex::mergeIndex()
{
	IndexItem item1,item2;
	
	sortIndex();
	vector<IndexItem>::iterator it_cur=indexList.begin();//创建迭代器
	vector<IndexItem>::iterator it_next = it_cur + 1;
	vector<int> temp;
	vector<int>::iterator p1, p2;//用于合并posting的迭代器
	while (it_cur != indexList.end())
	{
		if(it_cur+1!=indexList.end()) it_next = it_cur + 1;
		else break;
		while((*it_cur).term == (*it_next).term)
		{//这个循环内处理掉所有与当前词项重复的词项
			
			//将二者的posting排序
			sort((*it_cur).posting.begin(), (*it_cur).posting.end());
			sort((*it_next).posting.begin(), (*it_next).posting.end());
			//有序合并两者的posting
			p1 = (*it_cur).posting.begin();
			p2 = (*it_next).posting.begin();
			while (p1 != (*it_cur).posting.end() && p2 != (*it_next).posting.end())
			{
				if ((*p1) < (*p2))//结果集中加入较小的元素
				{
					temp.push_back(*p1);
					//这个while用于跳过重复的元素
					
					p1++;
				}
				else if((*p1) > (*p2))
				{
					temp.push_back(*p2);
					
					p2++;
				}
				else
				{
					temp.push_back(*p1);
					//遇到相同的则两个都后移，避免出现重复
					p1++;
					p2++;
				}
			}
			while(p1 != (*it_cur).posting.end())//如果串1没有合并完则将串1后面部分直接复制
			{
				temp.push_back(*p1);
				p1++;
			}
			while(p2 != (*it_next).posting.end())
			{
				temp.push_back(*p2);
				p2++;
			}
			//删除结果集重复部分
			temp.erase(unique(temp.begin(), temp.end()), temp.end());

			(*it_cur).frequence++;//词频增加
			(*it_cur).posting.assign(temp.begin(), temp.end());//将结果复制
			indexList.erase(it_next);//删除重复项
			temp.clear();
			if (it_cur + 1 != indexList.end()) it_next = it_cur + 1;
			else break;
		}

		it_cur++;
	}


	/*失败代码
	for (int i = 0; i < indexList.size()-1; i++)
	{
		item1 = indexList[i];
		item2 = indexList[i + 1];
		int j = 1;//j是相对于item1的偏移量
		while (item1.term == item2.term)
		{
			vector<int> temp(item1.posting.size()+item2.posting.size());
			sort(item1.posting.begin(), item1.posting.end());
			sort(item2.posting.begin(), item2.posting.end());
			
			merge(item1.posting.begin(), item1.posting.end(), item2.posting.begin(), item2.posting.end(), temp.begin());
			indexList[i].posting.assign(temp.begin(), temp.end());
			indexList.erase(indexList.begin()+i+j);
			indexList[i].frequence++;
			
			item2 = indexList[i + j];
		}
		j = 1;
	}
	*/
	return 0;
}

一开始使用的是普通的for循环，但是发现随着元素的删除，循环次数应该改变，因此改成了迭代器加while的方式。

迭代器还是个蛮有用的东西，就是一个封装得比较好的指针。

main测试

#include <iostream>
#include <algorithm>
#include "CIndex.h"
#include <string>
using namespace std;
string fileList[6] =
{
	"doc1.txt",
	"doc2.txt",
	"doc3.txt",
	"doc4.txt",
	"doc5.txt",
	"doc6.txt"
};
int main()
{
	CIndex in(fileList,6);
	in.showCollection();
	in.indexCollection();
	in.mergeIndex();
	in.showIndexList();
	return 0;
}