编写正排索引
继续编写index.hpp
#pragma once
#include <iostream>
#include <string>
#include <vector>
#include <fstream>
#include <unordered_map>
#include "util.hpp"
namespace ns_index{
// One document record held by the forward index.
// Fix: the struct definition was missing its terminating ';', which is a
// compile error. The numeric ID is now zero-initialized so a default-constructed
// DocInfo never carries an indeterminate value.
struct DocInfo {
    std::string title;   // document title
    std::string content; // document content with the markup tags removed
    std::string url;     // URL of the document on the official site
    // Document ID. NOTE(review): the name looks like a typo for "doc_id"; it is
    // kept unchanged because renaming a public field would break existing users.
    uint64_t dic_id = 0;
};
// One entry of an inverted list: it ties a keyword to a document together
// with a weight value.
// Fix: the struct definition was missing its terminating ';', which is a
// compile error. The numeric fields are now zero-initialized so a
// default-constructed element never carries indeterminate values.
struct InvertedElem {
    uint64_t doc_id = 0; // ID of the document this entry refers to
    std::string word;    // keyword this entry belongs to
    int weight = 0;      // weight of this entry (presumably a relevance score -- confirm against the code that fills it)
};
// Inverted list ("posting list"): the sequence of InvertedElem entries that
// belong to one keyword.
using InvertedList = std::vector<InvertedElem>;
class Index{
private:
// Forward index: stored as a vector so that an element's position in the
// vector doubles as the document ID.
std::vector<DocInfo> forward_index; // forward index
// Inverted index: maps each keyword to its inverted list, i.e. to the
// one or more InvertedElem entries associated with that keyword.
std::unordered_map<std::string, InvertedList> inverted_index;
public:
Index(){}
~Index(){}
public:
// Look up a document in the forward index by its ID.
// Returns a pointer to the stored DocInfo, or nullptr (after writing a
// message to stderr) when doc_id is not a valid position in the index.
DocInfo *GetForwardIdex(uint64_t doc_id)
{
    const bool in_range = doc_id < forward_index.size();
    if (in_range) {
        return &forward_index[doc_id];
    }

    std::cerr << "doc_id out range, error" << std::endl;
    return nullptr;
}
// Fetch the inverted list registered for a keyword.
// Returns a pointer to the list stored in the inverted index, or nullptr
// (after writing a message to stderr) when the keyword has no entry.
InvertedList *GetInvertedList(const std::string &word)
{
    auto pos = inverted_index.find(word);
    if (pos != inverted_index.end()) {
        return &(pos->second);
    }

    std::cerr << word << " have no InvertedList" << std::endl;
    return nullptr;
}
// Build both the forward and the inverted index from a file of already
// parsed (tag-stripped, formatted) documents, e.g. data/raw_html/raw.txt.
//
// input: path of the file to read; it is consumed one line at a time and
//        each line is handed to BuildForwardIndex.
// Returns false only when the file cannot be opened. A line for which
// BuildForwardIndex yields nullptr is reported on stderr and skipped.
bool BuildIndex(const std::string &input)
{
    std::ifstream in(input, std::ios::in | std::ios::binary);
    if (!in.is_open()) {
        std::cerr << "sorry, " << input << " open error" << std::endl;
        return false;
    }

    for (std::string line; std::getline(in, line);) {
        DocInfo *doc = BuildForwardIndex(line);
        if (doc != nullptr) {
            // Forward entry created; now register its keywords.
            BuildInvertedIndex(*doc);
            continue;
        }
        std::cerr << "build " << line << " error" << std::endl; // for debug
    }
    return true;
}
private:
DocInfo *BuildForwardIndex(const std

最低0.47元/天 解锁文章
994

被折叠的 条评论
为什么被折叠?



