简易倒排索引

最新推荐文章于 2022-04-13 17:46:42 发布

原创 · 最新推荐文章于 2022-04-13 17:46:42 发布 · 184 阅读

0 ·

CC 4.0 BY-SA版权

Linux 同时被 3 个专栏收录

383 篇文章

订阅专栏

C++算法系列

256 篇文章

订阅专栏

设计模式

149 篇文章

订阅专栏

本文介绍了一种使用C++实现的倒排索引结构，该结构能够从多个文件中提取词汇并建立词汇到文件位置的映射，实现高效的词汇查询功能。文章详细展示了倒排索引的初始化过程，包括读取文件、解析单词和更新索引，以及如何通过查询接口获取特定单词在各文件中的出现频率和位置。

#include <iostream>
#include <string>
#include <vector>
#include <fstream>
#include <unordered_map>
/// Occurrence data for one word inside one file.
struct word_info {
    size_t frequency = 0;       ///< Number of times the word occurs in the file.
    std::vector<size_t> pos;    ///< Zero-based word ordinals of each occurrence within the file.
};

/// Simple inverted index: word -> (file path -> word_info).
///
/// Words are the whitespace-delimited tokens produced by `stream >> std::string`;
/// no case folding or punctuation stripping is performed.
class inverted_index_map{
public:
    using file_word_info_map = std::unordered_map<std::string, word_info>;  // key -- file path
    using index_map = std::unordered_map<std::string, file_word_info_map>;  // key -- word
public:
    inverted_index_map() = default;
    // Non-copyable. The deleted operations take const references so they are
    // the canonical copy constructor / copy assignment signatures.
    inverted_index_map(const inverted_index_map &) = delete;
    inverted_index_map & operator = (const inverted_index_map &) = delete;
    virtual ~inverted_index_map() = default;
public:
    /// Reads every file in `files` and adds its words to the index.
    ///
    /// Files that cannot be opened are reported on std::cerr and skipped.
    /// Each indexed word is also echoed to std::cout as "word = <word>".
    /// Entries accumulate: indexing the same path again adds to its existing
    /// frequency and position list.
    ///
    /// @param files  Paths of the files to index.
    void init(const std::vector<std::string>&files) {
        std::string word;
        for (const auto &filepath : files) {
            // A fresh stream per file: closed automatically at the end of the
            // iteration and never carries error state over from a previous file.
            std::ifstream fin(filepath);
            if (!fin.is_open()) {
                std::cerr << filepath << " open failed." << '\n';
                continue;
            }
            // Positions are relative to the current file, so the counter
            // restarts at zero for each one.
            size_t pos = 0;
            while (fin >> word) {
                std::cout << "word = " << word << '\n';
                // Resolve the (word, file) slot once instead of hashing twice.
                word_info &info = index_map_[word][filepath];
                info.frequency++;
                info.pos.emplace_back(pos++);
            }
        }
    }

    /// Prints, for every file containing `word`, the file path, the frequency
    /// and then each position, one value per line, to std::cout.
    ///
    /// @param word  Exact token to look up.
    /// @return true if the word is present in the index, false otherwise
    ///         (nothing is printed in that case).
    bool query(const std::string &word) const {
        const auto it = index_map_.find(word);
        if (it == index_map_.end()) {
            return false;
        }
        for (const auto &file_word_info : it->second) {
            const word_info &info = file_word_info.second;
            std::cout << file_word_info.first << '\n';
            std::cout << info.frequency << '\n';
            for (const auto &pos : info.pos) {
                std::cout << pos << '\n';
            }
        }
        return true;
    }
private:
    index_map index_map_;
};
int main() {
    std::vector<std::string>files{"./test.cpp"};
    inverted_index_map mm;
    mm.init(files);
    std::string word = "return";
    std::cout << mm.query(word) << std::endl;

    return 0;
}