Lucene学习总结-索引文件结构_struct segments-CSDN博客

本文链接：https://blog.csdn.net/zhangshuliai/article/details/7974851

本文详细介绍了Lucene搜索引擎的内部结构，包括多个组成部分如段、文档字段数据、词汇信息及反向索引等，并深入剖析了每部分的数据结构及其作用。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

Lucene的索引由多个段组成，每个段包含的文档都不相同。

segments_N:

struct segments_N{

format;//api版本号

version;//当前段的版本号

......

segment[];//所有段

};

struct segment{

segName;//段的名字

segSize;//段的大小，包含的文档数量

delGen;//删除的信息

docStoreOffset;//共享段的信息

};

.fnm:

struct fnm{

fieldCount;//域数量

field[];//域的数组

};

struct field{

fieldName;//域的名字

bitControl;//控制位

};

.fdx，.fdt：并不包含分词后的项的信息

struct fdx{

fdt*[];//和文档的数量一样，在fdt中的偏移

};
struct fdt{

docFieldData[];//和文档的数量一样

};

struct docFieldData{

fieldCount;//域的数目

fieldData[];//和fieldCount数量一样

};

struct fieldData{

fieldNum;//域的编号

bitControl;//控制位，被索引被存储等等

fieldValue;//值，即未被分词的值

};

.tvx, .tvd, .tvf：有关项的信息

struct tvx{

tvd*[];//在tvd中的偏移，和文档数量一样

tvf*[];//在tvf中的偏移和文档数量一样

};

struct tvd{

fieldInfo[];//和文档的域数量一样

};

struct fieldInfo{

fieldCount;//文档的域数量

fieldNum[];//域的编号数组

tvf*[];//域在tvf中的偏移，数量为fieldCount-1，指向tvf中的每一项fieldInfo

};

struct tvf{

docFieldInfo[];//文档的数量

};
struct docFieldInfo{
fieldInfo[];//域的数量
};

struct fieldInfo{

termCount;//域中项的数量

bitControl;//控制位

termInfo[];//每个项的情况

};

struct termInfo{

text;//项的文本

freq;//频率

position[];//位置

offset[];//偏移

};

反向索引信息：包括词典和倒排表

.tii，.tis：词典信息，tii会全部加载到内存中，每隔interval个词存储一个

struct tis{

termCount;//词的数目

interval;//跳跃表信息，记录每隔多少个单词跳跃

termInfo[];//termCount个

};

struct termInfo{

prefixLength;

suffix;//两者共同组成词

fieldNum;//所属于域的编号，困惑，有可能属于多个域啊（注：Lucene中的项由域名和文本共同确定，同一文本出现在不同域时视为不同的项，因此每个项只对应一个域）

docFreq;//出现的文档频率

frq*;//在frq中的偏移

prx*;//在prx中的偏移

};

struct tii{

intervalTermCount;//termCount/interval

tis*[];//intervalTermCount个，指向termInfo

};

struct frq{

termPost[];//termCount个

};

struct termPost{

termDoc[];//和项所在文档的数目一样，存储的是文档的差分id

skipData[];//跳跃表信息，文档数目除以跳跃的步数，存储的是文档的id。跳跃表分层次的

};

struct prx{

docPost[];//termCount个，也是跳跃表

};

struct docPost{

posts[];//单词所在文档的数目

};

struct posts{

post[];//单词在本文档中出现的各个位置，差分。

};

.del：文档删除情况

struct del{

format;

byteCount;//bits的长度

bitCount;//多少位为1

bits;//为1的位为删除的文档

dGaps;//首先得到bits的对应存储，各个位交替存储byte的差分（第几个byte），本byte的值

};