DefaultSkipListReader查找docId

本文深入探讨了DefaultSkipListReader的内部实现原理,包括多级跳表(MultiLevel Skip List)的设计与工作流程。详细解释了如何通过不同层级的跳表加速文档ID的查找过程,以及如何维护和更新跳表数据。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

DefaultSkipListReader查找docId
MultiLevelSkipListReader
public MultiLevelSkipListReader(IndexInput skipStream, int maxSkipLevels, int skipInterval) {
/**每个层的文件文件的IndexInput读取对象,是通过定位到每个文件的相对应的skiptable的位置层的位置,clone下就得到新的IndexInput 对象了
**/
this.skipStream = new IndexInput[maxSkipLevels];
/**
Skiplist每次的
**/
this.skipPointer = new long[maxSkipLevels];
//
this.childPointer = new long[maxSkipLevels];
// 当前层相对原始层跳过的元素个数
this.numSkipped = new int[maxSkipLevels];
// 跳表有多少层
this.maxNumberOfSkipLevels = maxSkipLevels;
// 每层相对于原始层跳表的间隔的元素个数
this.skipInterval = new int[maxSkipLevels];

this.skipStream [0]= skipStream;
this.inputIsBuffered = (skipStream instanceof BufferedIndexInput);
this.skipInterval[0] = skipInterval;
// 由于skipInterval 是已知的,所以每层的间隔就可以计算出来
for (int i = 1; i < maxSkipLevels; i++) {
// cache skip intervals
this.skipInterval[i] = this.skipInterval[i - 1] * skipInterval;
}
//记录当前层的docId的
skipDoc = new int[maxSkipLevels];
}


skipTo(int target)
扫描跳表(skip list),沿各层前进到docId不小于target的位置之前,并返回此时在最底层已经跳过的文档数目(即 numSkipped[0] - skipInterval[0] - 1)。
/**
 * Walks the multi-level skip list toward {@code target}.
 *
 * <p>On the first call the skip levels are loaded with {@code loadSkipLevels()}. The method
 * then climbs from level 0 while the next higher level's current doc id is still smaller than
 * {@code target}, and from that level alternately advances along the level and descends one
 * level until it drops below level 0.
 *
 * @param target the document id being searched for
 * @return {@code numSkipped[0] - skipInterval[0] - 1}; presumably the count of documents
 *         before the last level-0 skip entry that was passed -- confirm against callers
 * @throws IOException if reading skip data fails
 */
int skipTo(int target) throws IOException {
if (!haveSkipped) {
// first time, load skip levels
loadSkipLevels();
haveSkipped = true;
}

// skipDoc holds the doc id each level has currently advanced to. Compare from the
// lowest level upward for as long as target is greater than the next level's doc id.
// walk up the levels until highest level is found that has a skip
// for this target
int level = 0;
while (level < numberOfSkipLevels - 1 && target > skipDoc[level + 1]) {
level++;
}
// Search: advance on the current level, or descend once this level has reached target.
while (level >= 0) {
if (target > skipDoc[level]) {// target is beyond this level's current doc id: load the next skip entry on this level
if (!loadNextSkip(level)) {
// Nothing more was loaded on this level; re-evaluate the loop condition.
continue;
}
} else {
// no more skips on this level, go down one level
// The child stream is repositioned only when lastChildPointer lies ahead of
// its current file pointer.
if (level > 0 && lastChildPointer > skipStream[level - 1].getFilePointer()) {
seekChild(level - 1);
}
level--;
}
}
// Result is derived from the level-0 counters only.
return numSkipped[0] - skipInterval[0] - 1;
}


loadSkipLevels()
加载跳表各层(level)的信息。
/**
 * Loads the skip levels.
 *
 * <p>Computes how many levels exist for the current {@code docCount}, then walks the skip
 * data from the highest level down to level 1, recording each level's start pointer and
 * giving each level its own input (either a {@code SkipBuffer} or a clone of the base
 * stream). Level 0 keeps using the base stream.
 *
 * @throws IOException if reading or seeking the skip stream fails
 */
private void loadSkipLevels() throws IOException {

// The number of levels is derived from docCount and the base skip interval
// (floor(log(docCount) / log(skipInterval[0]))), capped at maxNumberOfSkipLevels.
// docCount is presumably the number of documents containing the term -- confirm.

numberOfSkipLevels = docCount == 0 ? 0 : (int) Math.floor(Math.log(docCount) / Math.log(skipInterval[0]));
if (numberOfSkipLevels > maxNumberOfSkipLevels) {
numberOfSkipLevels = maxNumberOfSkipLevels;
}

// Seek to skipPointer[0]; per the article this is where the skip data starts in the
// .frq file (see Figure 2).

skipStream[0].seek(skipPointer[0]);

// Number of levels that should still be read into memory.
int toBuffer = numberOfLevelsToBuffer;
// Iterate from the highest level down to level 1; level 0 is handled after the loop.
for (int i = numberOfSkipLevels - 1; i > 0; i--) {
// Byte length of this level's skip data (see Figure 1 in the article).
// the length of the current level
long length = skipStream[0].readVLong();
// Start offset of this level: the position right after its length prefix.
// the start pointer of the current level
skipPointer[i] = skipStream[0].getFilePointer();
if (toBuffer > 0) {
// Read this level into memory; presumably this also leaves the base stream at
// the start of the next level -- confirm in SkipBuffer.
// buffer this level
skipStream[i] = new SkipBuffer(skipStream[0], (int) length);
toBuffer--;
} else {
// Clone the IndexInput so that this level gets its own independent reader.
// clone this stream, it is already at the start of the current level
skipStream[i] = (IndexInput) skipStream[0].clone();
if (inputIsBuffered && length < BufferedIndexInput.BUFFER_SIZE) {
// The level is shorter than the default buffer, so shrink the clone's buffer to fit.
((BufferedIndexInput) skipStream[i]).setBufferSize((int) length);
}
// Position the base stream at the start of the next level.
// move base stream beyond the current level
skipStream[0].seek(skipStream[0].getFilePointer() + length);
}
}

// use base stream for the lowest level
skipPointer[0] = skipStream[0].getFilePointer();
}


loadNextSkip
/**
 * Advances the given level by one skip entry.
 *
 * @param level the skip level to advance
 * @return {@code false} if the level is exhausted (its {@code skipDoc} is then set to
 *         {@code Integer.MAX_VALUE}), {@code true} if a new entry was read
 * @throws IOException if reading the skip stream fails
 */
private boolean loadNextSkip(int level) throws IOException {
// Remember the state of the entry being left; per the original note this records
// the last visited level's doc id and the position of the next node.
// we have to skip, the target document is greater than the current
// skip list entry
setLastSkipData(level);
// Count skipped entries relative to the base (original) list, not to the level below.
// E.g. with an interval of 16, one entry on level 0 skips 16 base entries and one
// entry on level 1 skips 16*16 base entries.
numSkipped[level] += skipInterval[level];
// Check whether this level has now skipped past the total number of documents.
if (numSkipped[level] > docCount) {
// this skip list is exhausted
skipDoc[level] = Integer.MAX_VALUE;
// Lower the level count so that it no longer includes this level.
if (numberOfSkipLevels > level) numberOfSkipLevels = level;
return false;
}
// The value read is a delta against the previous entry's doc id, so it is added to
// skipDoc[level], which tracks the doc id this level has advanced to.
// read next skip entry
skipDoc[level] += readSkipData(level, skipStream[level]);

if (level != 0) {
// The stored value is added to the start pointer of the level below, giving the
// position of this entry's child on that level.
// read the child pointer if we are not on the leaf level
childPointer[level] = skipStream[level].readVLong() + skipPointer[level - 1];
}

return true;

}


readSkipData
/**
 * Reads one skip entry of the given level from {@code skipStream}.
 *
 * <p>An entry is read in this order: the doc delta (a VInt), an optional payload length,
 * then the deltas that are accumulated into {@code freqPointer[level]} and
 * {@code proxPointer[level]}. When the current field stores payloads, the low bit of the
 * first VInt flags a changed payload length and the doc delta sits in the remaining bits.
 *
 * @param level      the skip level the entry belongs to
 * @param skipStream the input positioned at the start of the entry
 * @return the doc id delta relative to the previous entry on this level
 * @throws IOException if reading from {@code skipStream} fails
 */
protected int readSkipData(int level, IndexInput skipStream) throws IOException {
  // Doc ids are delta-encoded; with payloads the raw value also carries a flag bit.
  int docDelta = skipStream.readVInt();
  if (currentFieldStoresPayloads) {
    // An odd value means the payload length differs from the previous entry's,
    // so the new length follows immediately.
    final boolean payloadLengthChanged = (docDelta & 1) != 0;
    if (payloadLengthChanged) {
      payloadLength[level] = skipStream.readVInt();
    }
    docDelta >>>= 1;
  }

  // Offsets of the entry's posting in the frq and prx data, also stored as deltas.
  freqPointer[level] += skipStream.readVInt();
  proxPointer[level] += skipStream.readVInt();

  return docDelta;
}

next()
/**
 * Moves to the next document that is not marked as deleted.
 *
 * <p>Entries are read from {@code freqStream}. Doc ids are delta-encoded. Unless the field
 * omits term frequencies and positions, the low bit of each doc code flags a frequency of
 * one, and otherwise the frequency follows as a separate VInt.
 *
 * @return {@code true} if positioned on a document, {@code false} once {@code count}
 *         has reached {@code df}
 * @throws IOException if reading from {@code freqStream} fails
 */
public boolean next() throws IOException {
  for (;;) {
    if (count == df) {
      return false;
    }

    // Read the code of the next document.
    final int docCode = freqStream.readVInt();

    if (!currentFieldOmitTermFreqAndPositions) {
      // Shift off the flag bit to get the delta against the previous doc id.
      doc += docCode >>> 1;
      // A set low bit means freq == 1 and no explicit frequency follows.
      final boolean freqIsOne = (docCode & 1) != 0;
      freq = freqIsOne ? 1 : freqStream.readVInt();
    } else {
      // No frequency information is stored: the code is the plain doc delta.
      doc += docCode;
      freq = 1;
    }

    count++;

    // Stop on the first document that is not in the deleted set.
    final boolean deleted = deletedDocs != null && deletedDocs.get(doc);
    if (!deleted) {
      return true;
    }
    skippingDoc();
  }
}


图1



图2
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值