swish-e搜索引擎，源代码分析（6）

最新推荐文章于 2021-07-13 09:32:04 发布

moxuansheng

最新推荐文章于 2021-07-13 09:32:04 发布

阅读量749

点赞数

CC 4.0 BY-SA版权

分类专栏：搜索引擎文章标签：代码分析搜索引擎 structure compression buffer file

本文链接：https://blog.youkuaiyun.com/moxuansheng/article/details/4654660

搜索引擎专栏收录该内容

14 篇文章

订阅专栏

本文详细介绍了Swish-e搜索引擎中词条的压缩处理过程，包括遍历哈希表中的词条并进行位置信息压缩，以及如何优化存储空间。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

在前面的部分中，介绍了swish-e读取文件内容、并将分析所得的词条加入到hash表中的过程。

当所有的词条处理完成以后，此时在sw->hashentries[VERYBIGHASHSIZE]表中则存放了所有的词条。

此时需要对于这些词条进行一定的压缩处理，才能最后写入到索引文件中。

本节开始，对于词条信息的压缩过程进行阐述。

在index.c L1200开始进行Compress the entries。

2.4 词条信息的压缩过程

2.4.1 CompressCurrentLocEntry词条压缩算法

ENTRY *ep;

/*
 * Walk every hash bucket. Buckets whose dirty flag is clear are skipped;
 * for the others the flag is reset and each entry on the chain is passed
 * to CompressCurrentLocEntry().
 */
for (i = 0; i < VERYBIGHASHSIZE; i++) {
    if (!idx->hashentriesdirty[i])
        continue;

    idx->hashentriesdirty[i] = 0;

    ep = idx->hashentries[i];
    while (ep) {
        CompressCurrentLocEntry(sw, ep);
        ep = ep->next;      /* read the link only after the entry was processed */
    }
}

遍历hashentries哈希数组中被标记为dirty的桶，对其中的每个词条调用CompressCurrentLocEntry，并最终通过compress_location对位置信息进行压缩。

2.4.2 compress_location词条位置信息压缩过程

/*
 * compress_location -- pack one LOCATION into a compact byte sequence.
 *
 * The data is first assembled in the shared work buffer
 * idx->compression_buffer, in this order:
 *   1. sizeof(LOCATION *) bytes set aside for the linked-list pointer
 *   2. the metaID, written by compress3()
 *   3. flag byte, file number and frequency, written by
 *      compress_location_values()
 *   4. the positions, written by compress_location_positions()
 *
 * sw - SWISH handle; sw->Index provides the work buffer and the zone
 *      passed to Mem_ZoneAlloc()
 * l  - location to compress (metaID, filenum, frequency, posdata are read)
 *
 * Returns a copy of the packed bytes, allocated with Mem_ZoneAlloc() from
 * idx->currentChunkLocZone. Calls progerr() if more bytes were produced
 * than the work buffer holds.
 */
static unsigned char *compress_location(SWISH * sw, LOCATION * l)
{
    unsigned char *p, *q;
    int i, max_size;
    /* A single flag byte is used to save space and to carry a few small
     * values; it is located by compress_location_values(). */
    unsigned char *flag;
    struct MOD_Index *idx = sw->Index;

    /* Check that the work buffer is long enough, to avoid buffer overruns. */
    /* In the worst case an integer will need MAXINTCOMPSIZE bytes, */
    /* but fortunately this is very uncommon. */
    /* 2002/01 JMRUIZ
    ** Added an extra byte (MAXINTCOMPSIZE+1) for each position's structure
    */
    max_size = sizeof(unsigned char) + sizeof(LOCATION *) + (((sizeof(LOCATION) / sizeof(int) + 1) + (l->frequency - 1)) * (MAXINTCOMPSIZE + sizeof(unsigned char)));

    /* Reallocate if needed (200 bytes of slack are added) */
    if (max_size > idx->len_compression_buffer) {
        idx->len_compression_buffer = max_size + 200;
        idx->compression_buffer = erealloc(idx->compression_buffer, idx->len_compression_buffer);
    }

    /* Pointer to the buffer */
    p = idx->compression_buffer;

    /*
     * Add extra bytes for handling the linked list: skip over room for one
     * LOCATION pointer, because the first member of LOCATION is a pointer.
     *
     * ***JMRUIZ  memcpy(p,&l->next,sizeof(LOCATION *));
     *
     * NOTE(review): in the one-line listing this remark was a "//" comment
     * that ran to the end of the line and therefore hid the rest of the
     * function. It is kept disabled here; confirm against the upstream
     * compress.c whether the memcpy is meant to be live.
     */
    p += sizeof(LOCATION *);

    /* Add the metaID; compress3() returns the advanced write pointer */
    p = compress3(l->metaID, p);

    compress_location_values(&p, &flag, l->filenum, l->frequency, l->posdata);
    compress_location_positions(&p, flag, l->frequency, l->posdata);

    /* Get the length of all the data */
    i = p - idx->compression_buffer;

    /* Did we overrun our buffer? (detected only after the writes) */
    if (i > idx->len_compression_buffer)
        progerr("Internal error in compress_location routine");

    q = (unsigned char *) Mem_ZoneAlloc(idx->currentChunkLocZone, i);
    memcpy(q, idx->compression_buffer, i);

    return (unsigned char *) q;
}

idx结构（struct MOD_Index *idx = sw->Index）中有compression_buffer变量，用作压缩时的工作缓冲区；
先压缩metaID，然后压缩filenum、frequency，以及position信息。

2.4.3 compress_location_values压缩过程

void compress_location_values(unsigned char **buf,unsigned char **flagp,int filenum,int frequency, unsigned int *posdata) { unsigned char *p = *buf; unsigned char *flag; int structure = GET_STRUCTURE(posdata[0]); int common_structure = COMMON_STRUCTURE; int i; /* Make room for flag and init it */ flag = p; *flagp = p; p++; /*默认存放0x80*/ *flag = IS_FLAG; /*压缩存放filenum*/ /* Add file number */ p = compress3(filenum, p); /*如果频率为1，位置小于128，（01111111可以进行表示），结构为IN_FILE（纯文本方式），则将flag置为位置postion信息*/ /* Check for special case frequency == 1 and position[0] < 128 && structure == IN_FILE */ if(frequency == 1 && (GET_POSITION(posdata[0]) < 128) && structure == IN_FILE) { /* Remove IS_FLAG and store position in the lower 7 bits */ /* In this way we have 0bbbbbbb in *flag ** where bbbbbbb is the position and the leading 0 bit ** indicates that frequency is 1 and position is < 128 */ *flag = (unsigned char) ((int)(GET_POSITION(posdata[0]))); } /*如果不满足以上条件，则判断该词条是否都处于同一个结构中， *如果频率信息小于16，存放在flag中，如果大于，则compress压缩，如果这个LOCATION中的词条的位置都是处于同一个结构中，则将结构信息存放在flag中*/ else { /* Otherwise IS_FLAG is set */ /* Now, let's see if all positions have the same structure to ** get better compression */ for(i=1;i<frequency;i++) { if(structure != GET_STRUCTURE(posdata[i])) { common_structure = 0; break; } } if(frequency < 16) (*flag) |= frequency; /* Store freequency in flag - low 4 bits */ else p = compress3(frequency, p); /* Otherwise, leave frequency "as is" */ /* Add structure if it is equal for all positions */ if(common_structure) { switch(structure) { case IN_FILE: *flag |= COMMON_IN_FILE; break; case IN_BODY | IN_FILE: *flag |= COMMON_IN_HTML_BODY; break; default: *p++ = (unsigned char) structure; *flag |= COMMON_STRUCTURE; break; } } } *buf = p; }

通过以上的处理，filenum和频率（frequency）都完成了压缩。