文件结构
名称 | 扩展名 | 数据结构 | 说明 |
---|---|---|---|
Segments File | segments.gen segments_N | SegmentInfos | 保存当前索引中所有的段信息的集合,索引中所有可用的段信息都存储在段文件segment_N |
Lock File | write.lock | | 写锁,用于阻止多个IndexWriter写同一个索引文件 |
Segment Info | .si | Lucene70SegmentInfoFormat | segment的元数据信息,指明这个segment都包含哪些文件 |
Compound File | .cfs, .cfe | Lucene50CompoundFormat | 如果启用compound功能,会将段的其他索引文件合并到这2个文件内 |
Fields | .fnm | Lucene60FieldInfosFormat | 存储有哪些Field,以及相关信息 |
Field Index | .fdx | Lucene50StoredFieldsFormat | Field数据文件的索引 |
Field Data | .fdt | Lucene50StoredFieldsFormat | Field数据文件 |
Term Dictionary | .tim | BlockTreeTermsWriter | Term词典 |
Term Index | .tip | BlockTreeTermsWriter | 指向Term词典的索引 |
Frequencies | .doc | Lucene50PostingsWriter | 存储包含每个Term的文档列表以及Term在文档中的词频(位置信息存储在.pos文件中) |
Payloads | .pay | Lucene50PostingsWriter | offset偏移/payload附加信息 |
Norms | .nvd, .nvm | Lucene70NormsFormat | .nvm保存加权因子元数据;.nvd存储加权数据 |
Per-Document Values | .dvd, .dvm | Lucene70DocValuesFormat | .dvm存文档正排元数据;.dvd存文档正排数据 |
Term Vector Index | .tvx | Lucene50TermVectorsFormat | 指向tvd的offset |
Term Vector Data | .tvd | Lucene50TermVectorsFormat | 存储term vector信息 |
Live Documents | .liv | Lucene50LiveDocsFormat | 存活的文档列表。位图形式 |
Point values | .dii, .dim | Lucene60PointsFormat | 多维数据,地理位置等信息,用于处理数值型的查询 |
写入数据
我们以下面代码为例看看lucene是如何写入数据的,lucene的版本是8.1.0
@Test
public void createIndexTest() throws IOException {
// Four documents sharing a single "desc" text field; the heavily repeated
// terms make the resulting inverted index easy to inspect on disk.
Document document = new Document();
Document document1 = new Document();
Document document2 = new Document();
Document document3 = new Document();
document.add(new TextField("desc", "common common common common common term", Field.Store.YES));
document1.add(new TextField("desc", "common common common common common term term", Field.Store.YES));
document2.add(new TextField("desc", "term term term common common common common common", Field.Store.YES));
document3.add(new TextField("desc", "term", Field.Store.YES));
StandardAnalyzer standardAnalyzer = new StandardAnalyzer();
IndexWriterConfig indexWriterConfig = new IndexWriterConfig(standardAnalyzer);
// Disable the compound file so every index component is written as its own file.
indexWriterConfig.setUseCompoundFile(false);
// try-with-resources: the original version never closed the Directory and
// would leak the write.lock if addDocument threw before close().
try (Directory directory = FSDirectory.open(Paths.get("D:/lucene/index"));
     IndexWriter indexWriter = new IndexWriter(directory, indexWriterConfig)) {
    indexWriter.addDocument(document);
    indexWriter.addDocument(document1);
    indexWriter.addDocument(document2);
    indexWriter.addDocument(document3);
}
}
创建一个数据写入目录对象,所有数据都会写入到此处设置的路径下
Directory directory = FSDirectory.open(Paths.get("D:/lucene/index"));
创建一个写入配置,并设置了标准分析器
IndexWriterConfig indexWriterConfig = new IndexWriterConfig(standardAnalyzer);
// Quoted from Lucene 8.1.0: default configuration applied by the
// LiveIndexWriterConfig constructor (base class of IndexWriterConfig).
LiveIndexWriterConfig(Analyzer analyzer) {
this.analyzer = analyzer;
// Max RAM the indexing buffer may use before an automatic flush
ramBufferSizeMB = IndexWriterConfig.DEFAULT_RAM_BUFFER_SIZE_MB;
// Max number of buffered documents before a flush
maxBufferedDocs = IndexWriterConfig.DEFAULT_MAX_BUFFERED_DOCS;
mergedSegmentWarmer = null;
// Deletion policy: keep only the most recent commit point
delPolicy = new KeepOnlyLastCommitDeletionPolicy();
commit = null;
// Compound file format is enabled by default
useCompoundFile = IndexWriterConfig.DEFAULT_USE_COMPOUND_FILE_SYSTEM;
// Open mode: create a new index, or append if one already exists
openMode = OpenMode.CREATE_OR_APPEND;
// Default scoring similarity (BM25)
similarity = IndexSearcher.getDefaultSimilarity();
// Segment merges run concurrently on background threads
mergeScheduler = new ConcurrentMergeScheduler();
// Indexing chain: builds the forward (stored fields / doc values)
// and inverted index data for each document
indexingChain = DocumentsWriterPerThread.defaultIndexingChain;
// Codec defaults to Lucene80; can be extended via the SPI mechanism
codec = Codec.getDefault();
if (codec == null) {
throw new NullPointerException();
}
infoStream = InfoStream.getDefault();
// Segment merge policy (tiered merging)
mergePolicy = new TieredMergePolicy();
// Flush policy: flush when RAM usage or buffered doc count is exceeded
flushPolicy = new FlushByRamOrCountsPolicy();
readerPooling = IndexWriterConfig.DEFAULT_READER_POOLING;
// Pool of per-thread DocumentsWriterPerThread indexing states
indexerThreadPool = new DocumentsWriterPerThreadPool();
// Hard per-thread RAM limit before a forced flush
perThreadHardLimitMB = IndexWriterConfig.DEFAULT_RAM_PER_THREAD_HARD_LIMIT_MB;
}
创建IndexWriter对象,IndexWriter是数据写入的核心类,这里面我们为了更好地观察lucene生成了哪些文件,禁用了lucene生成复合文件,同时我们写入了很多重复数据,主要看下lucene是如何生成倒排索引的
indexWriterConfig.setUseCompoundFile(false);
IndexWriter indexWriter = new IndexWriter(directory, indexWriterConfig);
indexWriter的构造函数
public IndexWriter(Directory d, IndexWriterConfig conf) throws IOException {
enableTestPoints = isEnableTestPoints();
conf.setIndexWriter(this); // prevent reuse by other instances
config = conf;
infoStream = config.getInfoStream();
softDeletesEnabled = config.getSoftDeletesField() != null;
//创建一个write.lock文件,防止并发写入
writeLock = d.obtainLock(WRITE_LOCK_NAME);
boolean success = false;
try {
directoryOrig = d;
directory = new LockValidatingDirectoryWrapper(d, writeLock);
analyzer = config.getAnalyzer();
mergeScheduler = config.getMergeScheduler();
mergeScheduler.setInfoStream(infoStream);
codec = config.getCodec();
OpenMode mode = config.getOpenMode();
final boolean indexExists;
final boolean create;
if (mode == OpenMode.CREATE) {
indexExists = DirectoryReader.indexExists(directory);
create = true;
} else if (mode == OpenMode.APPEND) {
indexExists = true;
create = false;
} else {
indexExists = DirectoryReader.indexExists(directory);
create = !indexExists;
}
//读取文件夹下的所有文件
String[] files = directory.listAll();
//在上面配置中我们没有设置commit,则默认为null
IndexCommit commit = config.getIndexCommit();
// Set up our initial SegmentInfos:
StandardDirectoryReader reader;
if (commit == null) {
reader = null;
} else {
reader = commit.getReader();
}
//新创建索引
if (create) {
//校验
if (config.getIndexCommit() != null) {
// We cannot both open from a commit point and create:
if (mode == OpenMode.CREATE) {
throw new IllegalArgumentException("cannot use IndexWriterConfig.setIndexCommit() with OpenMode.CREATE");
} else {
throw new IllegalArgumentException("cannot use IndexWriterConfig.setIndexCommit() when index has no commit");
}
}
final SegmentInfos sis = new SegmentInfos(config.getIndexCreatedVersionMajor());
if (indexExists) {
final SegmentInfos previous = SegmentInfos.readLatestCommit(directory);
sis.updateGenerationVersionAndCounter(previous);
}
segmentInfos = sis;
rollbackSegments = segmentInfos.createBackupSegmentInfos();
// Record that we have a change (zero out all
// segments) pending:
changed();
} else if (reader != null) {
// Init from an existing already opened NRT or non-NRT reader:
if (reader.directory() != commit.getDirectory()) {
throw new IllegalArgumentException("IndexCommit's reader must have the same directory as the IndexCommit");
}
if (reader.directory() != directoryOrig) {
throw new IllegalArgumentException("IndexCommit's reader must have the same directory passed to IndexWriter");
}
if (reader.segmentInfos.getLastGeneration() == 0) {
// TODO: maybe we could allow this? It's tricky...
throw new IllegalArgumentException("index must already have an initial commit to open from reader");
}
// Must clone because we don't want the incoming NRT reader to "see" any changes this writer now makes:
segmentInfos = reader.segmentInfos.clone();
SegmentInfos lastCommit;
try {
lastCommit = SegmentInfos.readCommit(directoryOrig, segmentInfos.getSegmentsFileName());
} catch (IOException ioe) {
throw new IllegalArgumentException("the provided reader is stale: its prior commit file \"" + segmentInfos.getSegmentsFileName() + "\" is missing from index");
}
if (reader.writer != null) {
// The old writer better be closed (we have the write lock now!):
assert reader.writer.closed;
// In case the old writer wrote further segments (which we are now dropping),
// update SIS metadata so we remain write-once:
segmentInfos.updateGenerationVersionAndCounter(reader.writer.segmentInfos);
lastCommit.updateGenerationVersionAndCounter(reader.writer.segmentInfos);
}
rollbackSegments = lastCommit.createBackupSegmentInfos();
} else {
// Init from either the latest commit point, or an explicit prior commit point:
String lastSegmentsFile = SegmentInfos.getLastCommitSegmentsFileName(files);
if (lastSegmentsFile == null) {
throw new IndexNotFoundException("no segments* file found in " + directory + ": files: " + Arrays.toString(files));
}
// Do not use SegmentInfos.read(Directory) since the spooky
// retrying it does is not necessary here (we hold the write lock):
segmentInfos = SegmentInfos.readCommit(directoryOrig, lastSegmentsFile);
if (commit != null) {
// Swap out all segments, but, keep metadata in
// SegmentInfos, like version & generation, to
// preserve write-once. This is important if
// readers are open against the future commit
// points.
if (commit.getDirectory() != directoryOrig) {
throw new IllegalArgumentException("IndexCommit's directory doesn't match my directory, expected=" + directoryOrig + ", got=" + commit.getDirectory());
}
SegmentInfos oldInfos = SegmentInfos.readCommit(directoryOrig, commit.getSegmentsFileName());
segmentInfos.replace(oldInfos);
changed();
if (infoStream.isEnabled("IW")) {
infoStream.message("IW", "init: loaded commit \"" + commit.getSegmentsFileName() + "\"");
}
}
rollbackSegments = segmentInfos.createBackupSegmentInfos();
}
commitUserData = new HashMap<>(segmentInfos.