文档和域
文档是Lucene索引和搜索的原子单位,文档为包含一个或多个域的容器,而域则依次包含“真正的”被索引内容。
索引
提取文本->创建对应Document实例->通过分析将域文本处理成大量语汇单元->将语汇单元加入段结构
使用倒排索引的数据结构进行存储,能够有效地利用磁盘空间,把文档中提取出的语汇单元作为查询关键字
索引步骤
1 首先创建Directory对象用于存放索引
store=SimpleFSDirectory(File(storeDir))
2 接下来在Directory对象上创建IndexWriter对象
config=IndexWriterConfig(Version.LUCENE_CURRENT,analyzer)
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
writer=IndexWriter(store,config)
3 创建Document对象和Fields对象,并将Document加入索引
域
域索引选项
Field.Index.* 通过倒排索引来控制域文本是否可被索引。
Index.ANALYZED:被分析器分析,分析器提供的主要功能是将文本处理成大量语汇单元,例如文章正文等就需要被解析。
Index.NOT_ANALYZED:对当前域不进行分析,例如一些不想被改变的内容。
域存储选项
Field.Store.* 用来确定是否需要存储域的真实值, 以便后续搜索时能恢复这个值。
域的项向量选项
项向量是介于索引域和存储域的一个中间结构。
域选项组合
| 索引选项 | 存储选项 | 项向量 | 使用范例 |
|---|---|---|---|
| NOT_ANALYZED_NO_NORMS | YES | NO | 标识符,姓名,电话,日期 |
| ANALYZED | YES | WITH_POSITIONS_OFFSETS | 文档标题,摘要 |
| ANALYZED | NO | WITH_POSITIONS_OFFSETS | 文档正文 |
| NO | YES | NO | 文档类型,数据库主键 |
| NOT_ANALYZED | NO | NO | 隐藏的关键词 |
例如content的FieldType:
t2=FieldType()
t2.setIndexed(True)
t2.setStored(False)
t2.setTokenized(True)
t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
索引代码:
#!/usr/bin/env python
#coding:utf-8
INDEX_DIR = "IndexFiles.index"
import sys, os, lucene, threading, time
from datetime import datetime
from java.io import File
from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer
from org.apache.lucene.analysis.core import WhitespaceAnalyzer
from org.apache.lucene.document import Document, Field, FieldType
from org.apache.lucene.index import FieldInfo, IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version
"""
This class is loosely based on the Lucene (java implementation) demo class
org.apache.lucene.demo.IndexFiles. It will take a directory as an argument
and will index all of the files in that directory and downward recursively.
It will index on the file path, the file name and the file contents. The
resulting Lucene index will be placed in the current directory and called
'index'.
"""
class Ticker(object):
    """Progress indicator: prints a dot to stdout once per second.

    Meant to be run on a background thread while a long operation
    (e.g. an index commit) completes; clear ``tick`` from another
    thread to stop the loop.
    """

    def __init__(self):
        # Polled by run(); set to False externally to terminate the loop.
        self.tick = True

    def run(self):
        # Heartbeat loop: one dot per second while the flag stays set.
        while self.tick:
            sys.stdout.write('.')
            sys.stdout.flush()
            time.sleep(1.0)
class IndexFiles(object):
"""Usage: python IndexFiles <doc_dand will index all of the files in that directory and downward recursively.
irectory>"""
def __init__(self, root, storeDir, analyzer):
if not os.path.exists(storeDir):
os.mkdir(storeDir)
store = SimpleFSDirectory(File(storeDir))
analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
writer = IndexWriter(store, config)
self.indexDocs(root, writer)
ticker = Ticker()
print 'commit index',
threading.Thread(target=ticker.run).start()
writer.commit()
writer.close()
ticker.tick = False
print 'done'
def indexDocs(self, root, writer):
t1 = FieldType()
t1.setIndexed(True)
t1.setStored(True)
t1.setTokenized(False)
t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
t2 = FieldType()
t2.setIndexed(True)
t2.setStored(False)
t2.setTokenized(True)
t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
for root, dirnames, filenames in os.walk(root):
for filename in filenames:
#print filename
if not filename.endswith('.txt'):
continue
print "adding", filename
#try:
path = os.path.join(root, filename)
file = open(path)
# contents = unicode(file.read(), 'iso-8859-1')
contents = unicode(file.read(), 'utf-8')
file.close()
if len(contents) > 0:
sentences = contents.split('###')
i = 0
for sentence in sentences:
i += 1
#print i
doc = Document()
doc.add(Field("name", filename, t1))
doc.add(Field("path", root, t1))
doc.add(Field("sentence_id", str(i), Field.Store.YES, Field.Index.NOT_ANALYZED))
doc.add(Field("contents", sentence, t2))
writer.addDocument(doc)
else:
print "warning: no content in %s" % filename
if __name__ == '__main__':
lucene.initVM(vmargs=['-Djava.awt.headless=true'])
print 'lucene', lucene.VERSION
start = datetime.now()
#try:
base_dir = os.path.dirname(os.path.abspath('.'))
print base_dir
IndexFiles(".", os.path.join(base_dir, INDEX_DIR),
WhitespaceAnalyzer(Version.LUCENE_CURRENT))
end = datetime.now()
print end - start
搜索
#!/usr/bin/env python
#coding:utf-8
INDEX_DIR = "IndexFiles.index"
import sys, os, lucene
from java.io import File
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.analysis.core import WhitespaceAnalyzer
from org.apache.lucene.index import DirectoryReader, IndexReader, Term
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.search import IndexSearcher, Explanation
from org.apache.lucene.util import Version
"""
This script is loosely based on the Lucene (java implementation) demo class
org.apache.lucene.demo.SearchFiles. It will prompt for a search query, then it
will search the Lucene index in the current directory called 'index' for the
search query entered against the 'contents' field. It will then display the
'path' and 'name' fields for each of the hits it finds in the index. Note that
search.close() is currently commented out because it causes a stack overflow in
some cases.
"""
def run(searcher, analyzer, reader):
while True:
print
print "Hit enter with no input to quit."
command = raw_input("Query:")
if command == '':
return
print
print "Searching for:", command
term = Term("contents", command)
print term.toString()
term_vector = reader.totalTermFreq(term)
print "%s total terms" % term_vector
query = QueryParser(Version.LUCENE_CURRENT, "contents",
analyzer).parse(command)
scoreDocs = searcher.search(query, 10000).scoreDocs
print "%s total matching documents." % len(scoreDocs)
for scoreDoc in scoreDocs:
doc = searcher.doc(scoreDoc.doc)
explanation = searcher.explain(query, scoreDoc.doc)
#print explanation.toString()
print 'path:', doc.get("path"), 'name:', doc.get("name"), doc.get("sentence_id")
if __name__ == '__main__':
lucene.initVM(vmargs=['-Djava.awt.headless=true'])
print 'lucene', lucene.VERSION
base_dir = os.path.dirname(os.path.abspath("."))
print base_dir
directory = SimpleFSDirectory(File(os.path.join(base_dir, INDEX_DIR)))
searcher = IndexSearcher(DirectoryReader.open(directory))
reader = IndexReader.open(directory)
analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
run(searcher, analyzer, reader)
del searcher
很好的参考文献:http://www.cppblog.com/baby-fly/archive/2010/03/08/109189.html
1736

被折叠的 条评论
为什么被折叠?



