文档和域
文档是Lucene索引和搜索的原子单位,文档为包含一个或多个域的容器,而域则依次包含“真正的”被索引内容。
索引
提取文本->创建对应Document实例->通过分析将域文本处理成大量语汇单元->将语汇单元加入段结构
使用倒排索引的数据结构进行存储,能够有效地利用磁盘空间,把文档中提取出的语汇单元作为查询关键字
索引步骤
1 首先创建Directory对象用于存放索引
store=SimpleFSDirectory(File(storeDir))
2 接下来在Directory对象上创建IndexWriter对象
config=IndexWriterConfig(Version.LUCENE_CURRENT,analyzer)
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
writer=IndexWriter(store,config)
3 创建Document对象和Fields对象,并将Document加入索引
域
域索引选项
Field.Index.* 通过倒排索引来控制域文本是否可被索引。
Index.ANALYZED:被分析器分析,分析器提供的主要功能是将文本处理成大量语汇单元,例如文章正文等就需要被解析。
Index.NOT_ANALYZED:对当前域不进行分析,例如一些不想被改变的内容。
域存储选项
Field.Store.* 用来确定是否需要存储域的真实值, 以便后续搜索时能恢复这个值。
域的项向量选项
项向量是介于索引域和存储域的一个中间结构。
域选项组合
| 索引选项 | 存储选项 | 项向量 | 使用范例 |
|---|---|---|---|
| NOT_ANALYZED_NO_NORMS | YES | NO | 标识符,姓名,电话,日期 |
| ANALYZED | YES | WITH_POSITIONS_OFFSETS | 文档标题,摘要 |
| ANALYZED | NO | WITH_POSITIONS_OFFSETS | 文档正文 |
| NO | YES | NO | 文档类型,数据库主键 |
| NOT_ANALYZED | NO | NO | 隐藏的关键词 |
例如content的FieldType:
t2=FieldType()
t2.setIndexed(True)
t2.setStored(False)
t2.setTokenized(True)
t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
索引代码:
#!/usr/bin/env python
#coding:utf-8
INDEX_DIR = "IndexFiles.index"
import sys, os, lucene, threading, time
from datetime import datetime
from java.io import File
from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer
from org.apache.lucene.analysis.core import WhitespaceAnalyzer
from org.apache.lucene.document import Document, Field, FieldType
from org.apache.lucene.index import FieldInfo, IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version
"""
This class is loosely based on the Lucene (java implementation) demo class
org.apache.lucene.demo.IndexFiles. It will take a directory as an argument
and will index all of the files in that directory and downward recursively.
It will index on the file path, the file name and the file contents. The
resulting Lucene index will be placed in the current directory and called
'index'.
"""
class Ticker(object):
    """Progress indicator: prints a dot to stdout once per second.

    Meant to be run on a background thread while a long operation
    (e.g. an index commit) completes; clear ``tick`` from another
    thread to stop the loop.
    """

    def __init__(self):
        # Polled by run(); set to False externally to terminate the loop.
        self.tick = True

    def run(self):
        # Heartbeat loop: one dot per second while the flag stays set.
        while self.tick:
            sys.stdout.write('.')
            sys.stdout.flush()
            time.sleep(1.0)
class IndexFiles(object):
"""Usage: python IndexFiles <doc_dand will index all of the files in that directory and downward recursively.
irectory>"""
def __init__(self, root, storeDir, analyzer):
if not os.path.exists(storeDir):
os.mkdir(storeDir)
store = SimpleFSDirectory(File(storeDir))
analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
writer = IndexWriter(store, config)
self.indexDocs(root, writer)
ticker = Ticker()
print 'commit index',
threading.Thread(target=ticker.run).start()
writer.commit()
writer.close()
ticker.tick = False
print 'done'
def indexDocs(self, root, writer):
t1 = FieldType()
t1.setIndexed(True)
t1.setStored(True)
t1.setTokenized(False)
t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
t2 = FieldType()
t2.setIndexed(True)
t2.setStored(False)
t2.setTokenized(True)
t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
for root, dirnames, filenames in os.walk(root):
for filename in filenames:
#print filename
if not filename.endswith('.txt'):
continue
print "adding", filename
#try:
path = os.path.join(root, filename)
file = open(path)
# contents = unicode(file.read(), 'iso-8859-1')
contents = unicode(file.read(), 'utf-8')
file.close()
if len(contents) > 0:
sentences = contents.split('###')
i = 0
for sentence in sentences:
i += 1
#print i
doc = Document()
doc.add(Field("name", filename, t1))
doc.add(Field("path", root, t1))
doc.add(Field("sentence_id", str(i), Field.Store.YES, Field.Index.NOT_ANALYZED))
doc.add(Field("contents", sentence, t2))
writer.addDocument(doc)
else:
print "warning: no content in %s" % filename
if __name__ == '__main__':
lucene.initVM(vmargs=['-Djava.awt.headless=true'])
print 'lucene', lucene.VERSION
start = datetime.now()
#try:
base_dir = os.path.dirname(os.path.abspath('.'))
print base_dir
IndexFiles(".", os.path.join(base_dir, INDEX_DIR),
WhitespaceAnalyzer(Version.LUCENE_CURRENT))
end = datetime.now()
print end - start
搜索
#!/usr/bin/env python
#coding:utf-8
INDEX_DIR = "IndexFiles.index"
import sys, os, lucene
from java.io import File
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.analysis.core import WhitespaceAnalyzer
from org.apache.lucene.index import DirectoryReader, IndexReader, Term
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.search import IndexSearcher, Explanation
from org.apache.lucene.util import Version
"""
This script is loosely based on the Lucene (java implementation) demo class
org.apache.lucene.demo.SearchFiles. It will prompt for a search query, then it
will search the Lucene index in the current directory called 'index' for the
search query entered against the 'contents' field. It will then display the
'path' and 'name' fields for each of the hits it finds in the index. Note that
search.close() is currently commented out because it causes a stack overflow in
some cases.
"""
def run(searcher, analyzer, reader):
while True:
print
print "Hit enter with no input to quit."
command = raw_input("Query:")
if command == '':
return
print
print "Searching for:", command
term = Term("contents", command)
print term.toString()
term_vector = reader.totalTermFreq(term)
print "%s total terms" % term_vector
query = QueryParser(Version.LUCENE_CURRENT, "contents",
analyzer).parse(command)
scoreDocs = searcher.search(query, 10000).scoreDocs
print "%s total matching documents." % len(scoreDocs)
for scoreDoc in scoreDocs:
doc = searcher.doc(scoreDoc.doc)
explanation = searcher.explain(query, scoreDoc.doc)
#print explanation.toString()
print 'path:', doc.get("path"), 'name:', doc.get("name"), doc.get("sentence_id")
if __name__ == '__main__':
lucene.initVM(vmargs=['-Djava.awt.headless=true'])
print 'lucene', lucene.VERSION
base_dir = os.path.dirname(os.path.abspath("."))
print base_dir
directory = SimpleFSDirectory(File(os.path.join(base_dir, INDEX_DIR)))
searcher = IndexSearcher(DirectoryReader.open(directory))
reader = IndexReader.open(directory)
analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
run(searcher, analyzer, reader)
del searcher
很好的参考文献:http://www.cppblog.com/baby-fly/archive/2010/03/08/109189.html
1736

被折叠的 条评论
为什么被折叠?



