
The goal is to index the commonly used document types: doc, txt, and html. The key point is that Lucene only needs plain strings as input, so each doc or html file just has to be stripped down to a plain string that retains the useful text.

Following the Lucene demo, create a DOCDocument.java whose text extraction uses POI, pull the shared logic up into a base class FileDocument, and dynamically load one handler class per extension by naming convention (upper-cased extension + "Document").

The base class FileDocument:
package bts.jsp.kbase;

import java.io.*;
import java.util.Map;
import java.util.HashMap;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

public abstract class FileDocument {

    // maps a lower-case file extension to the handler that extracts text for that type
    static Map<String, FileDocument> documentMap;

    static {
        try {
            documentMap = init();
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }

    // load one handler per configured extension by naming convention:
    // e.g. "doc" -> bts.jsp.kbase.DOCDocument
    private static Map<String, FileDocument> init() throws Exception {
        Map<String, FileDocument> map = new HashMap<String, FileDocument>();
        for (String t : KbaseConfig.TYPES) {
            map.put(t, (FileDocument) Class.forName(
                    "bts.jsp.kbase." + t.toUpperCase() + "Document").newInstance());
        }
        return map;
    }

    // read the previously cached plain-text extraction of the given source file
    public static String getCacheStringContent(String path) {
        String stringPath = KbaseConfig.getCacheStringPath(path);
        System.out.println("get cache: " + stringPath);
        StringBuilder sb = new StringBuilder();
        try {
            BufferedReader reader = new BufferedReader(new FileReader(stringPath));
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line).append("\n");
            }
            reader.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return sb.toString();
    }

    // extract the text of the source file and write it to the string cache,
    // so that later highlighting does not have to re-run the extraction
    private String cacheAndGetStringContent(String path) {
        String content = this.getTextContent(path);
        String stringPath = KbaseConfig.getCacheStringPath(path);
        System.out.println(stringPath);
        // make sure the cache directory exists
        String dir = stringPath.substring(0, stringPath.lastIndexOf("/"));
        File f = new File(dir);
        if (!f.exists()) f.mkdirs();
        try {
            PrintWriter pw = new PrintWriter(stringPath);
            pw.println(content);
            pw.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return content;
    }

    public static Document getCommonDocument(String path) {
        if (!KbaseConfig.acceptFile(path))
            return null;
        String subtype = path.substring(path.lastIndexOf(".") + 1);
        try {
            // look the handler up by lower-cased extension, matching how the map is keyed
            return documentMap.get(subtype.toLowerCase()).Document(new File(path));
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    // each subclass knows how to turn its file type into plain text
    public abstract String getTextContent(String path);

    public Document Document(File f) throws java.io.FileNotFoundException {
        // make a new, empty document
        Document doc = new Document();

        // Add the path of the file as a field named "path". Use a field that is
        // indexed (i.e. searchable), but don't tokenize the field into words.
        doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("title", f.getName(), Field.Store.YES, Field.Index.NOT_ANALYZED));

        // Add the last modified date of the file as a field named "modified",
        // again indexed but not tokenized.
        doc.add(new Field("modified",
                f.lastModified() + "",
                Field.Store.YES, Field.Index.NOT_ANALYZED));

        // Add the extracted text to a field named "contents". The text is
        // tokenized and indexed with term vectors (for highlighting) but not
        // stored; the plain-text cache written above serves as the store.
        String content = cacheAndGetStringContent(f.getPath());
        doc.add(new Field("contents", content, Field.Store.NO,
                Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));

        // return the document
        return doc;
    }
}
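With the base class in place, building the Lucene Document for any supported file is a single dispatched call. A minimal usage sketch (the writer variable is assumed to be an already-open org.apache.lucene.index.IndexWriter, and the path is hypothetical):

    // dispatch by extension: "doc" resolves to the DOCDocument handler shown below
    Document doc = FileDocument.getCommonDocument("/data/kbase/manual.doc");
    if (doc != null) {
        writer.addDocument(doc);
    }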
For example, for the doc extension there is a DOCDocument class:
package bts.jsp.kbase;

import org.apache.poi.hwpf.extractor.WordExtractor;

import java.io.FileInputStream;

/**
 * Created by IntelliJ IDEA.
 * User: yiminghe
 * Date: 2008-12-11
 * Time: 15:32:09
 */
public class DOCDocument extends FileDocument {

    /**
     * Extracts the plain text of a Word (.doc) file using POI's WordExtractor.
     */
    public String getTextContent(String path) {
        String content = "";
        FileInputStream in = null;
        try {
            in = new FileInputStream(path);
            WordExtractor wordExtractor = new WordExtractor(in);
            content = wordExtractor.getText();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (in != null) try { in.close(); } catch (Exception ignored) {}
        }
        return content;
    }
}
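The other extensions follow the same pattern. As a sketch (the original post only shows DOCDocument), the handler that the naming convention implies for the txt extension could be as simple as:

    package bts.jsp.kbase;

    import java.io.BufferedReader;
    import java.io.FileReader;

    // hypothetical handler for the "txt" extension; the loader in
    // FileDocument.init() expects it under the name bts.jsp.kbase.TXTDocument
    public class TXTDocument extends FileDocument {
        public String getTextContent(String path) {
            StringBuilder sb = new StringBuilder();
            try {
                BufferedReader reader = new BufferedReader(new FileReader(path));
                String line;
                while ((line = reader.readLine()) != null) {
                    sb.append(line).append("\n");
                }
                reader.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
            return sb.toString();
        }
    }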
The next problem is avoiding duplicate indexing, and re-indexing files after they change. This is where the file's lastModified attribute comes in: after each indexing run, record every indexed file together with its last-modified time; before the next run, compare against that record and only index files that are new or have been updated since.

In other words, each run works out which entries should be deleted and which should be added (Lucene updates an index entry by deleting the old document and adding a new one).
package bts.jsp.kbase;

import bts.roi.BtsManager;

import java.io.*;
import java.util.ArrayList;

/**
 * Created by IntelliJ IDEA.
 * User: yiminghe
 * Date: 2008-12-11
 * Time: 17:29:45
 */
public class KbaseConfig {

    // record of files that have already been indexed (path + last-modified time)
    static final String INDEXEDFILES = BtsManager.getProperty("Bts.INDEXEDFILES");

    // directory where the Lucene index is stored
    static final File INDEX_DIR = new File(BtsManager.getProperty("Bts.INDEX_DIR"));

    // directory holding the source documents
    static final String DATA_DIR = BtsManager.getProperty("Bts.DATA_DIR");

    // directory holding the extracted plain-text cache of the source documents
    static final String DATA_STRING_DIR = BtsManager.getProperty("Bts.DATA_STRING_DIR");

    // file extensions that will be indexed
    static String[] TYPES = {"html", "htm", "txt", "doc", "ppt", "xls", "pdf"};

    static {
        File f = new File(DATA_STRING_DIR);
        if (!f.exists()) f.mkdirs();
    }

    // persist the (path, lastModified) pairs of all indexed files, tab-separated
    public static void saveIndexedFiles(ArrayList<String[]> data) {
        try {
            PrintWriter pw = new PrintWriter(INDEXEDFILES);
            for (int i = 0; i < data.size(); i++) {
                String[] d = data.get(i);
                for (int j = 0; j < d.length; j++) {
                    pw.print(d[j] + "\t");
                }
                pw.println();
            }
            pw.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    // map a source-file path to the path of its plain-text cache file
    static String getCacheStringPath(String path) {
        path = path.replaceAll("\\\\", "/");
        // plain string replacement (not regex), so DATA_DIR needs no escaping
        return path.replace(KbaseConfig.DATA_DIR, KbaseConfig.DATA_STRING_DIR);
    }

    // load the (path, lastModified) pairs saved by the previous indexing run
    public static ArrayList<String[]> loadIndexedFiles() {
        ArrayList<String[]> data = new ArrayList<String[]>();
        if (new File(INDEXEDFILES).exists()) {
            try {
                BufferedReader reader = new BufferedReader(new FileReader(INDEXEDFILES));
                String line;
                while ((line = reader.readLine()) != null) {
                    if ((line = line.trim()).equals("")) continue;
                    data.add(line.split("\t"));
                }
                reader.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        return data;
    }

    // collect the (path, lastModified) pairs of all acceptable files under dir
    public static ArrayList<String[]> getCurrentFiles(String dir) {
        ArrayList<String[]> d = new ArrayList<String[]>();
        getCurrentFiles(dir, d);
        return d;
    }

    private static int indexArray(String[] array, String value) {
        value = value.trim();
        for (int i = 0; i < array.length; i++) {
            if (array[i].equals(value))
                return i;
        }
        return -1;
    }

    // a file is accepted when its extension appears in TYPES
    static boolean acceptFile(String path) {
        int index = path.lastIndexOf(".");
        if (index == -1) return false;
        String subtype = path.substring(index + 1);
        return indexArray(TYPES, subtype.toLowerCase()) != -1;
    }

    private static void getCurrentFiles(String dir, ArrayList<String[]> data) {
        File f = new File(dir);
        if (f.isDirectory()) {
            File[] fs = f.listFiles(new FileFilter() {
                public boolean accept(File pathname) {
                    return pathname.isDirectory() || acceptFile(pathname.getAbsolutePath());
                }
            });
            if (fs == null) return; // unreadable directory
            for (int i = 0; i < fs.length; i++) {
                getCurrentFiles(fs[i].getAbsolutePath(), data);
            }
            return;
        }
        if (!f.canRead()) return;
        data.add(new String[]{f.getAbsolutePath(), f.lastModified() + ""});
    }

    // paths from the old snapshot that must be removed from the index:
    // either the file was modified (it will be re-added) or it no longer exists
    public static ArrayList<String> getDeleted(ArrayList<String[]> original, ArrayList<String[]> newData) {
        ArrayList<String> result = new ArrayList<String>();
        for (int i = 0; i < original.size(); i++) {
            String path = original.get(i)[0];
            long lm = Long.parseLong(original.get(i)[1]);
            boolean modified = false;
            int j;
            for (j = 0; j < newData.size(); j++) {
                if (newData.get(j)[0].equals(path)) {
                    long lm2 = Long.parseLong(newData.get(j)[1]);
                    if (lm2 > lm) modified = true;
                    break;
                }
            }
            // modified, or no longer present on disk
            if (modified || j == newData.size()) {
                result.add(path);
            }
        }
        return result;
    }

    // paths from the new snapshot that must be (re-)indexed:
    // either the file was modified or it is entirely new
    public static ArrayList<String> getAdded(ArrayList<String[]> original, ArrayList<String[]> newData) {
        ArrayList<String> result = new ArrayList<String>();
        for (int i = 0; i < newData.size(); i++) {
            String path = newData.get(i)[0];
            long lm = Long.parseLong(newData.get(i)[1]);
            boolean modified = false;
            int j;
            for (j = 0; j < original.size(); j++) {
                if (original.get(j)[0].equals(path)) {
                    long lm2 = Long.parseLong(original.get(j)[1]);
                    if (lm > lm2) modified = true;
                    break;
                }
            }
            // modified, or newly added
            if (modified || j == original.size()) {
                result.add(path);
            }
        }
        return result;
    }
}
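The post does not show the indexing driver itself. A minimal sketch of how these pieces could tie together, assuming the Lucene 2.x APIs used elsewhere in this post (the class name KbaseIndexer and the method updateIndex are hypothetical):

    package bts.jsp.kbase;

    import java.util.ArrayList;

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.Term;

    // hypothetical driver: deletes stale entries, adds new/changed files,
    // then saves the snapshot for the next run
    public class KbaseIndexer {
        public static void updateIndex() throws Exception {
            ArrayList<String[]> old = KbaseConfig.loadIndexedFiles();
            ArrayList<String[]> now = KbaseConfig.getCurrentFiles(KbaseConfig.DATA_DIR);

            boolean create = !IndexReader.indexExists(KbaseConfig.INDEX_DIR);
            IndexWriter writer = new IndexWriter(KbaseConfig.INDEX_DIR,
                    new StandardAnalyzer(), create, IndexWriter.MaxFieldLength.UNLIMITED);

            // 1. delete documents whose files were removed or modified;
            //    "path" is indexed NOT_ANALYZED, so an exact Term matches it
            for (String path : KbaseConfig.getDeleted(old, now)) {
                writer.deleteDocuments(new Term("path", path));
            }

            // 2. add documents for new or modified files
            for (String path : KbaseConfig.getAdded(old, now)) {
                Document doc = FileDocument.getCommonDocument(path);
                if (doc != null) writer.addDocument(doc);
            }
            writer.optimize();
            writer.close();

            // 3. remember what the index now contains
            KbaseConfig.saveIndexedFiles(now);
        }
    }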
The remaining search and delete code closely follows the Lucene demo, with the contrib Highlighter added:
package bts.jsp.kbase;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.*;
import java.util.ArrayList;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.FilterIndexReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.HitCollector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocCollector;
import org.apache.lucene.search.highlight.*;

/**
 * Search over the knowledge-base index, adapted from Lucene's
 * command-line search demo.
 */
public class SearchFiles {

    /**
     * Use the norms from one field for all fields.  Norms are read into memory,
     * using a byte of memory per document per searched field.  This can cause
     * search of large collections with a large number of fields to run out of
     * memory.  If all of the fields contain only a single token, then the norms
     * are all identical and a single norm vector may be shared.
     */
    private static class OneNormsReader extends FilterIndexReader {
        private String field;

        public OneNormsReader(IndexReader in, String field) {
            super(in);
            this.field = field;
        }

        public byte[] norms(String field) throws IOException {
            return in.norms(this.field);
        }
    }

    private SearchFiles() {
    }

    /**
     * Parses the query string against the given field and returns one page of results.
     */
    public static KbaseFiles search(String field, String queries, int start, int limit) throws Exception {
        IndexReader reader = IndexReader.open(KbaseConfig.INDEX_DIR);
        IndexSearcher searcher = new IndexSearcher(reader);
        Analyzer analyzer = new StandardAnalyzer();

        QueryParser parser = new QueryParser(field, analyzer);
        Query query = parser.parse(queries);

        KbaseFiles files = null;
        if (start >= 0) {
            files = doPagingSearch(analyzer, searcher, query, start, limit);
        } else {
            doStreamingSearch(searcher, query);
        }
        return files;
    }

    /**
     * This method uses a custom HitCollector implementation which simply collects
     * the docId and score of every matching document.
     * <p/>
     * This simulates the streaming search use case, where all hits are supposed to
     * be processed, regardless of their relevance.
     */
    public static void doStreamingSearch(final IndexSearcher searcher, Query query) throws IOException {
        HitCollector streamingHitCollector = new HitCollector() {
            public void collect(int doc, float score) {
                // process the docId and score of every matching document here
            }
        };
        searcher.search(query, streamingHitCollector);
    }

    /**
     * This demonstrates a typical paging search scenario: collect just enough
     * hits to fill the requested page, then build a highlighted snippet for
     * each hit from the cached plain-text content.
     */
    public static KbaseFiles doPagingSearch(Analyzer analyzer, IndexSearcher searcher, Query query,
                                            int start, int limit) throws IOException {
        // collect only as many docs as the requested page needs
        TopDocCollector collector = new TopDocCollector(start + limit);
        searcher.search(query, collector);
        ScoreDoc[] hits = collector.topDocs().scoreDocs;
        int numTotalHits = collector.getTotalHits();
        int end = Math.min(numTotalHits, start + limit);

        // wrap matched terms in <em> tags for display
        SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter("<em>", "</em>");
        Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query));

        KbaseFiles fileResult = new KbaseFiles();
        fileResult.setTotal(numTotalHits);
        ArrayList<KbaseFile> files = new ArrayList<KbaseFile>();

        for (int i = start; i < end; i++) {
            Document doc = searcher.doc(hits[i].doc);
            String path = doc.get("path");
            if (path != null) {
                String title = doc.get("title");
                // the "contents" field is not stored, so re-read the cached plain text
                String contents = FileDocument.getCacheStringContent(path);
                String highLightText = highlighter.getBestFragment(analyzer, "contents", contents);
                String modified = doc.get("modified");
                // lastModified() is in milliseconds; drop the last three digits for seconds
                modified = modified.substring(0, modified.length() - 3);
                files.add(new KbaseFile(title, path, modified, highLightText));
            }
        }
        fileResult.setFiles(files);
        return fileResult;
    }
}
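The KbaseFile and KbaseFiles result holders are not listed in this post; a minimal sketch, reconstructed purely from how they are used above:

    package bts.jsp.kbase;

    import java.util.ArrayList;

    // hypothetical value classes matching the calls made in SearchFiles
    public class KbaseFiles {
        private int total;                  // total number of matching documents
        private ArrayList<KbaseFile> files; // the current page of results

        public void setTotal(int total) { this.total = total; }
        public int getTotal() { return total; }
        public void setFiles(ArrayList<KbaseFile> files) { this.files = files; }
        public ArrayList<KbaseFile> getFiles() { return files; }
    }

    class KbaseFile {
        private String title, path, modified, highLightText;

        public KbaseFile(String title, String path, String modified, String highLightText) {
            this.title = title;
            this.path = path;
            this.modified = modified;
            this.highLightText = highLightText;
        }

        public String getTitle() { return title; }
        public String getPath() { return path; }
        public String getModified() { return modified; }
        public String getHighLightText() { return highLightText; }
    }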
This post has walked through a Lucene-based document indexing system: per-file-type text extraction, a plain-text cache, an incremental index-update strategy based on last-modified times, and highlighted search results.