转自 http://blog.csdn.net/caoxu1987728/archive/2008/07/18/2673492.aspx
由文章标题可知 我们要建立数据库和索引。
一,定义Product类
此类相当于MVC中的容器,装载了数据库和索引所需要的对象,例如:category、name、type、content、summary、imageURI、originalUrl、updatedtime。顺序没关系,代码如下:
package com.luceneheritrixbook.core;
/**
 * Mutable holder for the attributes of one product.
 * <p>
 * All eight attributes are plain {@code String}s that start out as
 * {@code null}. Each one has a getter and a setter that read or overwrite the
 * backing field directly, with no validation or copying.
 */
public class Product {
// Attribute storage; every field stays null until its setter is called.
private String category=null;
private String name=null;
private String type=null;
private String content=null;
private String summary=null;
private String imageURI=null;
// Exposed through getUpdatetime()/setUpdatetime(); the accessor names drop the "d".
private String updatedtime=null;
private String originalUrl=null;
/** Returns the category, or {@code null} if none has been set. */
public String getCategory() {
return category;
}
/** Replaces the category. */
public void setCategory(String category) {
this.category = category;
}
/** Returns the content text, or {@code null} if none has been set. */
public String getContent() {
return content;
}
/** Replaces the content text. */
public void setContent(String content) {
this.content = content;
}
/** Returns the image URI, or {@code null} if none has been set. */
public String getImageURI() {
return imageURI;
}
/** Replaces the image URI. */
public void setImageURI(String imageURI) {
this.imageURI = imageURI;
}
/** Returns the product name, or {@code null} if none has been set. */
public String getName() {
return name;
}
/** Replaces the product name. */
public void setName(String name) {
this.name = name;
}
/** Returns the original URL, or {@code null} if none has been set. */
public String getOriginalUrl() {
return originalUrl;
}
/** Replaces the original URL. */
public void setOriginalUrl(String originalUrl) {
this.originalUrl = originalUrl;
}
/** Returns the summary, or {@code null} if none has been set. */
public String getSummary() {
return summary;
}
/** Replaces the summary. */
public void setSummary(String summary) {
this.summary = summary;
}
/** Returns the product type, or {@code null} if none has been set. */
public String getType() {
return type;
}
/** Replaces the product type. */
public void setType(String type) {
this.type = type;
}
/** Returns the value of the {@code updatedtime} field, or {@code null} if none has been set. */
public String getUpdatetime() {
return updatedtime;
}
/** Stores the given value in the {@code updatedtime} field. */
public void setUpdatetime(String updatetime) {
this.updatedtime = updatetime;
}
}二:定义Lucene的Document格式(即用于搜索的field域)
package com.luceneheritrixbook.index;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import com.luceneheritrixbook.core.Product;
/**
 * Converts a {@link Product} into the Lucene {@link Document} that is written
 * to the search index.
 * <p>
 * Every field is stored. The id, the index timestamp and the URL are indexed
 * as single un-tokenized terms; category, name, type and the combined
 * catch-all text are tokenized by the writer's analyzer.
 */
public class ProductDocument {
    private static final String PRODUCT_ID = "productid";
    private static final String INDEX_TIME = "indextime";
    // NOTE(review): "productrul" looks like a typo for "producturl". It is kept
    // unchanged because code that reads the index may rely on this exact name.
    private static final String PRODUCT_URL = "productrul";
    private static final String CATEGORY = "category";
    private static final String PRODUCT_NAME = "name";
    private static final String PRODUCT_TYPE = "type";
    // Catch-all field. The previous code reused PRODUCT_ID here, which mixed
    // free-text tokens into the id field.
    // NOTE(review): confirm the searcher queries this field name.
    private static final String ALL = "all";

    /**
     * Builds the index document for one product.
     *
     * @param product the product to index; its URL, category, name and type are
     *                passed straight to the {@link Field} constructor
     * @param id      the identifier stored in the {@code productid} field
     * @return a new document containing the id, the current time in
     *         milliseconds, the URL, category, name, type and a catch-all
     *         field made of category, name and type joined by single spaces
     */
    public static Document buildProductDocument(Product product, int id)
    {
        Document doc = new Document();

        // Exact-match fields: never split by the analyzer.
        doc.add(new Field(PRODUCT_ID, id + "", Field.Store.YES,
                Field.Index.UN_TOKENIZED));
        long mills = System.currentTimeMillis();
        doc.add(new Field(INDEX_TIME, mills + "", Field.Store.YES,
                Field.Index.UN_TOKENIZED));
        doc.add(new Field(PRODUCT_URL, product.getOriginalUrl(), Field.Store.YES,
                Field.Index.UN_TOKENIZED));

        // Searchable text fields.
        doc.add(new Field(CATEGORY, product.getCategory(), Field.Store.YES,
                Field.Index.TOKENIZED));
        doc.add(new Field(PRODUCT_NAME, product.getName(), Field.Store.YES,
                Field.Index.TOKENIZED));
        doc.add(new Field(PRODUCT_TYPE, product.getType(), Field.Store.YES,
                Field.Index.TOKENIZED));

        // One combined field so a single query can match any of the three.
        String text = product.getCategory();
        text += " " + product.getName();
        text += " " + product.getType();
        doc.add(new Field(ALL, text, Field.Store.YES,
                Field.Index.TOKENIZED));

        return doc;
    }
}
三、对数据库进行操作(即向数据库中插入获得的product对象)
package com.luceneheritrixbook.database;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.Statement;
import com.luceneheritrixbook.core.Product;
/**
 * Small JDBC helper that inserts {@link Product} rows into the
 * {@code product} table.
 * <p>
 * One instance owns one {@link Connection}, opened in the constructor with the
 * MySQL driver; call {@link #close()} when finished.
 */
public class ProductJDBC {
    private Connection con = null;
    private boolean autoCommit = true;

    /**
     * Loads the MySQL driver and opens the connection.
     *
     * @param url JDBC URL of the database
     * @param usr database user
     * @param pwd database password
     * @throws Exception if the driver cannot be loaded or the connection fails
     */
    public ProductJDBC(String url, String usr, String pwd) throws Exception
    {
        Class.forName("com.mysql.jdbc.Driver").newInstance();
        con = DriverManager.getConnection(url, usr, pwd);
        con.setAutoCommit(autoCommit);
    }

    /**
     * Inserts one product row.
     *
     * @param p the product whose eight attributes are written to the row
     * @return the key the driver reports for the inserted row; when the driver
     *         reports none, the {@code max(id)+1} value computed just before
     *         the insert
     * @throws Exception if the id cannot be determined or the insert fails
     */
    public int addProduct(Product p) throws Exception
    {
        // Computed first, as before; used only when no generated key comes back.
        int nextid = getNextId();
        if (nextid < 0) {
            throw new Exception("Can't get next id.");
        }

        String expr = "insert into product(content,abstractcontent,url," +
                "imageurl,category,name,type,updatedtime)values(?,?,?,?,?,?,?,?)";
        PreparedStatement insert =
                con.prepareStatement(expr, Statement.RETURN_GENERATED_KEYS);
        try {
            insert.setString(1, p.getContent());
            insert.setString(2, p.getSummary());
            insert.setString(3, p.getOriginalUrl());
            insert.setString(4, p.getImageURI());
            insert.setString(5, p.getCategory());
            insert.setString(6, p.getName());
            insert.setString(7, p.getType());
            insert.setString(8, p.getUpdatetime());
            insert.execute();

            // The key assigned by the database is authoritative; max(id)+1 can
            // be wrong after deletes or when another writer inserts in between.
            ResultSet keys = insert.getGeneratedKeys();
            try {
                if (keys.next()) {
                    nextid = keys.getInt(1);
                }
            } finally {
                keys.close();
            }
        } finally {
            // Previously the statement was never closed, leaking one per insert.
            insert.close();
        }
        return nextid;
    }

    /**
     * Estimates the id of the next row as {@code max(id)+1}.
     *
     * @return the estimate, {@code 1} for an empty table, or {@code -1} if the
     *         query returned no row
     */
    private int getNextId() throws Exception {
        int result = -1;
        // coalesce: max(id) is NULL on an empty table, which getInt() reads as 0.
        // NOTE(review): assumes ids start at 1 — confirm against the table definition.
        String sql = "select coalesce(max(id),0)+1 from product";
        Statement query = con.createStatement();
        try {
            ResultSet rows = query.executeQuery(sql);
            try {
                while (rows.next()) {
                    result = rows.getInt(1);
                }
            } finally {
                rows.close();
            }
        } finally {
            query.close();
        }
        return result;
    }

    /**
     * Closes the connection if it is open. Failures are printed, not thrown,
     * and the connection reference is cleared either way.
     */
    public void close()
    {
        if (con != null)
        {
            try
            {
                con.close();
            }
            catch (Exception e)
            {
                e.printStackTrace();
            }
            finally
            {
                con = null;
            }
        }
    }
}
/*
 * 在这里我发现了一个不好的地方,那就是完全相同的两个产品信息可以同时存入数据库
 */
四、对索引进行操作(其实就是把前面所构建的词库加入JE分词,然后连同Document一起加入索引器)
package com.luceneheritrixbook.index;
import java.io.FileReader;
import jeasy.analysis.MMAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexWriter;
import com.luceneheritrixbook.core.Product;
import com.luceneheritrixbook.searchengine.config.PropertyConfiguration;
/**
 * Owns the Lucene {@link IndexWriter} used to index products.
 * <p>
 * The writer uses a JE {@link MMAnalyzer} to which the word dictionary named
 * by {@link PropertyConfiguration#getWordDictionary()} has been added.
 */
public class ProductIndexer
{
    private String indexPath = "";
    private IndexWriter writer = null;
    private Analyzer analyzer = null;
    private String dictionary_file = PropertyConfiguration.getWordDictionary();

    /**
     * Creates the analyzer and opens the index writer.
     *
     * @param indexPath directory in which the index is written
     * @throws Exception if the dictionary cannot be read or the writer cannot be opened
     */
    public ProductIndexer(String indexPath) throws Exception
    {
        this.indexPath = indexPath;
        initialize();
    }

    /** Loads the dictionary into the analyzer and opens the writer. */
    private void initialize() throws Exception
    {
        analyzer = new MMAnalyzer();
        FileReader reader = new FileReader(dictionary_file);
        try
        {
            ((MMAnalyzer) analyzer).addDictionary(reader);
        }
        finally
        {
            // The reader was previously left open for the life of the process.
            // NOTE(review): assumes addDictionary reads the whole file before returning.
            reader.close();
        }
        // The third argument (create=true) starts a fresh index at indexPath.
        writer = new IndexWriter(indexPath, analyzer, true);
    }

    /**
     * Closes the writer. Failures are printed, not thrown. The reference is
     * cleared in every case, so calling this again does nothing.
     */
    public void close()
    {
        if (writer == null)
        {
            return;
        }
        try
        {
            writer.close();
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
        finally
        {
            writer = null;
        }
    }

    /**
     * Adds one product to the index.
     *
     * @param product the product to index
     * @param id      the identifier stored with the document
     * @throws Exception if the document cannot be built or written
     */
    public void addProduct(Product product, int id) throws Exception
    {
        writer.addDocument(ProductDocument.buildProductDocument(product, id));
    }

    /** Asks the writer to optimize the index. */
    public void optimizeIndex() throws Exception
    {
        writer.optimize();
    }
}
五、调用数据库处理类和索引处理类(这是建立数据库和索引最主要的类,主要过程是这样的:首先初始化数据库和索引的实例,然后读取Heritrix抓取的镜像网页所生成的文件,通过File的循环遍历从中读取每一个product的详细信息,生成一个Product对象,这样通过参数product就可以把数据存入数据库和索引了)代码如下:
package com.luceneheritrixbook.core;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import com.luceneheritrixbook.database.ProductJDBC;
import com.luceneheritrixbook.extractor.Extractor;
import com.luceneheritrixbook.index.ProductIndexer;
import com.luceneheritrixbook.searchengine.config.PropertyConfiguration;
public class ProductTextFileProcessor
{
/**
* @param args
*/
private String[] directionaries;
private static final String dbUrl=PropertyConfiguration.getDBUrl();
private static final String dbUsr=PropertyConfiguration.getDBUsr();
private static final String dbPwd=PropertyConfiguration.getDBPwd();
private static final String indexPath=PropertyConfiguration.getIndexStorePath();
private ProductJDBC productJDBC=null;
private ProductIndexer indexer=null;
public final static int SUMMARY_LENGTH=80;//内容简介的最大数量
/**
 * Creates a processor and runs {@link #initialize()} straight away.
 */
public ProductTextFileProcessor() {
    this.initialize();
}
/**
 * Opens the database helper and then the indexer.
 * <p>
 * A failure is printed rather than thrown. Whichever field was not assigned
 * before the failure keeps its previous value.
 */
public void initialize() {
    try {
        this.productJDBC = new ProductJDBC(dbUrl, dbUsr, dbPwd);
        this.indexer = new ProductIndexer(indexPath);
    } catch (Exception failure) {
        failure.printStackTrace();
    }
}
/**
 * Stores the array of directory paths for this processor.
 *
 * @param directionaries directory paths; the array is kept by reference
 */
public void setDirectionaries(String[] directionaries) {
    this.directionaries = directionaries;
}
protected void process()throws Exception
{
if(productJDBC==null)
{
throw new Exception("Database connection failed,pls retry");
}
if(indexer==null)
{
throw new Exception("Lucene index failed,pls retry");
}
if(directionaries==null||directionaries.length==0)
{
System.out.print("失败了");
return;
}
try
{
for(int i=0;iSUMMARY_LENGTH)
{
p.setSummary(contentstr.substring(0,SUMMARY_LENGTH-1));
}
else
p.setSummary(contentstr);
p.setUpdatetime(updatetime);
//以上一个product对象已存在
//先存入数据库,然后h获得返回的id值;
int nextid=insert2DB(p);//这里出现了错误,其实还是ProductJDBC.java里面出现了错误
//用刚返回的id值,向索引中加入Product对象
buildIndex(p,nextid);
}
//索引优化
optimizeindex();
/*这只是一个函数,不能直接用来优化索引,不知道为什么
* 不直接用indexer.optimizeIndex();
*/
}
/**
 * Stores the product through the database helper.
 *
 * @param p the product to store
 * @return the id returned by {@code ProductJDBC.addProduct}
 * @throws Exception if the helper fails
 */
protected int insert2DB(Product p) throws Exception {
    int assignedId = productJDBC.addProduct(p);
    return assignedId;
}
/**
 * Adds the product to the index under the given id.
 *
 * @param p      the product to index
 * @param nextid the id to record with the indexed product
 * @throws Exception if the indexer fails
 */
protected void buildIndex(Product p, int nextid) throws Exception {
    this.indexer.addProduct(p, nextid);
}
/**
 * Optimizes the index by delegating to the indexer.
 *
 * @throws Exception if the indexer fails
 */
private void optimizeindex() throws Exception {
    this.indexer.optimizeIndex();
}
/**
 * Closes the indexer.
 *
 * @throws Exception declared for callers; the call is a plain delegation
 */
private void closeIndex() throws Exception {
    this.indexer.close();
}
/** Closes the database helper. */
private void closeDB() {
    this.productJDBC.close();
}
/* public String getDbPwd()
{
return dbPwd;
}
public String getDbUrl()
{
return dbUrl;
}
public String getDbUsr()
{
return dbUsr;
}
public String getIndexPath()
{
return indexPath;
}*/
//上述方法书上有,但我看来看去,发现它好像也没什么用,就暂时给冻结了,好像也没报错。
/**
 * Loads the product files under {@code c://product//mobile//} into the
 * database and the index.
 *
 * @param args ignored
 * @throws Exception if the database helper or the indexer could not be created
 */
public static void main(String[] args) throws Exception
{
    // The constructor already calls initialize(). Calling it again here opened
    // a second database connection without closing the first, and tried to
    // open a second writer on the same index directory.
    ProductTextFileProcessor pro = new ProductTextFileProcessor();
    String path1 = "c://product//mobile//";
    // Tell the processor which directories to read; here just one.
    pro.setDirectionaries(new String[]{path1});
    pro.process();
}
}
数据库还好说,以后肯定要用到;可是这个索引到底有什么用啊,好像后面没用到。不过我猜肯定是我弄错了,怎么可能会用不到,开玩笑嘛,等着看吧……
注:发现第五部分的代码有错误,修改如下:
package com.luceneheritrixbook.core;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import com.luceneheritrixbook.database.ProductJDBC;
import com.luceneheritrixbook.extractor.Extractor;
import com.luceneheritrixbook.index.ProductIndexer;
import com.luceneheritrixbook.searchengine.config.PropertyConfiguration;
/**
 * Command-line driver that reads product text files from a set of
 * directories and writes each product to the database and to the Lucene
 * index.
 * <p>
 * Connection settings and the index location come from
 * {@link PropertyConfiguration}. The constructor opens both resources, and
 * {@link #process()} closes them when it finishes.
 */
public class ProductTextFileProcessor
{
    private String[] directionaries;
    private static final String dbUrl = PropertyConfiguration.getDBUrl();
    private static final String dbUsr = PropertyConfiguration.getDBUsr();
    private static final String dbPwd = PropertyConfiguration.getDBPwd();
    private static final String indexPath = PropertyConfiguration.getIndexStorePath();
    private ProductJDBC productJDBC = null;
    private ProductIndexer indexer = null;
    /** Maximum number of characters of content copied into a product's summary. */
    public final static int SUMMARY_LENGTH = 80;

    /** Creates a processor and runs {@link #initialize()} straight away. */
    public ProductTextFileProcessor()
    {
        initialize();
    }

    /**
     * Opens the database helper and then the indexer.
     * <p>
     * A failure is printed rather than thrown; {@link #process()} later
     * rejects a processor whose helper or indexer is still {@code null}.
     */
    public void initialize()
    {
        try
        {
            productJDBC = new ProductJDBC(dbUrl, dbUsr, dbPwd);
            indexer = new ProductIndexer(indexPath);
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
    }

    /**
     * Sets the directories whose files {@link #process()} will read.
     *
     * @param directionaries directory paths; the array is kept by reference
     */
    public void setDirectionaries(String[] directionaries)
    {
        this.directionaries = directionaries;
    }

    /**
     * Loads every product file found in the configured directories, then
     * closes the database and the index.
     * <p>
     * Errors raised while reading or storing files are printed, not rethrown.
     * Both resources are closed whether or not an error occurred.
     *
     * @throws Exception if the database helper or the indexer was never created
     */
    protected void process() throws Exception
    {
        if (productJDBC == null)
        {
            throw new Exception("Database connection failed,pls retry");
        }
        if (indexer == null)
        {
            throw new Exception("Lucene index failed,pls retry");
        }
        if (directionaries == null || directionaries.length == 0)
        {
            System.out.print("失败了");
            return;
        }
        try
        {
            for (int i = 0; i < directionaries.length; i++)
            {
                File f = new File(directionaries[i]);
                traverse(f);
            }
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
        finally
        {
            // Previously skipped when a file failed, leaving both resources open.
            closeDB();
            closeIndex();
        }
    }

    /**
     * Reads every file directly inside {@code file}, stores it as a product
     * and indexes it, then optimizes the index.
     * <p>
     * Layout read from each file: line 1 is the URL, line 2 the name, line 3
     * the type, then content lines up to a line equal to
     * {@code Extractor.SEPARATOR}, then one line holding the image URI. The
     * update time is the part of the file name between its last {@code '-'}
     * and its last {@code '.'}.
     *
     * @param file the directory to read
     * @throws Exception if the directory cannot be listed or a file cannot be
     *                   read, stored or indexed
     */
    protected void traverse(File file) throws Exception
    {
        String[] files = file.list();
        if (files == null)
        {
            // File.list() returns null for a path that is not a readable directory.
            throw new Exception("Not a readable directory: " + file.getPath());
        }
        for (int i = 0; i < files.length; i++)
        {
            File productfile = new File(file, files[i]);
            String fname = productfile.getName();
            System.out.println(fname);
            String updatetime = fname.substring(fname.lastIndexOf("-") + 1, fname.lastIndexOf("."));

            String url;
            String name;
            String type;
            String imageURI;
            StringBuffer content = new StringBuffer();
            BufferedReader reader = new BufferedReader(new FileReader(productfile));
            try
            {
                url = reader.readLine();
                name = reader.readLine();
                type = reader.readLine();
                String line = reader.readLine();
                while (line != null && !line.equals(Extractor.SEPARATOR))
                {
                    // A real CR LF; the literal "/r/n" appended four visible characters.
                    content.append(line).append("\r\n");
                    line = reader.readLine();
                }
                imageURI = reader.readLine();
            }
            finally
            {
                // The reader used to stay open for every file processed.
                reader.close();
            }

            // Build the product from what was read.
            Product p = new Product();
            p.setCategory("手机");
            p.setName(name);
            p.setType(type);
            p.setImageURI(imageURI);
            p.setOriginalUrl(url);
            String contentstr = content.toString();
            p.setContent(contentstr);
            if (contentstr.length() > SUMMARY_LENGTH)
            {
                // Keep exactly SUMMARY_LENGTH characters (the old bound kept one fewer).
                p.setSummary(contentstr.substring(0, SUMMARY_LENGTH));
            }
            else
            {
                p.setSummary(contentstr);
            }
            p.setUpdatetime(updatetime);

            // Store the row first; the id it returns is what the index records.
            int nextid = insert2DB(p);
            buildIndex(p, nextid);
        }
        // Optimize once all files of this directory have been added.
        optimizeindex();
    }

    /**
     * Stores the product through the database helper.
     *
     * @return the id returned by {@code ProductJDBC.addProduct}
     */
    protected int insert2DB(Product p) throws Exception
    {
        return productJDBC.addProduct(p);
    }

    /** Adds the product to the index under the given id. */
    protected void buildIndex(Product p, int nextid) throws Exception
    {
        indexer.addProduct(p, nextid);
    }

    /** Optimizes the index by delegating to the indexer. */
    private void optimizeindex() throws Exception
    {
        indexer.optimizeIndex();
    }

    /** Closes the indexer. */
    private void closeIndex() throws Exception
    {
        indexer.close();
    }

    /** Closes the database helper. */
    private void closeDB()
    {
        productJDBC.close();
    }

    // Accessors for dbPwd, dbUrl, dbUsr and indexPath were commented out by the
    // author as unused and are omitted here.

    /**
     * Loads the product files under {@code c://product//mobile//}.
     *
     * @param args ignored
     * @throws Exception if the database helper or the indexer could not be created
     */
    public static void main(String[] args) throws Exception
    {
        // The constructor already calls initialize(). Calling it again here
        // opened a second database connection without closing the first, and
        // tried to open a second writer on the same index directory.
        ProductTextFileProcessor pro = new ProductTextFileProcessor();
        String path1 = "c://product//mobile//";
        // Tell the processor which directories to read; here just one.
        pro.setDirectionaries(new String[]{path1});
        pro.process();
    }
}
本文来自CSDN博客,转载请标明出处:http://blog.csdn.net/caoxu1987728/archive/2008/07/18/2673492.aspx