IndexManager

使用 Lucene 实现简单的创建索引和搜索功能

需要加载以下几个包
这里写图片描述

package com.cn.shupu.util;
import java.io.BufferedReader;

import java.io.File;

import java.io.FileInputStream;

import java.io.IOException;

import java.io.InputStreamReader;

import java.nio.file.Paths;

import java.util.ArrayList;

import java.util.Date;

import java.util.HashMap;

import java.util.List;

import java.util.Map;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field.Store;

import org.apache.lucene.document.TextField;

import org.apache.lucene.index.DirectoryReader;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.index.IndexWriterConfig;

import org.apache.lucene.queryparser.classic.QueryParser;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

import org.apache.lucene.search.ScoreDoc;

import org.apache.lucene.search.TopDocs;

import org.apache.lucene.search.highlight.Fragmenter;

import org.apache.lucene.search.highlight.Highlighter;

import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;

import org.apache.lucene.search.highlight.QueryScorer;

import org.apache.lucene.search.highlight.SimpleFragmenter;

import org.apache.lucene.search.highlight.SimpleHTMLFormatter;

import org.apache.lucene.store.Directory;

import org.apache.lucene.store.FSDirectory;

public class IndexManager {

// Class-wide text buffer, initialised to the empty string.
// NOTE(review): static mutable state is shared by every caller of this class; a method-local
// variable would be safer.
private static String content = "";



// Class-wide list of files.
// NOTE(review): because this is static, anything stored here persists across calls for the
// lifetime of the class.
private static List<File> filelist = new ArrayList<File>();



/**
 * Builds (or appends to) a Lucene index at {@code indexPath} from the files found under
 * {@code sourcePath}.
 *
 * <p>One document is added for every file returned by {@link #getFileList(String)}. Each document
 * carries three stored text fields: {@code filename}, {@code content} and {@code path}.
 *
 * <p>A single directory, analyzer and writer are opened for the whole run and are closed by
 * try-with-resources, so the index write lock is released even when indexing fails. An
 * {@link IOException} is printed to stderr and stops the run; it is not rethrown.
 *
 * @param sourcePath directory that is scanned recursively for files to index
 * @param indexPath  directory in which the index files are stored
 */
public static void createIndex(String sourcePath, String indexPath) {
    long startMillis = System.currentTimeMillis();

    List<File> filesToIndex = getFileList(sourcePath);

    if (!filesToIndex.isEmpty()) {
        try (Directory dir = FSDirectory.open(Paths.get(indexPath));
                Analyzer analyzer = new SmartChineseAnalyzer();
                IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzer))) {

            for (File file : filesToIndex) {
                Document document = new Document();
                document.add(new TextField("filename", file.getName(), Store.YES));
                document.add(new TextField("content", readIndexableContent(file), Store.YES));
                document.add(new TextField("path", file.getPath(), Store.YES));
                writer.addDocument(document);
            }

            // One commit for the whole batch instead of one commit per file.
            writer.commit();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    System.out.println("创建索引-----耗时:" + (System.currentTimeMillis() - startMillis) + "ms\n");
}

/**
 * Returns the text to store in the {@code content} field for {@code file}.
 *
 * <p>The file is read only when its extension is {@code txt} (case-insensitive) and the absolute
 * path of its parent directory contains the substring {@code "txt"}; otherwise the content is
 * empty.
 * NOTE(review): the parent-path condition is kept from the original implementation; confirm
 * whether it is intentional, because text files elsewhere are indexed with empty content.
 */
private static String readIndexableContent(File file) {
    String name = file.getName();
    String extension = name.substring(name.lastIndexOf(".") + 1);
    if (!"txt".equalsIgnoreCase(extension)) {
        return "";
    }

    String absolutePath = file.getAbsolutePath();
    int lastSeparator = absolutePath.lastIndexOf(File.separator);
    if (lastSeparator < 0) {
        return "";
    }

    String parentPath = absolutePath.substring(0, lastSeparator);
    return parentPath.contains("txt") ? txt2String(file) : "";
}



/**
 * Reads a text file, decoding it as GBK, and returns its content.
 *
 * <p>Every line read is appended as {@code "\n" + line}, so a non-empty result starts with a
 * newline and has no trailing one. The reader is closed by try-with-resources, including when
 * reading fails.
 *
 * @param file the file to read
 * @return the text read so far; an empty string when {@code file} is {@code null} or cannot be
 *         opened (failures are printed to stderr, not thrown)
 */
public static String txt2String(File file) {
    StringBuilder result = new StringBuilder();
    if (file == null) {
        return result.toString();
    }

    try (BufferedReader br = new BufferedReader(
            new InputStreamReader(new FileInputStream(file), "GBK"))) {
        String line;
        while ((line = br.readLine()) != null) {
            result.append('\n').append(line);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }

    return result.toString();
}



/**
 * Recursively collects the files under a directory whose names are accepted by
 * {@link #isTxtFile(String)}.
 *
 * <p>A new list is built on every call, so repeated calls do not accumulate results from earlier
 * calls.
 *
 * @param dirPath directory to scan
 * @return the matching files; empty when {@code dirPath} is not a readable directory
 */
public static List<File> getFileList(String dirPath) {
    List<File> matches = new ArrayList<File>();

    // listFiles() returns null when the path is not a directory or cannot be read.
    File[] entries = new File(dirPath).listFiles();
    if (entries == null) {
        return matches;
    }

    for (File entry : entries) {
        if (entry.isDirectory()) {
            matches.addAll(getFileList(entry.getAbsolutePath()));
        } else if (entry.isFile() && isTxtFile(entry.getName())) {
            matches.add(entry);
        }
    }
    return matches;
}



/**
 * Tells whether a file name denotes a text file, i.e. ends with the extension {@code .txt}.
 *
 * <p>The comparison ignores case, and the name must have at least one character before the
 * extension, so a bare {@code ".txt"} is rejected. Names that merely contain {@code ".txt"}
 * somewhere in the middle (for example {@code "a.txt.bak"}) are rejected as well.
 *
 * @param fileName the file name to test; may be {@code null}
 * @return {@code true} if the name ends with {@code .txt}, otherwise {@code false}
 */
public static boolean isTxtFile(String fileName) {
    final String suffix = ".txt";
    if (fileName == null || fileName.length() <= suffix.length()) {
        return false;
    }
    int suffixStart = fileName.length() - suffix.length();
    return fileName.regionMatches(true, suffixStart, suffix, 0, suffix.length());
}



/**
 * Searches the {@code content} field of the index at {@code indexPath} and returns a highlighted
 * snippet for each hit.
 *
 * <p>At most 100 hits are requested. For every hit the stored {@code content} is passed to
 * {@link #displayHtmlHighlight} with a fragment size of 180, and the result is stored under the
 * hit's stored {@code path}. The directory, reader and analyzer are closed by try-with-resources,
 * including when parsing or searching fails.
 *
 * @param text      the query string, parsed with the classic query parser
 * @param indexPath directory that holds the index
 * @return {@code null} when {@code text} is {@code null}; otherwise a map from file path to
 *         highlighted snippet, which is empty or partial when an error occurred (errors are
 *         printed to stderr, not thrown)
 */
public static Map<String, String> searchIndex(String text, String indexPath) {
    if (text == null) {
        return null;
    }

    long startMillis = System.currentTimeMillis();
    Map<String, String> rs = new HashMap<String, String>();

    try (Directory directory = FSDirectory.open(Paths.get(indexPath));
            DirectoryReader ireader = DirectoryReader.open(directory);
            Analyzer analyzer = new SmartChineseAnalyzer()) {

        IndexSearcher isearcher = new IndexSearcher(ireader);
        Query query = new QueryParser("content", analyzer).parse(text);
        TopDocs results = isearcher.search(query, 100);

        for (ScoreDoc hit : results.scoreDocs) {
            Document hitDoc = isearcher.doc(hit.doc);
            String storedContent = hitDoc.get("content");
            String path = hitDoc.get("path");

            // 180 is the fragment size handed to the highlighter; adjust to change snippet length.
            String snippet = displayHtmlHighlight(query, analyzer, "content", storedContent, 180);
            rs.put(path, snippet);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    System.out.println("查看索引-----耗时:" + (System.currentTimeMillis() - startMillis) + "ms\n");
    return rs;
}



/**
 * Produces an HTML snippet of {@code fieldContent} in which the terms matching {@code query} are
 * wrapped in {@code <font color='red'>...</font>}.
 *
 * @param query        the query whose terms are highlighted
 * @param analyzer     the analyzer handed to the highlighter to tokenize {@code fieldContent}
 * @param fieldName    name of the field the content belongs to
 * @param fieldContent the text to highlight
 * @param fragmentSize size passed to the {@link SimpleFragmenter} that splits the text into
 *                     fragments
 * @return whatever {@code Highlighter.getBestFragment} returns for these arguments
 *         (NOTE(review): presumably {@code null} when nothing matches; confirm against the
 *         Lucene version in use)
 * @throws IOException                  propagated from the highlighter
 * @throws InvalidTokenOffsetsException propagated from the highlighter
 */
static String displayHtmlHighlight(Query query, Analyzer analyzer, String fieldName, String fieldContent,
        int fragmentSize) throws IOException, InvalidTokenOffsetsException {
    SimpleHTMLFormatter redFontFormatter = new SimpleHTMLFormatter("<font color='red'>", "</font>");
    QueryScorer scorer = new QueryScorer(query);

    Highlighter highlighter = new Highlighter(redFontFormatter, scorer);
    highlighter.setTextFragmenter(new SimpleFragmenter(fragmentSize));

    String bestFragment = highlighter.getBestFragment(analyzer, fieldName, fieldContent);
    return bestFragment;
}

}

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Index management module.

Provides unified management of vector indexes and scalar (filter-field) indexes.
"""

import logging
from typing import TYPE_CHECKING, Any, Dict, List, Optional

if TYPE_CHECKING:
    # Only needed for the type annotation below.
    from pymilvus import Collection

logger = logging.getLogger(__name__)


class IndexManager:
    """Creates, inspects and drops the indexes of one collection.

    Index lookups go through ``collection.indexes`` and every create/drop call
    passes an explicit ``index_name``. The manager never asks the collection
    for "the" index without naming it, which is the call that fails with
    ``AmbiguousIndexName`` once more than one index exists.
    """

    def __init__(self, collection: "Collection"):
        self.collection = collection
        self.index_created = False

    def _indexes_on(self, field_name: str) -> List[Any]:
        """Return the index objects of the collection that belong to ``field_name``."""
        return [idx for idx in self.collection.indexes if idx.field_name == field_name]

    def _resolve_index_name(self, field_name: str) -> str:
        """Return the name to use for the index on ``field_name``.

        The name of an index that already exists on the field is reused; otherwise a
        name derived from the field is returned.
        """
        existing = self._indexes_on(field_name)
        if existing:
            return existing[0].index_name
        return f"idx_{field_name}"

    def create_vector_index(self,
                            field_name: str = "embedding",
                            index_type: str = "HNSW",
                            metric_type: str = "COSINE",
                            **kwargs) -> bool:
        """Create a vector index on ``field_name``.

        :param field_name: name of the vector field
        :param index_type: index type placed in the index parameters
        :param metric_type: metric type placed in the index parameters
        :param kwargs: build parameters merged over the defaults ``M=16``, ``efConstruction=200``
        :return: True on success, False if creation raised (the error is logged)
        """
        try:
            # Default HNSW parameters, overridden by caller-supplied values.
            params = {"M": 16, "efConstruction": 200, **kwargs}
            index_params = {
                "index_type": index_type,
                "metric_type": metric_type,
                "params": params,
            }
            self.collection.create_index(
                field_name=field_name,
                index_params=index_params,
                index_name=self._resolve_index_name(field_name),
            )
            logger.info(f"为字段 '{field_name}' 创建了 {index_type} 索引 ({metric_type})")
            return True
        except Exception as e:
            logger.error(f"创建向量索引失败: {e}")
            return False

    def create_scalar_index(self, field_name: str, index_type: str = "Trie") -> bool:
        """Create a scalar index on ``field_name``.

        :param field_name: name of the scalar field
        :param index_type: index type placed in the index parameters
        :return: True on success, False if creation raised (the error is logged)
        """
        try:
            self.collection.create_index(
                field_name=field_name,
                index_params={"index_type": index_type},
                index_name=self._resolve_index_name(field_name),
            )
            logger.info(f"为字段 '{field_name}' 创建了 {index_type} 索引")
            return True
        except Exception as e:
            logger.error(f"创建标量索引失败: {e}")
            return False

    def create_default_indexes(self,
                               vector_field: str = "embedding",
                               scalar_fields: Optional[list] = None) -> bool:
        """Create the vector index and then one scalar index per entry of ``scalar_fields``.

        Stops at the first failure. ``index_created`` is set to True only when every
        index was created.

        :param vector_field: name of the vector field
        :param scalar_fields: names of the scalar fields to index, or None
        :return: True if all indexes were created, otherwise False
        """
        try:
            if not self.create_vector_index(vector_field):
                return False

            for field in scalar_fields or []:
                if not self.create_scalar_index(field):
                    return False

            self.index_created = True
            logger.info("默认索引配置创建完成")
            return True
        except Exception as e:
            logger.error(f"创建默认索引配置失败: {e}")
            return False

    def drop_index(self, field_name: str) -> bool:
        """Drop every index that belongs to ``field_name``.

        Each index is dropped by its own name.

        :param field_name: name of the field whose indexes are dropped
        :return: True if all matching indexes were dropped or none existed,
                 False if dropping raised (the error is logged)
        """
        try:
            targets = self._indexes_on(field_name)
            if not targets:
                logger.warning(f"字段 '{field_name}' 没有可删除的索引")
                return True
            for idx in targets:
                self.collection.drop_index(index_name=idx.index_name)
            logger.info(f"已删除字段 '{field_name}' 的索引")
            return True
        except Exception as e:
            logger.error(f"删除索引失败: {e}")
            return False

    def has_index(self, field_name: str) -> bool:
        """Check whether ``field_name`` has an index.

        :param field_name: field name
        :return: True if at least one index of the collection belongs to the field
        """
        try:
            return bool(self._indexes_on(field_name))
        except Exception as e:
            logger.error(f"检查索引状态失败: {e}")
            return False

    def get_index_info(self) -> Dict[str, Any]:
        """Describe the indexes of the collection, keyed by field name.

        Each entry holds ``index_name``, ``index_type``, ``metric_type`` and ``params``,
        read from the index object's ``params`` mapping. When a field has more than one
        index, the first one is described and ``multiple_indexes`` is set to True.

        :return: index information per field; an empty dict if listing the indexes raised
        """
        try:
            index_info: Dict[str, Any] = {}
            for idx in self.collection.indexes:
                field = idx.field_name
                if field in index_info:
                    index_info[field]["multiple_indexes"] = True
                    continue
                raw = getattr(idx, "params", None) or {}
                index_info[field] = {
                    "index_name": getattr(idx, "index_name", None),
                    "index_type": raw.get("index_type", "unknown"),
                    "metric_type": raw.get("metric_type"),
                    "params": raw.get("params", {}),
                }
            return index_info
        except Exception as e:
            logger.error(f"获取索引信息失败: {e}")
            return {}

# 这是我构建索引的代码,也是导致 AmbiguousIndexName: (code=1, message=There are multiple indexes, please specify the index_name.)出现的主要原因,我现在没什么好的构建索引的方案和防止这个错误出现的方法,你能给我一个系统的可以让人容易理解的说下,我该如何做吗
最新发布
11-05
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值