Python学习打卡--day32（练习：简单搜索引擎）

最新推荐文章于 2022-10-14 21:32:23 发布

hengxiaogu

最新推荐文章于 2022-10-14 21:32:23 发布

阅读量243

点赞数

分类专栏： python学习打卡　文章标签： python

本文链接：https://blog.youkuaiyun.com/hengxiaogu/article/details/90967633

版权

python学习打卡专栏收录该内容

58 篇文章

订阅专栏

基础类

"""
SearchEngineBase可以被继承，继承的类分别代表不同的算法引擎；
每一个引擎都应该实现process_corpus 和 search 函数
"""


class SearchEngineBase(object):
    """Base class for search engines.

    Each subclass represents a different search algorithm and must override
    both ``process_corpus`` and ``search``; the base versions only raise.
    """

    def __init__(self):
        pass

    def add_corpus(self, file_path):
        """Read a file and pass its contents to ``process_corpus``.

        The file path itself is used as the document ID.

        Args:
            file_path: Path of the text file to read.
        """
        # No encoding is passed, so open() falls back to the platform default.
        with open(file_path, 'r') as fin:
            text = fin.read()
        self.process_corpus(file_path, text)

    def process_corpus(self, id, text):
        """Process one document and store the result under ``id``.

        Args:
            id: Document identifier (``add_corpus`` passes the file path).
            text: Full text of the document.

        Raises:
            NotImplementedError: Always; subclasses must override this.
        """
        # NotImplementedError subclasses Exception, so callers that catch
        # Exception keep working.
        raise NotImplementedError('process_corpus not implemented.')

    def search(self, query):
        """Look up ``query`` and return the results.

        Args:
            query: The query string entered by the user.

        Raises:
            NotImplementedError: Always; subclasses must override this.
        """
        raise NotImplementedError('search not implemented.')


def main(search_engine):
    """Load the fixed corpus into ``search_engine`` and run a query loop.

    The five files ``./1.txt`` .. ``./5.txt`` are added through
    ``search_engine.add_corpus``. The function then prompts for a query,
    prints the number of results followed by one result per line, and
    repeats until input ends (EOF) or the user interrupts with Ctrl-C.

    Args:
        search_engine: Object providing ``add_corpus(file_path)`` and
            ``search(query)``; ``search`` must return a sized iterable.
    """
    for file_path in ['./1.txt', './2.txt', './3.txt', './4.txt', './5.txt']:
        search_engine.add_corpus(file_path)

    while True:
        try:
            query = input("请输入关键词：")
        except (EOFError, KeyboardInterrupt):
            # The loop has no other exit; stop cleanly instead of crashing.
            break
        results = search_engine.search(query)
        print('found {} results :'.format(len(results)))
        # A distinct loop variable keeps the results list from being rebound.
        for result in results:
            print(result)

最简单的搜索引擎

"""
最简单的搜索引擎，通过输入的关键词找到所在文件路径，并且统计返回
不足：只针对单个单词判断，每次检索占用大量时间
"""
from test06.test0605.search_base import *


class SimpleEngine(SearchEngineBase):
    """Search engine that keeps each document's raw text and does substring lookups."""

    def __init__(self):
        super().__init__()
        # Maps document ID (file path) -> full document text.
        self.__id_to_text = {}

    def process_corpus(self, id, text):
        """Store ``text`` unchanged under the document ID ``id``."""
        self.__id_to_text[id] = text

    def search(self, query):
        """Return the IDs of all stored documents whose text contains ``query``."""
        return [
            doc_id
            for doc_id, body in self.__id_to_text.items()
            if query in body
        ]


# Build the substring-matching engine and start the interactive query loop.
# NOTE: there is no __main__ guard, so this also runs when the module is imported.
search_engine = SimpleEngine()
main(search_engine)

优化后的搜索引擎

"""
搜索引擎：支持搜索多个词，返回所在文件位置
不足：没有考虑输入字符串单词顺序；每次查询遍历所有的id
"""
from test06.test0605.search_base import *
import re


class BOWEngine(SearchEngineBase):
    """Bag-of-words engine: a document matches when it contains every query word.

    Word order is ignored, and every query scans all stored documents.
    """

    def __init__(self):
        super().__init__()
        # Maps document ID (file path) -> set of lowercase words in that document.
        self.__id_to_words = {}

    def process_corpus(self, id, text):
        """Tokenize ``text`` and store its word set under the document ID ``id``."""
        word_set = self.parse_text_to_words(text)
        self.__id_to_words[id] = word_set

    def search(self, query):
        """Return the IDs of documents containing every word of ``query``."""
        wanted = self.parse_text_to_words(query)
        return [
            doc_id
            for doc_id, doc_words in self.__id_to_words.items()
            if self.query_match(wanted, doc_words)
        ]

    @staticmethod
    def query_match(query_words, words):
        """Return True when every item of ``query_words`` is in ``words``."""
        return all(term in words for term in query_words)

    @staticmethod
    def parse_text_to_words(text):
        """Split ``text`` into a set of unique lowercase words.

        Every non-word character (punctuation, whitespace, newlines) is
        replaced by a space before lowercasing and splitting, and empty
        tokens are discarded.
        """
        normalized = re.sub(r'[^\w]', ' ', text).lower()
        return {token for token in normalized.split(' ') if token}


# Build the bag-of-words engine and start the interactive query loop.
# NOTE: there is no __main__ guard, so this also runs when the module is imported.
search_engine = BOWEngine()
main(search_engine)