How a Search Engine Works: A Brief Introduction and a Simple Implementation

This article describes how a simple search engine works, covering its two main modules: data collection and information retrieval. Data collection uses a crawler to fetch web pages, split their content into words, and store the results in a database; information retrieval looks up URLs by keyword and ranks them using several signals such as word frequency and word location.


A search engine consists of two modules: 1) data collection and 2) information retrieval.

Data Collection

Data collection is done by a crawler. All of the content the crawler fetches is split into words, and the word-to-URL relationships are stored in a database; a minimal end-to-end sketch follows the list below.

  • The crawler can be written with urllib2
  • Page content can be extracted with BeautifulSoup
  • The index can be stored with sqlite3
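
Put together, these three pieces are enough for a bare-bones indexer. Below is a minimal sketch, assuming the page is reachable and using a throwaway in-memory database with a single wordUrl table (the URL and the table are placeholders for illustration only); the full demo later in this article does the same work with a proper schema and a real crawl loop.

import re
import sqlite3
import urllib2
from bs4 import BeautifulSoup

# fetch one page, pull out its visible text, split it into words,
# and record each (word, url) pair in a throwaway in-memory database
url = 'http://example.com/'                      # placeholder URL
html = urllib2.urlopen(url).read()
soup = BeautifulSoup(html, 'lxml')
words = [w.lower() for w in re.split('\\W+', soup.get_text()) if w != '']

db = sqlite3.connect(':memory:')
db.execute('create table wordUrl(word,url)')
db.executemany('insert into wordUrl values(?,?)', [(w, url) for w in words])
print db.execute('select count(*) from wordUrl').fetchone()[0], 'words indexed'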

Information Retrieval

Retrieval starts from the query keywords and looks up the URLs that match them. A scoring algorithm then computes a rank for each URL, and the highest-ranked result is returned first. The ranking algorithm weighs factors such as:

  • Word frequency
  • Word location (words near the top of a page usually reflect its topic)
  • Distance between the query words
  • Number of inbound links (how often the page is linked to by other pages)
  • PageRank (roughly, how reachable the page is for a user clicking links at random; every page starts with the same initial score, and after a few iterations the values converge toward reasonable ones; a minimal iteration sketch follows this list)
  • Link (anchor) text
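
To make the PageRank factor concrete, here is a minimal sketch of the iteration on a tiny hand-written link graph; the graph, the 0.85 damping factor, and the fixed iteration count are illustrative assumptions rather than part of the demo below. Every page starts from the same score, and each pass redistributes score along the links until the values settle.

# a tiny link graph: page -> pages it links to (illustrative data only)
links = {
    'A': ['B', 'C'],
    'B': ['C'],
    'C': ['A'],
    'D': ['C'],
}
damping = 0.85
ranks = dict((page, 1.0) for page in links)      # every page starts with the same score

for _ in range(20):                              # a handful of passes is enough here
    newranks = {}
    for page in links:
        # score flows in from every page that links to this one,
        # split evenly across that page's outgoing links
        incoming = sum(ranks[p] / len(links[p]) for p in links if page in links[p])
        newranks[page] = (1 - damping) + damping * incoming
    ranks = newranks

for page, score in sorted(ranks.items(), key=lambda x: -x[1]):
    print '%s %.3f' % (page, score)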

Demo Code

This search engine is only a simple demo that illustrates the basic principles; the real difficulty for engines like Baidu and Google lies in processing data at massive scale.

import urllib2
from bs4 import BeautifulSoup
from urlparse import urljoin
import sqlite3
import re

# common stop words that are skipped during indexing
ignoreWord = set(['the','of','to','and','it','in'])
class crawker:
    def __init__(self):
        self.db = sqlite3.connect("search.db")
    def __del__(self):
        self.db.close()
    def dbCommit(self):
        self.db.commit()
    def createIndexTable(self):
        # urlList: indexed URLs; wordList: indexed words;
        # wordLocation: where each word occurs on each page;
        # link/linkWords: the link graph and its anchor text
        self.db.execute('create table if not exists urlList(url)')
        self.db.execute('create table if not exists wordList(word)')
        self.db.execute('create table if not exists wordLocation(urlID,wordID,location)')
        self.db.execute('create table if not exists link(fromId integer,toId integer)')
        self.db.execute('create table if not exists linkWords(word,link)')

    def getEnteryId(self,table,field,value,createnew=True):
        # return the rowid for value in table.field, inserting a new row if it is not there yet
        cur = self.db.execute("select rowid from %s where %s='%s'"%(table,field,value))
        res = cur.fetchone()
        if res == None:
            cur = self.db.execute('insert into %s(%s) VALUES("%s")' %(table,field,value))
            return cur.lastrowid
        else:
            return res[0]
    def addToIndex(self,url,soup):
        if self.isIndexed(url): return
        print 'Indexing %s' %url
        text = self.getTextOnly(soup)
        words = self.separate(text)
        urlid = self.getEnteryId("urlList","url",url)
        for i in range(len(words)):
            word = words[i]
            if word in ignoreWord:continue
            wordid = self.getEnteryId('wordList','word',word)
            self.db.execute('insert into wordLocation VALUES(%d,%d,%d)' %(urlid,wordid,i))
    def getTextOnly(self,soup):
        v = soup.string
        if v == None:
            c = soup.contents
            results = ''
            for t in c:
                subText = self.getTextOnly(t)
                results = results + subText + '\n'
            return  results
        else:
            return  v.strip()
    def separate(self,text):
        splitter = re.compile('\\W*')
        return [s.lower() for s in splitter.split(text) if s!=""]
    def isIndexed(self,url):
        u = self.db.execute("select rowid from urlList where url='%s'" %url).fetchone()
        if u != None:
            # only count the URL as indexed if words have actually been recorded for it
            v = self.db.execute("select rowid from wordLocation where urlId=%d" %u[0]).fetchone()
            if v != None: return True
        return False
    def addLinkRef(self,urlFrom,urlTo,linkText):
        # would record the link and its anchor text in link/linkWords; left unimplemented in this demo
        pass
    def crawl(self,pages,depth=3):
        for i in range(depth):
            newpages=set()
            for page in pages:
                try:
                    c = urllib2.urlopen(page)
                except:
                    print "could not open %s" %page
                    continue
                soup = BeautifulSoup(c.read(),'lxml')
                self.addToIndex(page,soup)

                links = soup('a')
                for link in links:
                    if ('href' in dict(link.attrs)):
                        url = urljoin(page,link['href'])
                        if url.find("'") != -1: continue
                        url = url.split('#')[0]
                        if url[0:4] == "http" and not self.isIndexed(url):
                            newpages.add(url)
                        linkText = self.getTextOnly(link)
                        self.addLinkRef(page,url,linkText)
                self.dbCommit()
            pages = newpages

class searcher:
    def __init__(self):
        self.db = sqlite3.connect("search.db")
    def __del__(self):
        self.db.close()
    def getMatchRows(self,q):
        fieldList='w0.urlID'
        tableList=''
        clauseList=''
        wordids=[]
        words = q.split(' ')
        tableNumber = 0
        for word in words:
            wordRow = self.db.execute("select rowid from wordList where word='%s'" %word).fetchone()
            if wordRow != None:
                wordId = wordRow[0]
                wordids.append(wordId)
                if tableNumber>0:
                    tableList+=','
                    clauseList += ' and '
                    clauseList += 'w%d.urlID=w%d.urlID and ' %(tableNumber-1,tableNumber)
                fieldList+=',w%d.location' %tableNumber
                tableList+= 'wordLocation w%d' %tableNumber
                clauseList+='w%d.wordID=%d' %(tableNumber,wordId)
                tableNumber+=1
        #example: select w0.urlID,w0.location,w1.location from wordLocation w0,wordLocation w1 where w0.wordID=2 and w0.urlID=w1.urlID and w1.wordID=37
        fullQuery = 'select %s from %s where %s' %(fieldList,tableList,clauseList)
        print  fullQuery
        cur = self.db.execute(fullQuery)
        rows = [row for row in cur]
        #urlid  word1position  word2postion
        return  rows,wordids
    def getScoredList(self,rows,wordids):
        totalScores = dict([(row[0],0) for row in rows])
        weights = [(1.0,self.frequencyScores(rows))]
        for (weight,scores) in weights:
            for url in totalScores:
                totalScores[url] += weight*scores[url]
        return  totalScores
    def  getUrlName(self,id):
        return  self.db.execute('select url from urlList where rowid=%d' %id).fetchone()[0]
    def query(self,q):
        rows,wordids = self.getMatchRows(q)
        scores = self.getScoredList(rows,wordids)
        rankedScores = sorted([(score,url) for (url,score) in scores.items()],reverse=1)
        for (score,urlid) in rankedScores:
            print '%f %s' %(score,self.getUrlName(urlid))
    def normallizeScores(self,scores,smallIsBetter=0):
        vsmall = 0.00001
        if smallIsBetter :
            minScore = min(scores.values())
            return  dict([(u,float(minScore)/max(vsmall,l)) for (u,l) in scores.items()])
        else:
            maxScore = max(scores.values())
            if maxScore == 0:maxScore = vsmall
            return dict([(u,float(c)/maxScore) for (u,c) in scores.items()])
    def  frequencyScores(self,rows):
        counts = dict([(row[0],0) for row in rows])
        for row in rows: counts[row[0]] += 1
        return  self.normallizeScores(counts)

if __name__ == '__main__':
    crawer = crawker()
    #crawer.createIndexTable()
    #crawer.crawl(["http://www.cnblogs.com/huxi/archive/2010/07/04/1771073.html"])
    se = searcher()
    se.query("python xml")   # query() prints the ranked results directly
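
The searcher above only weighs word frequency. As a rough sketch of the word-location and word-distance factors listed earlier, in the spirit of Programming Collective Intelligence, the class could be extended with the two methods below; they are not part of the original demo, and they assume getMatchRows returned at least one row.

    # these would sit inside the searcher class next to frequencyScores;
    # rows come from getMatchRows as (urlid, word1location, word2location, ...)
    def locationScore(self,rows):
        # smaller summed locations mean the query words appear nearer the top of the page
        locations = dict([(row[0],1000000) for row in rows])
        for row in rows:
            loc = sum(row[1:])
            if loc < locations[row[0]]: locations[row[0]] = loc
        return self.normallizeScores(locations,smallIsBetter=1)

    def distanceScore(self,rows):
        # with a single query word distance is meaningless, so every page scores the same
        if len(rows[0]) <= 2: return dict([(row[0],1.0) for row in rows])
        # otherwise prefer pages where the query words appear close together
        minDistance = dict([(row[0],1000000) for row in rows])
        for row in rows:
            dist = sum([abs(row[i]-row[i-1]) for i in range(2,len(row))])
            if dist < minDistance[row[0]]: minDistance[row[0]] = dist
        return self.normallizeScores(minDistance,smallIsBetter=1)

To take effect they would also be added to the weights list in getScoredList, for example weights = [(1.0,self.frequencyScores(rows)),(1.0,self.locationScore(rows)),(1.0,self.distanceScore(rows))].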

References

  • Programming Collective Intelligence (集体智慧编程)