菲度垂直搜索引擎 代码注释 1

本文介绍了一个垂直搜索引擎的基础架构实现过程,包括初始化数据库连接、启动多线程进行数据抓取及处理,并最终完成索引建立。文章详细展示了如何从数据库读取初始URL,创建线程池进行网页或RSS源的抓取,以及如何处理抓取过程中遇到的错误。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

Imports System.Collections
Imports System.Data
Imports System.Data.SqlClient
Imports System.IO
Imports System.Threading
Imports Microsoft.VisualBasic
'胡光伟写于2005-08-12
'垂直搜索引擎

' Entry module of the vertical search engine crawler: loads the seed URLs from
' the database, queues one thread-pool work item per seed, waits for crawling to
' finish, then stores the results and builds the index.
Module Main
    ' Event the main thread blocks on until crawling has finished.
    ' NOTE(review): nothing in this module signals it; presumably the worker
    ' code (VisitObject) calls Set when the last seed completes - confirm.
    Public DoneEvent As New ManualResetEvent(False)
    ' One flag per seed row; every entry is initialised to False by LoadSeedNodes.
    Public DWork() As Boolean
    ' Queue of UNode entries whose Address is printed after crawling.
    ' NOTE(review): presumably filled by the workers with nodes that failed - confirm.
    Public Qerror As New Queue(1000, 1.5)
    ' One queue per seed row, created by LoadSeedNodes.
    Public Qseach() As Queue
    ' Sn is incremented once per queued work item; Psn is not touched in this module.
    Public Sn, Psn As Integer
    ' Highest valid seed index (seed row count - 1); -1 when there are no seeds.
    Public GcountinitWorkEvernt As Integer

    ' Program entry point. Any exception is written to the console; the database
    ' connection is closed on both the success and the failure path once initDb
    ' has completed.
    Sub Main()
        Dim dbOpened As Boolean = False
        Try
            Dim mywait As String
            Dim sl As UNode()
            Console.WriteLine("Begin......")
            initDb()
            dbOpened = True

            sl = LoadSeedNodes()

            ' Original author's note: each seed id starts its own worker;
            ' please do not configure more than 25 seeds.
            Dim c As Integer
            For c = 0 To UBound(sl)
                Interlocked.Increment(Sn)
                Dim vobj As New VisitObject(sl(c).MaxGet, c, sl(c))
                ThreadPool.QueueUserWorkItem(New WaitCallback(AddressOf vobj.Vall), sl(c))
            Next

            ' With no seeds nothing was queued, so waiting here would block forever.
            If sl.Length > 0 Then
                DoneEvent.WaitOne()
            End If

            Console.WriteLine("ALL FINISHED.")
            Console.WriteLine("FtG......")
            While Qerror.Count <> 0
                Dim ua As UNode = CType(Qerror.Dequeue, UNode)
                Console.WriteLine(ua.Address)
            End While

            OptizeDataBase()

            ' Original author's note: the collected items are still held in the
            ' queues; store them in the database now, RSS and web items separately.
            AddInfoItem()
            Console.WriteLine("Index DataBase.")

            ' Original author's note: uses Lucene to build the inverted index files.
            IndexDoc()

            Console.WriteLine("Finish Index DataBase.")

            ' Keep the console open until the user types "exit". ReadLine returns
            ' Nothing at end of input; stop then instead of spinning forever.
            mywait = Console.ReadLine
            While Not mywait Is Nothing AndAlso mywait <> "exit"
                mywait = Console.ReadLine
            End While
        Catch ex As Exception
            Console.WriteLine(ex.ToString)
        Finally
            If dbOpened Then
                Try
                    CloseDb()
                Catch closeEx As Exception
                    ' Same reporting the original gave a failing CloseDb call.
                    Console.WriteLine(closeEx.ToString)
                End Try
            End If
        End Try
    End Sub

    ' Reads the seed rows from the initurl table, sizes and initialises the
    ' shared per-seed state (DWork, Qseach, GcountinitWorkEvernt) and returns one
    ' UNode per row. Returns an empty array when the query yields no rows.
    Private Function LoadSeedNodes() As UNode()
        ' Original author's note: initurl holds the initial link addresses;
        ' stype "R" means the address is analysed as RSS, "W" as a web page.
        ' The query only selects the rows with id 1 to 5.
        Dim cmd As SqlCommand = New SqlCommand("select * from initurl where id between 1 and 5 order by id desc", Groblecon)
        Dim initdapter As New SqlDataAdapter(cmd)
        Dim initds As New DataSet
        Try
            initdapter.Fill(initds, "initdata")
            Dim rows As DataRowCollection = initds.Tables("initdata").Rows

            Dim sl As UNode()
            ReDim sl(rows.Count - 1)
            ReDim DWork(rows.Count - 1)
            ReDim Qseach(rows.Count - 1)
            GcountinitWorkEvernt = CInt(rows.Count - 1)

            Dim ij As Integer
            For ij = 0 To GcountinitWorkEvernt
                DWork(ij) = False
                Qseach(ij) = New Queue
            Next

            For ij = 0 To GcountinitWorkEvernt
                With rows.Item(ij)
                    ' Sn has not been incremented yet at this point, so every
                    ' seed node receives the same Sn value.
                    Dim un As New UNode(-1, Sn, 0, .Item("weburl"), False, .Item("rank"), .Item("utype"), Trim(.Item("encode")))
                    un.Tiltle = .Item("title")
                    un.Ext = .Item("Ext")
                    un.Rank = .Item("Rank")
                    un.Content = .Item("content")
                    un.InitStep = .Item("step")
                    un.MetaValue = .Item("meta")
                    un.SearchType = .Item("stype")
                    un.TemplateId = CInt(.Item("temid"))
                    un.MaxGet = CInt(.Item("maxcount"))
                    un.MaxDeepLevel = CInt(.Item("deeplevel"))
                    sl(ij) = un
                End With
            Next

            Return sl
        Finally
            ' Release the ADO.NET objects even when Fill or row conversion fails.
            initds.Dispose()
            initdapter.Dispose()
            cmd.Dispose()
        End Try
    End Function

End Module

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值