菲度垂直搜索引擎 代码注释 4

本文介绍了一个基于线程池的任务调度系统实现,重点在于如何利用队列管理和多线程来高效抓取网页信息,并解析有用的数据。该系统通过初始化工作线程,将待处理的URL入队,并在完成抓取后将结果放入另一个队列,以便进一步处理。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

’线程池中每个线程对应一个初始的地址,抓到有用的信息后入队

Imports System.Text.RegularExpressions
Imports System.Text
Imports System.Data.SqlClient
Imports System.Data
Imports System.Threading
Class VisitObject
    Dim Qin As Queue
    Dim Qfin As Queue
    Dim Qdirectory As New Queue(20, 1.5)
    Dim MacCount As Integer
    Dim mStep As Integer
    Dim mflag As Integer
    Dim minitnode As UNode
    Dim usefulext As String
    Function GetFirstElement(ByVal q As Queue) As UNode
        GetFirstElement = CType(q.Peek, UNode)
    End Function

    Sub New(ByVal IC As Integer, ByVal flag As Integer, ByVal initnode As UNode)
        MacCount = IC
        mflag = flag
        minitnode = initnode
        Console.WriteLine(GcountinitWorkEvernt)
        Qfin = New Queue(MacCount)
        Qin = New Queue(MacCount)
        usefulext = initnode.Ext
        mStep = initnode.InitStep
    End Sub
    Sub Vall(ByVal uni As Object)
        SeachPage(CType(uni, UNode))
       

        While Qfin.Count > 0
            AddItemFromread(CType(Qfin.Dequeue, UNode))
        End While

        DWork(mflag) = True
        If CheckTaskDone() Then
            DoneEvent.Set()
        End If
    End Sub

    Function SeachPage(ByVal un As UNode) As Boolean
        Dim i As Integer
        Dim temp As String
        For i = 0 To un.MaxDeepLevel Step mStep

‘模拟得到下一页的信息
            temp = Trim(un.Address).Replace("<n>", i)
            'Sub New(ByVal psn As Integer, ByVal sn As Integer, ByVal lel As Integer, ByVal uri As String, ByVal rssflag As Boolean, ByVal boost As Integer, ByVal utype As String, Optional ByVal initmencode As String = "gb2312")
            Dim unsilbiming As New UNode(-1, Sn, 0, temp, False, un.Rank, "Brother", un.EnCode)
            unsilbiming.SearchType = un.SearchType
            unsilbiming.Rank = un.Rank
            unsilbiming.Ext = un.Ext

’对于不同的地址在得到信息的时候,使用不同的模板
            unsilbiming.TemplateId = un.TemplateId
            Qin.Enqueue(unsilbiming)
        Next

        While (Qin.Count <> 0)
            If Qfin.Count <= MacCount Then
                Try
                    visit(GetFirstElement(Qin), 0)
                    Qfin.Enqueue(Qin.Dequeue())
                Catch ex As Exception

                End Try

            Else
                Exit While
            End If
        End While

    End Function

    Function CkeckNode(ByVal adr As String, ByVal unode As UNode) As Boolean
        Try

            If Trim(unode.SearchType) = "W" Then
                If adr.IndexOf(unode.Ext) >= 5 Then
                    CkeckNode = True
                    Exit Function
                Else
                    CkeckNode = False

                End If
                Exit Function
            Else
                If GetRss(adr) Then
                    CkeckNode = True
                    Exit Function
                Else
                    CkeckNode = False
                    Exit Function
                End If

            End If

        Catch ex As Exception
            Console.WriteLine(ex.ToString)
            CkeckNode = False
            Exit Function
        End Try

    End Function


    Function CheckTaskDone() As Boolean
        Dim i As Integer
        For i = 0 To GcountinitWorkEvernt
            If DWork(i) = False Then
                CheckTaskDone = False
                Exit Function
            End If
        Next
        CheckTaskDone = True
    End Function

    Function SingleHref(ByVal inputString As String) As String
        Dim r As Regex
        Dim m As Match
        inputString = inputString.Replace("'", "")
        r = New Regex("href/s*=/s*(""([^""]*)""|(/S+))", _
            RegexOptions.IgnoreCase Or RegexOptions.Compiled)
        m = r.Match(inputString)
        If m.Groups.Count >= 1 Then
            SingleHref = m.Groups(1).ToString.Replace(Chr(34), "")
        Else
            SingleHref = ""
        End If
    End Function
    Function SingleHref2(ByVal inputString As String) As String
        Dim r As Regex
        Dim m As Match
        r = New Regex("href/s*=/s*('([^""]*)')", _
            RegexOptions.IgnoreCase Or RegexOptions.Compiled)
        m = r.Match(inputString)
        If m.Groups.Count >= 1 Then
            SingleHref2 = m.Groups(1).ToString.Replace(Chr(34), "")
        Else
            SingleHref2 = ""
        End If
    End Function
    Function WholeUri(ByVal i As String, ByVal BasicPath As String) As String
        If i.IndexOf("javascript") > -1 Then
            WholeUri = ""
            Exit Function
        End If
        If i.StartsWith("http://") Then
            WholeUri = i.Replace("amp;", "")
            Console.WriteLine("xxxxx " & i)
        Else
            If i.StartsWith("/") Then
                Dim ih As Integer
                ih = BasicPath.IndexOf("/", 10)
                WholeUri = Mid(BasicPath, 1, ih) & i
            Else
                WholeUri = Directo(BasicPath) & i
            End If

        End If
    End Function

    Function Directo(ByVal BasicPath As String) As String
        Dim tempds As String
        Dim i As Integer
        i = BasicPath.LastIndexOf("/")
        If i > 6 Then
            Directo = Mid(BasicPath, 1, i + 1)
        Else
            Directo = BasicPath
        End If
    End Function

    Sub visit(ByRef u As UNode, ByVal trytime As Integer)
        Dim Retry As Integer
        Retry = 0
        If Trim(u.Address) <> "" Then
            Try
                'Sub New(ByRef u As UNode, ByVal flag As String, ByVal Tempid As Integer, Optional ByVal mcide As String = "gb2312")
                Dim mgt As New ClientGetAsync(u, u.Address, u.TemplateId)
                Try
                   
                    '中储信息列表,如果信息不够就回传
                    mgt.GMain(u.Address)
                    While mgt.Von = "" And Retry < 6
                        mgt.GMain(u.Address)
                        Retry = Retry + 1
                        Console.WriteLine("retry {0} ", Retry)
                    End While


                    Try
                        u.Picture = mgt.Picture
                        u.MetaValue = mgt.MyMetaValue
                        u.PagetitleValue = mgt.MyPageTile
                        u.Price = mgt.Price
                        u.Boost = mgt.Boost
                        u.Addtion = mgt.Addtion
                        u.Content = mgt.CText

                    Catch ex As Exception
                        Console.Write(ex.ToString)
                    End Try

                Catch ex As Exception
                    Console.Write(ex.ToString)
                    Qerror.Enqueue(u)
                End Try

                Dim r As Regex
                Dim m As MatchCollection
                Dim hrefs As New StringBuilder("")

                r = New Regex("<a[^<>]+?/>((.|/n)*?)<//a>", _
                    RegexOptions.IgnoreCase Or RegexOptions.Compiled)
                If Trim(mgt.Von) <> "" Then

                    m = r.Matches(mgt.Von)
                    Dim i As Integer
                    For i = 0 To m.Count - 1

                        'Sub New(ByVal psn As Integer, ByVal sn As Integer, ByVal lel As Integer, ByVal uri As String, ByVal rssflag As Boolean, ByVal boost As Integer, ByVal utype As String, Optional ByVal initmencode As String = "gb2312")

                        Interlocked.Increment(Sn)
                        Dim usearch As UNode
                        If CkeckNode(WholeUri(SingleHref(m.Item(i).Value), u.Address), u) Then
                            usearch = New UNode(u.Sn, Sn, u.Level + 1, WholeUri(SingleHref(m.Item(i).Value), u.Address), GetRss(WholeUri(SingleHref(m.Item(i).Value), u.Address)), u.Rank, u.IndexPath, u.EnCode)

                            usearch.SearchType = u.SearchType
                            usearch.Ext = u.Ext
                            usearch.TemplateId = u.TemplateId
                            usearch.Tiltle = stripHTML(m.Item(i).Groups(1).ToString)
                            Qin.Enqueue(usearch)


                        End If
                    Next
                Else

                    Qerror.Enqueue(u)

                End If

            Catch ex As Exception
                Console.Write(ex.ToString)
            End Try

        End If

    End Sub

    Function Gey(ByVal mv As String) As String
        Dim i As Integer
        i = mv.LastIndexOf("/")
        Gey = Mid(mv, i + 2, mv.Length - i - 1)
    End Function
    Function GetRss(ByVal ms As String) As Boolean
        Try
            If ms.EndsWith(".xml") Then
                GetRss = True
                Exit Function
            End If
            Dim rgx As New Regex("((/w)*)(rss|feed)(/.)(asp|aspx|htm|html|shtml|php|jsp)")
            If rgx.IsMatch(Gey(ms)) Then
                GetRss = True
            Else
                GetRss = False
            End If

        Catch ex As Exception

        End Try

    End Function
End Class

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值