’线程池中每个线程对应一个初始的地址,抓到有用的信息后入队
Imports System.Text.RegularExpressions
Imports System.Text
Imports System.Data.SqlClient
Imports System.Data
Imports System.Threading
Class VisitObject
Dim Qin As Queue
Dim Qfin As Queue
Dim Qdirectory As New Queue(20, 1.5)
Dim MacCount As Integer
Dim mStep As Integer
Dim mflag As Integer
Dim minitnode As UNode
Dim usefulext As String
Function GetFirstElement(ByVal q As Queue) As UNode
GetFirstElement = CType(q.Peek, UNode)
End Function
Sub New(ByVal IC As Integer, ByVal flag As Integer, ByVal initnode As UNode)
MacCount = IC
mflag = flag
minitnode = initnode
Console.WriteLine(GcountinitWorkEvernt)
Qfin = New Queue(MacCount)
Qin = New Queue(MacCount)
usefulext = initnode.Ext
mStep = initnode.InitStep
End Sub
Sub Vall(ByVal uni As Object)
SeachPage(CType(uni, UNode))
While Qfin.Count > 0
AddItemFromread(CType(Qfin.Dequeue, UNode))
End While
DWork(mflag) = True
If CheckTaskDone() Then
DoneEvent.Set()
End If
End Sub
Function SeachPage(ByVal un As UNode) As Boolean
Dim i As Integer
Dim temp As String
For i = 0 To un.MaxDeepLevel Step mStep
‘模拟得到下一页的信息
temp = Trim(un.Address).Replace("<n>", i)
'Sub New(ByVal psn As Integer, ByVal sn As Integer, ByVal lel As Integer, ByVal uri As String, ByVal rssflag As Boolean, ByVal boost As Integer, ByVal utype As String, Optional ByVal initmencode As String = "gb2312")
Dim unsilbiming As New UNode(-1, Sn, 0, temp, False, un.Rank, "Brother", un.EnCode)
unsilbiming.SearchType = un.SearchType
unsilbiming.Rank = un.Rank
unsilbiming.Ext = un.Ext
’对于不同的地址在得到信息的时候,使用不同的模板
unsilbiming.TemplateId = un.TemplateId
Qin.Enqueue(unsilbiming)
Next
While (Qin.Count <> 0)
If Qfin.Count <= MacCount Then
Try
visit(GetFirstElement(Qin), 0)
Qfin.Enqueue(Qin.Dequeue())
Catch ex As Exception
End Try
Else
Exit While
End If
End While
End Function
Function CkeckNode(ByVal adr As String, ByVal unode As UNode) As Boolean
Try
If Trim(unode.SearchType) = "W" Then
If adr.IndexOf(unode.Ext) >= 5 Then
CkeckNode = True
Exit Function
Else
CkeckNode = False
End If
Exit Function
Else
If GetRss(adr) Then
CkeckNode = True
Exit Function
Else
CkeckNode = False
Exit Function
End If
End If
Catch ex As Exception
Console.WriteLine(ex.ToString)
CkeckNode = False
Exit Function
End Try
End Function
Function CheckTaskDone() As Boolean
Dim i As Integer
For i = 0 To GcountinitWorkEvernt
If DWork(i) = False Then
CheckTaskDone = False
Exit Function
End If
Next
CheckTaskDone = True
End Function
Function SingleHref(ByVal inputString As String) As String
Dim r As Regex
Dim m As Match
inputString = inputString.Replace("'", "")
r = New Regex("href/s*=/s*(""([^""]*)""|(/S+))", _
RegexOptions.IgnoreCase Or RegexOptions.Compiled)
m = r.Match(inputString)
If m.Groups.Count >= 1 Then
SingleHref = m.Groups(1).ToString.Replace(Chr(34), "")
Else
SingleHref = ""
End If
End Function
Function SingleHref2(ByVal inputString As String) As String
Dim r As Regex
Dim m As Match
r = New Regex("href/s*=/s*('([^""]*)')", _
RegexOptions.IgnoreCase Or RegexOptions.Compiled)
m = r.Match(inputString)
If m.Groups.Count >= 1 Then
SingleHref2 = m.Groups(1).ToString.Replace(Chr(34), "")
Else
SingleHref2 = ""
End If
End Function
Function WholeUri(ByVal i As String, ByVal BasicPath As String) As String
If i.IndexOf("javascript") > -1 Then
WholeUri = ""
Exit Function
End If
If i.StartsWith("http://") Then
WholeUri = i.Replace("amp;", "")
Console.WriteLine("xxxxx " & i)
Else
If i.StartsWith("/") Then
Dim ih As Integer
ih = BasicPath.IndexOf("/", 10)
WholeUri = Mid(BasicPath, 1, ih) & i
Else
WholeUri = Directo(BasicPath) & i
End If
End If
End Function
Function Directo(ByVal BasicPath As String) As String
Dim tempds As String
Dim i As Integer
i = BasicPath.LastIndexOf("/")
If i > 6 Then
Directo = Mid(BasicPath, 1, i + 1)
Else
Directo = BasicPath
End If
End Function
Sub visit(ByRef u As UNode, ByVal trytime As Integer)
Dim Retry As Integer
Retry = 0
If Trim(u.Address) <> "" Then
Try
'Sub New(ByRef u As UNode, ByVal flag As String, ByVal Tempid As Integer, Optional ByVal mcide As String = "gb2312")
Dim mgt As New ClientGetAsync(u, u.Address, u.TemplateId)
Try
'中储信息列表,如果信息不够就回传
mgt.GMain(u.Address)
While mgt.Von = "" And Retry < 6
mgt.GMain(u.Address)
Retry = Retry + 1
Console.WriteLine("retry {0} ", Retry)
End While
Try
u.Picture = mgt.Picture
u.MetaValue = mgt.MyMetaValue
u.PagetitleValue = mgt.MyPageTile
u.Price = mgt.Price
u.Boost = mgt.Boost
u.Addtion = mgt.Addtion
u.Content = mgt.CText
Catch ex As Exception
Console.Write(ex.ToString)
End Try
Catch ex As Exception
Console.Write(ex.ToString)
Qerror.Enqueue(u)
End Try
Dim r As Regex
Dim m As MatchCollection
Dim hrefs As New StringBuilder("")
r = New Regex("<a[^<>]+?/>((.|/n)*?)<//a>", _
RegexOptions.IgnoreCase Or RegexOptions.Compiled)
If Trim(mgt.Von) <> "" Then
m = r.Matches(mgt.Von)
Dim i As Integer
For i = 0 To m.Count - 1
'Sub New(ByVal psn As Integer, ByVal sn As Integer, ByVal lel As Integer, ByVal uri As String, ByVal rssflag As Boolean, ByVal boost As Integer, ByVal utype As String, Optional ByVal initmencode As String = "gb2312")
Interlocked.Increment(Sn)
Dim usearch As UNode
If CkeckNode(WholeUri(SingleHref(m.Item(i).Value), u.Address), u) Then
usearch = New UNode(u.Sn, Sn, u.Level + 1, WholeUri(SingleHref(m.Item(i).Value), u.Address), GetRss(WholeUri(SingleHref(m.Item(i).Value), u.Address)), u.Rank, u.IndexPath, u.EnCode)
usearch.SearchType = u.SearchType
usearch.Ext = u.Ext
usearch.TemplateId = u.TemplateId
usearch.Tiltle = stripHTML(m.Item(i).Groups(1).ToString)
Qin.Enqueue(usearch)
End If
Next
Else
Qerror.Enqueue(u)
End If
Catch ex As Exception
Console.Write(ex.ToString)
End Try
End If
End Sub
Function Gey(ByVal mv As String) As String
Dim i As Integer
i = mv.LastIndexOf("/")
Gey = Mid(mv, i + 2, mv.Length - i - 1)
End Function
Function GetRss(ByVal ms As String) As Boolean
Try
If ms.EndsWith(".xml") Then
GetRss = True
Exit Function
End If
Dim rgx As New Regex("((/w)*)(rss|feed)(/.)(asp|aspx|htm|html|shtml|php|jsp)")
If rgx.IsMatch(Gey(ms)) Then
GetRss = True
Else
GetRss = False
End If
Catch ex As Exception
End Try
End Function
End Class