自己写的采集

本文介绍了一种使用C#实现的网页资源抓取及下载的方法。通过输入目标URL,程序能够解析网页内容,提取所需的链接,并下载指定类型的文件如Flash资源等。文章详细展示了如何构造请求、处理响应及文件保存的全过程。
using System;
using System.Data;
using System.Configuration;
using System.Collections;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;

using System.IO;
using System.Net;
using System.Text;
using Dang.Utils;

namespace MyTest.CaiJi
{
    
public partial class caiji01 : System.Web.UI.Page
    
{
        
// NOTE: every field below is static, so its value is shared by all requests
// handled by this application instance instead of being kept per page visit.

// reAllListHtml: HTML of the list page fetched in Button1_Click.
// reAllHtml:     HTML of the page most recently fetched by getflash.
// htmlAddress:   list-page URL taken from TextBox1 in Button1_Click.
// flashAddress:  file address most recently extracted (and resolved) by getflash.
public static string reAllListHtml,reAllHtml,htmlAddress,flashAddress;
        
// RelativeWay: base address Tohttp prepends to relative links.
// RootWay:     base address Tohttp prepends to links that start with "/".
// htmlListLu:  not referenced anywhere in the visible code.
public static string RelativeWay, RootWay, htmlListLu;
        
public static ArrayList alist;// list of addresses (assigned in Button3_Click)
        // Reset to 0 in Page_Load and incremented once per getflash call.
        public static int getSavedNum;
        
/// <summary>
/// Page load handler: resets the shared counter of processed addresses.
/// </summary>
/// <param name="sender">Source of the event.</param>
/// <param name="e">Event data (unused).</param>
protected void Page_Load(object sender, EventArgs e)
{
    // There is no IsPostBack check, so the counter restarts on every load.
    caiji01.getSavedNum = 0;
}


        
/// <summary>
/// Step 1: reads the list-page URL from TextBox1, derives the base addresses
/// used to resolve scraped links, downloads the list page and shows the
/// panel for the next step.
/// </summary>
/// <param name="sender">Source of the event.</param>
/// <param name="e">Event data (unused).</param>
protected void Button1_Click(object sender, EventArgs e)
{
    // The address is deliberately NOT lower-cased: URL paths and query
    // strings are case-sensitive, so lower-casing can point at another page.
    string address = TextBox1.Text.Trim();

    // Parse with System.Uri instead of slicing at fixed offsets, which threw
    // for "http://host" (no path) and produced a wrong root for https URLs.
    Uri pageUri;
    bool isWebAddress = Uri.TryCreate(address, UriKind.Absolute, out pageUri)
        && (pageUri.Scheme == Uri.UriSchemeHttp || pageUri.Scheme == Uri.UriSchemeHttps);
    if (!isWebAddress)
    {
        Label1.Text = "无效的地址!<br />";
        return;
    }

    htmlAddress = address;

    // Directory of the page, e.g. "http://host/a/" for "http://host/a/b.html".
    RelativeWay = new Uri(pageUri, ".").AbsoluteUri;
    // Scheme and authority, e.g. "http://host/".
    RootWay = pageUri.GetLeftPart(UriPartial.Authority) + "/";

    reAllListHtml = GetPageHTML(htmlAddress);
    Panel1.Visible = true;
}

        

        
/// <summary>
/// Step 2: extracts every address enclosed by the loop markers typed into
/// TextBox2 (start) and TextBox3 (end) from the downloaded list page, then
/// switches from the marker panel to the download panel.
/// </summary>
/// <param name="sender">Source of the event.</param>
/// <param name="e">Event data (unused).</param>
protected void Button3_Click(object sender, EventArgs e)
{
    alist = Get_fileAddresss(TextBox2.Text.Trim(), TextBox3.Text.Trim(), reAllListHtml);

    // Hide the step-2 inputs and reveal the step-3 inputs.
    Panel1.Visible = false;
    StringDo.Visible = true;
}


        
/// <summary>
/// Step 3 (button): runs getflash for every address collected in alist.
/// </summary>
/// <param name="sender">Source of the event.</param>
/// <param name="e">Event data (unused).</param>
protected void Button2_Click(object sender, EventArgs e)
{
    // alist is a static field; it is null until Button3_Click has run and
    // after an application restart, so take a snapshot and guard against
    // null instead of dereferencing it blindly.
    ArrayList addresses = alist;
    if (addresses == null)
    {
        return;
    }

    foreach (object address in addresses)
    {
        getflash(address.ToString());
    }
}


        
/// <summary>
/// Step 3 (link button "Go"): runs getflash for every address collected in alist.
/// </summary>
/// <param name="sender">Source of the event.</param>
/// <param name="e">Event data (unused).</param>
protected void Go_Click(object sender, EventArgs e)
{
    // alist is a static field; it is null until Button3_Click has run and
    // after an application restart, so take a snapshot and guard against
    // null instead of dereferencing it blindly.
    ArrayList addresses = alist;
    if (addresses == null)
    {
        return;
    }

    foreach (object address in addresses)
    {
        getflash(address.ToString());
    }
}



        
/// <summary>
/// Downloads the page at <paramref name="url"/>, extracts the file address
/// enclosed by the StartUrlString / EndUrlString markers, saves that file and
/// appends a success or failure line for the page to Label1.
/// </summary>
/// <param name="url">Address of the page that contains the file link.</param>
public void getflash(string url)
{
    reAllHtml = GetPageHTML(url);

    string startMarker = StartUrlString.Text.Trim();
    string endMarker = EndUrlString.Text.Trim();
    flashAddress = Get_fileAddress(startMarker, endMarker, reAllHtml);

    bool isSave = false;
    // An empty match is treated like "not found"; resolving it would only
    // yield the base address and trigger a pointless download.
    if (!string.IsNullOrEmpty(flashAddress))
    {
        flashAddress = Tohttp(flashAddress);
        isSave = SaveFileFromUrl(flashAddress);
    }

    // url originates from scraped remote HTML, so encode it before it is
    // rendered as markup inside the label.
    string shownUrl = HttpUtility.HtmlEncode(url);
    if (isSave)
    {
        Label1.Text += shownUrl + "成功!<br />";
    }
    else
    {
        Label1.Text += shownUrl + "失败!<br />";
    }

    // Counts processed pages, successful or not.
    getSavedNum += 1;
}



        
/// <summary>
/// Turns a link found in a scraped page into an absolute address using the
/// RootWay and RelativeWay bases.
/// </summary>
/// <param name="str">Absolute, protocol-relative, root-relative or relative link.</param>
/// <returns>
/// The link unchanged when it is null, empty or already an absolute
/// http/https address; otherwise the link prefixed with the matching base.
/// </returns>
public string Tohttp(string str)
{
    if (string.IsNullOrEmpty(str))
    {
        return str;
    }

    // Already absolute. https and upper-case schemes used to fall through
    // and receive a second prefix.
    if (str.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
        || str.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
    {
        return str;
    }

    string root = RootWay ?? string.Empty;

    // Protocol-relative link ("//host/path"): reuse the scheme of the root.
    if (str.StartsWith("//", StringComparison.Ordinal))
    {
        int schemeEnd = root.IndexOf(':');
        return schemeEnd > 0 ? root.Substring(0, schemeEnd + 1) + str : str;
    }

    // Root-relative link: drop one of the two slashes so the result is
    // "http://host/path" rather than "http://host//path".
    if (str.StartsWith("/", StringComparison.Ordinal))
    {
        return root.EndsWith("/", StringComparison.Ordinal)
            ? root + str.Substring(1)
            : root + str;
    }

    return RelativeWay + str;
}

        
/// <summary>
/// Returns the text located between the first occurrence of
/// <paramref name="startstr"/> and the next occurrence of
/// <paramref name="endstr"/> after it.
/// </summary>
/// <param name="startstr">Marker that precedes the wanted text.</param>
/// <param name="endstr">Marker that follows the wanted text.</param>
/// <param name="strResult">Text to search, typically the HTML of a page.</param>
/// <returns>The enclosed text, or null when any argument is null or a marker is missing.</returns>
public string Get_fileAddress(string startstr, string endstr, string strResult)
{
    if (strResult == null || startstr == null || endstr == null)
    {
        return null;
    }

    // Markers are markup fragments, so compare them ordinally.
    int start = strResult.IndexOf(startstr, StringComparison.Ordinal);
    if (start == -1)
    {
        return null;
    }

    // Look for the end marker only AFTER the start marker. Searching from
    // index 0 picked up earlier occurrences (for example a closing quote) and
    // made Substring throw on a negative length.
    int contentStart = start + startstr.Length;
    int stop = strResult.IndexOf(endstr, contentStart, StringComparison.Ordinal);
    if (stop == -1)
    {
        return null;
    }

    return strResult.Substring(contentStart, stop - contentStart);
}


        
/// <summary>
/// Collects every piece of text enclosed by <paramref name="startstr"/> and
/// <paramref name="endstr"/> in <paramref name="strResult"/>, converting each
/// one to an absolute address with Tohttp.
/// </summary>
/// <param name="startstr">Marker that precedes each address.</param>
/// <param name="endstr">Marker that follows each address.</param>
/// <param name="strResult">Text to search, typically the HTML of the list page.</param>
/// <returns>
/// The addresses in document order; an empty list when the text is null or
/// either marker is null or empty.
/// </returns>
public ArrayList Get_fileAddresss(string startstr, string endstr, string strResult)
{
    ArrayList list = new ArrayList();

    // An empty start marker matches at every position without consuming any
    // text, which made the scan loop forever; an empty end marker only ever
    // yields empty matches. Both are treated as "nothing to extract".
    if (strResult == null || string.IsNullOrEmpty(startstr) || string.IsNullOrEmpty(endstr))
    {
        return list;
    }

    // Walk the text by index rather than re-copying the remainder each round.
    int position = 0;
    while (true)
    {
        int start = strResult.IndexOf(startstr, position, StringComparison.Ordinal);
        if (start == -1)
        {
            break;
        }

        int contentStart = start + startstr.Length;
        int stop = strResult.IndexOf(endstr, contentStart, StringComparison.Ordinal);
        if (stop == -1)
        {
            break;
        }

        list.Add(Tohttp(strResult.Substring(contentStart, stop - contentStart)));

        // Resume AT the end marker (not after it), as the scan always did, so
        // an end marker that also begins the next start marker still matches.
        position = stop;
    }

    return list;
}



        
/// <summary>
/// Downloads the file at the given address to the local "flashsrc" directory.
/// </summary>
/// <param name="Url">Absolute address of the file.</param>
/// <returns>
/// true when the file was saved; false when the address is invalid, has no
/// file extension in its path, the server answered with a text/* content
/// type, or any error occurred while downloading or saving.
/// </returns>
public bool SaveFileFromUrl(string Url)
{
    Uri fileUri;
    if (string.IsNullOrEmpty(Url) || !Uri.TryCreate(Url, UriKind.Absolute, out fileUri))
    {
        return false;
    }

    WebResponse response = null;
    try
    {
        // Take the extension from the path component only. Slicing the raw
        // URL at its last "." also captured query strings ("swf?x=1") or, for
        // extension-less paths, part of the host name ("com/file").
        string fileExt = Path.GetExtension(fileUri.AbsolutePath).TrimStart('.');
        if (fileExt.Length == 0)
        {
            return false;
        }

        // The cast restricts downloads to http/https; other schemes raise
        // InvalidCastException, which is reported as a failure below.
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(fileUri);
        response = request.GetResponse();

        // Text responses (typically HTML error pages) are not files to keep.
        string contentType = response.ContentType ?? string.Empty;
        if (contentType.StartsWith("text/", StringComparison.OrdinalIgnoreCase))
        {
            return false;
        }

        return SaveBinaryFile("flashsrc", fileExt, response);
    }
    catch (Exception err)
    {
        // Best effort by design: callers only need success/failure, but the
        // reason is traced instead of being silently dropped.
        System.Diagnostics.Trace.TraceWarning("SaveFileFromUrl failed for " + Url + ": " + err.Message);
        return false;
    }
    finally
    {
        // Closing the response also releases its stream and the connection.
        if (response != null)
        {
            response.Close();
        }
    }
}

       
        
/// <summary>
/// Writes the body of a network response to a new, uniquely named file.
/// </summary>
/// <param name="fileDirectory">Site-relative directory to save into, e.g. "flashsrc".</param>
/// <param name="fileNameExt">File extension without the leading dot.</param>
/// <param name="response">Network response whose body is saved.</param>
/// <returns>true when the file was written completely; otherwise false.</returns>
private bool SaveBinaryFile(string fileDirectory, string fileNameExt, WebResponse response)
{
    if (response == null)
    {
        return false;
    }

    // The extension is derived from a remote address. Reject separators and
    // other characters that are illegal in a file name so that it cannot
    // redirect the write outside the target directory.
    string extension = fileNameExt ?? string.Empty;
    if (extension.IndexOfAny(Path.GetInvalidFileNameChars()) >= 0)
    {
        return false;
    }

    try
    {
        string dirpath = Server.MapPath("/" + fileDirectory + "/");
        // CreateDirectory does nothing when the directory already exists.
        Directory.CreateDirectory(dirpath);

        string fileName = Path.Combine(dirpath, GetUniquelyString() + "." + extension);

        // using guarantees both streams are closed even when a read or a
        // write fails half-way. File.Create overwrites an existing file.
        using (Stream inStream = response.GetResponseStream())
        using (Stream outStream = File.Create(fileName))
        {
            byte[] buffer = new byte[8192];
            int read;
            while ((read = inStream.Read(buffer, 0, buffer.Length)) > 0)
            {
                outStream.Write(buffer, 0, read);
            }
        }

        return true;
    }
    catch (Exception err)
    {
        // Best effort by design: report failure to the caller and trace why.
        System.Diagnostics.Trace.TraceWarning("SaveBinaryFile failed: " + err.Message);
        return false;
    }
}


        
/// <summary>
/// Generator shared by all GetUniquelyString calls. Creating a new Random per
/// call can hand out the same value to calls made within one clock tick.
/// </summary>
private static readonly Random s_uniqueNameRandom = new Random();

/// <summary>
/// Builds a file name that is unlikely to repeat: five random letters, the
/// current day, month, hour, minute, second and millisecond, and a random
/// number below 1000, with every "1" replaced by "q".
/// </summary>
/// <returns>The generated name, without extension.</returns>
public static string GetUniquelyString()
{
    const int RANDOM_MAX_VALUE = 1000;

    DateTime dt = DateTime.Now;

    int rndNumber;
    lock (s_uniqueNameRandom) // Random is not thread-safe
    {
        rndNumber = s_uniqueNameRandom.Next(RANDOM_MAX_VALUE);
    }

    string strTemp = YieldRandNum(5)
        + PrefixSingleDigit(dt.Day, "a")
        + PrefixSingleDigit(dt.Month, "i")
        + PrefixSingleDigit(dt.Hour, "n")
        + PrefixSingleDigit(dt.Minute, "j")
        + PrefixSingleDigit(dt.Second, "n")
        + dt.Millisecond.ToString()
        + rndNumber.ToString();

    return strTemp.Replace("1", "q");
}

/// <summary>
/// Formats a date part: values above 9 are returned as they are, smaller
/// values get the given letter put in front.
/// </summary>
private static string PrefixSingleDigit(int value, string prefix)
{
    return value > 9 ? value.ToString() : prefix + value.ToString();
}


        
/// <summary>
/// Generator shared by all YieldRandNum calls. Creating a new Random per call
/// can produce identical strings for calls made within one clock tick.
/// </summary>
private static readonly Random s_letterRandom = new Random();

/// <summary>Letters that may appear in the generated string.</summary>
private static readonly char[] s_letterSeed =
{
    'A', 'b', 'B', 'c', 'C', 'd', 'D', 'e', 'E', 'f', 'F', 'G', 'h', 'H', 'i',
    'j', 'J', 'k', 'K', 'L', 'm', 'M', 'n', 'N', 'p', 'P', 'q', 'Q', 'R', 's',
    'S', 't', 'T', 'u', 'U', 'v', 'V', 'w', 'W', 'x', 'X', 'y', 'Y', 'z', 'Z'
};

/// <summary>
/// Produces a string of random letters.
/// </summary>
/// <param name="d">Number of letters to generate.</param>
/// <returns>A string of <paramref name="d"/> letters; empty when d is zero or negative.</returns>
public static string YieldRandNum(int d)
{
    if (d <= 0)
    {
        return string.Empty;
    }

    // Size the builder for the requested length instead of a fixed 4.
    StringBuilder sb = new StringBuilder(d);
    lock (s_letterRandom) // Random is not thread-safe
    {
        for (int i = 0; i < d; i++)
        {
            sb.Append(s_letterSeed[s_letterRandom.Next(s_letterSeed.Length)]);
        }
    }

    return sb.ToString();
}


        
/// <summary>
/// Downloads the page at the given address and returns its HTML.
/// </summary>
/// <param name="url">Address of the page.</param>
/// <returns>
/// The page text, or an empty string when the address is null or empty, the
/// response is not an HTTP response, or the request fails for any reason.
/// </returns>
public static string GetPageHTML(string url)
{
    if (string.IsNullOrEmpty(url))
    {
        return string.Empty;
    }

    try
    {
        WebRequest request = WebRequest.Create(url);
        // NOTE(review): this sends the server process's default credentials
        // to whatever host the address names; confirm that this is intended.
        request.Credentials = CredentialCache.DefaultCredentials;
        request.Timeout = 2000; // milliseconds

        // using closes the response, stream and reader on every path; the
        // response itself was previously never closed.
        using (WebResponse rawResponse = request.GetResponse())
        {
            HttpWebResponse response = rawResponse as HttpWebResponse;
            if (response == null)
            {
                // Only HTTP(S) responses are accepted.
                return string.Empty;
            }

            using (Stream stream = response.GetResponseStream())
            using (StreamReader sr = new StreamReader(stream, ResolvePageEncoding(response)))
            {
                return sr.ReadToEnd();
            }
        }
    }
    catch (Exception err)
    {
        // Best effort by design: callers treat an empty string as "no page".
        System.Diagnostics.Trace.TraceWarning("GetPageHTML failed for " + url + ": " + err.Message);
        return string.Empty;
    }
}

/// <summary>
/// Chooses the encoding for decoding a page: the charset named in the
/// Content-Type header when there is one, otherwise the system default.
/// </summary>
private static Encoding ResolvePageEncoding(HttpWebResponse response)
{
    string contentType = response.ContentType;
    bool declaresCharset = !string.IsNullOrEmpty(contentType)
        && contentType.IndexOf("charset=", StringComparison.OrdinalIgnoreCase) >= 0;

    if (declaresCharset)
    {
        try
        {
            return Encoding.GetEncoding(response.CharacterSet);
        }
        catch (ArgumentException)
        {
            // Unknown or malformed charset name: use the default below.
        }
    }

    return Encoding.Default;
}


       
        

       

    }


}



<%@ Page Language="C#" AutoEventWireup="true" CodeBehind="caiji01.aspx.cs" Inherits="MyTest.CaiJi.caiji01" validateRequest="false" %>

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html xmlns="http://www.w3.org/1999/xhtml" >
<head runat="server">
    
<title>无标题页</title>
    
<script type="text/javascript">
        
var xmlHttp;
        
var key;
        
/**
 * Creates the request object used for progress polling and stores it in the
 * global xmlHttp variable.
 */
function createXMLHttpRequest() {
    // Prefer the native object; fall back to ActiveX only for browsers that
    // lack it. Checking ActiveX first forced the legacy path wherever both exist.
    if (window.XMLHttpRequest) {
        xmlHttp = new XMLHttpRequest();
    } else if (window.ActiveXObject) {
        xmlHttp = new ActiveXObject("Microsoft.XMLHTTP");
    }
}

        
        
/**
 * Sends one asynchronous POST to Loading.aspx; the response is handled by
 * pollCallback.
 */
function pollServer() {
    createXMLHttpRequest();
    // Declared locally so it no longer leaks an implicit global.
    var data = "load=" + "";
    var url = "Loading.aspx";
    xmlHttp.open("POST", url, true);
    xmlHttp.setRequestHeader("Content-Type", "application/x-www-form-urlencoded");
    xmlHttp.onreadystatechange = pollCallback;
    xmlHttp.send(data);
}


        
/**
 * Handles the response of a progress poll: updates the progress bar and its
 * label with the returned value, then either schedules the next poll or
 * shows the completion message.
 */
function pollCallback() {
    // Act only on a finished (readyState 4), successful (HTTP 200) response.
    if (xmlHttp.readyState == 4) {
        if (xmlHttp.status == 200) {
            // The response body is used directly as the percentage value.
            var percent_complete = xmlHttp.responseText;
            var progress = document.getElementById("progress");
            var progressPersent = document.getElementById("progressPersent");
            progress.style.width = percent_complete + "%";
            progressPersent.innerHTML = percent_complete + "%";
            if (percent_complete < 100) {
                // Pass the function itself rather than a string to evaluate.
                setTimeout(pollServer, 2000);
            } else {
                document.getElementById("complete").innerHTML = "已生成完成!";
            }
        }
    }
}
 
        
           
 
/**
 * Resets the progress display: blanks the percentage label and shows the
 * "start" message.
 */
function clearBar() {
    var barElement = document.getElementById("progressBar");
    var percentLabel = document.getElementById("progressPersent");
    var statusCell = document.getElementById("complete");

    // Making the bar visible is left disabled, as before:
    // barElement.style.visibility = "visible"
    percentLabel.innerHTML = " ";
    statusCell.innerHTML = "开始生成!";
}

 
/**
 * Starts progress polling and then posts the form back as if the "Go" link
 * button had been clicked.
 * Always returns false.
 */
function next() {
    pollServer();
    __doPostBack('Go', '');
    return false;
}

    
</script>


</head>
<body>
    
<form id="form1" runat="server">
    
<div>结果:<asp:Label
            
ID="Label1" runat="server" Text=""></asp:Label><br />
            
        获取列表页面:
<asp:TextBox ID="TextBox1" runat="server"></asp:TextBox>
        
<asp:Button ID="Button1" runat="server" Text="下一步" OnClick="Button1_Click" />
        
<asp:Panel runat="server" ID="Panel1" Visible="false">
        开始循环标记:
<asp:TextBox Text="" TextMode="MultiLine" runat="server" ID="TextBox2"></asp:TextBox><br />
        结束循环结束:
<asp:TextBox Text="" TextMode="MultiLine" runat="server" ID="TextBox3"></asp:TextBox><br />
        
<asp:Button ID="Button3" runat="server" Text="下一步" OnClick="Button3_Click"  />
        
</asp:Panel>
        
<asp:Panel runat="server" ID="StringDo" Visible="false">
        开始标记:
<asp:TextBox Text="" TextMode="MultiLine" runat="server" ID="StartUrlString"></asp:TextBox><br />
        结束标记:
<asp:TextBox Text="" TextMode="MultiLine" runat="server" ID="EndUrlString"></asp:TextBox><br />
        
<a href="javascript:next();">下一步</a><asp:Button ID="Button2" runat="server" Text="下一步" OnClick="Button2_Click" />
        
<asp:LinkButton ID="Go" runat="server" Text="生成" OnClick="Go_Click"></asp:LinkButton>
        
</asp:Panel>
    
</div>
    
<div id="progressBar" style="padding:0px;border:solid black 0px;visibility:hidden">
<table width="300" border="0" cellspacing="0" cellpadding="0"  align="center" >
  
<tr>
    
<td align="center" id="progressPersent" >0%</td>
  
</tr>
  
<tr >
    
<td>
 
<table width="100%" border="1" cellspacing="0" cellpadding="0" bordercolor="#000000">
  
<tr>
    
<td>
 
<table width="1%" border="0" cellspacing="0" cellpadding="0" bgcolor="#FF0000" id="progress" height="20">
              
<tr>
                
<td> </td>
              
</tr>
            
</table></td>
  
</tr>
</table>
</td>
  
</tr>
  
<tr>
    
<td align="center" id="complete"></td>
  
</tr>
</table>
</div>

    
</form>
    
<script type="text/javascript">
    clearBar();
    
</script>
</body>
</html>

 
评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值