According to the HTTP protocol, the client sends a request message and the server, after receiving it, returns a response message; that response message carries the information we want. (Please look up the relevant documentation yourself for the details of the protocol.)
So all we need to do is use a packet-analysis tool to inspect the messages a browser exchanges with the site, then have a program construct and send the same request and read the response. That is enough to scrape a site's data automatically in the background.
.NET provides the corresponding classes HttpWebRequest and HttpWebResponse; with these two classes the whole program can be written. Simple enough.
Below is a demo program I wrote that automatically downloads images from a website. It behaves like a worm, crawling over the site and devouring data, hence the name "crawler". The code is fairly rough and is intended only for learning and understanding.
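Before the full wrapper, here is a minimal, self-contained sketch of fetching a page with these two classes directly; the URL is just a placeholder, not one of the addresses used later.
using System;
using System.IO;
using System.Net;
using System.Text;
class MinimalGet
{
    static void Main()
    {
        // Send a GET request and read the whole response body as text
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create("http://www.example.com/");
        request.Method = "GET";
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.UTF8))
        {
            string html = reader.ReadToEnd();
            Console.WriteLine(html.Substring(0, Math.Min(200, html.Length)));
        }
    }
}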
My own wrapper class for sending and receiving HTTP messages:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net;
using System.IO;
using System.Web;
namespace HttpWeb
{
    public class Proxy
    {
        private CookieContainer m_cookieContainer = null;
        private HttpWebRequest request = null;
        private HttpWebResponse response = null;

        // Holds the most recent response message
        public HttpWebResponse Response
        {
            get { return response; }
        }

        public Proxy()
        {
            m_cookieContainer = new CookieContainer();
        }

        // POST a message to val_url; val_PostStr must already be URL-encoded
        public void PostHttp(Uri val_url, string val_PostStr)
        {
            request = (HttpWebRequest)WebRequest.Create(val_url);
            request.ContentType = "application/x-www-form-urlencoded";
            request.Method = "POST";
            byte[] byteRequest = Encoding.Default.GetBytes(val_PostStr);
            request.ContentLength = byteRequest.Length;
            request.CookieContainer = m_cookieContainer;
            Stream stream = request.GetRequestStream();
            stream.Write(byteRequest, 0, byteRequest.Length);
            stream.Close();
            response = (HttpWebResponse)request.GetResponse();
            m_cookieContainer.Add(response.Cookies);
        }

        // POST a message to val_url; val_PostStr is not yet encoded, enc is the encoding to use
        public void PostHttp(Uri val_url, string val_PostStr, Encoding enc)
        {
            request = (HttpWebRequest)WebRequest.Create(val_url);
            request.ContentType = "application/x-www-form-urlencoded";
            request.Method = "POST";
            string encPostStr = EncodeStr(val_PostStr, enc);
            byte[] byteRequest = Encoding.Default.GetBytes(encPostStr);
            request.ContentLength = byteRequest.Length;
            request.CookieContainer = m_cookieContainer;
            Stream stream = request.GetRequestStream();
            stream.Write(byteRequest, 0, byteRequest.Length);
            stream.Close();
            response = (HttpWebResponse)request.GetResponse();
            m_cookieContainer.Add(response.Cookies);
        }

        // URL-encode each value of a "key1=val1&key2=val2" string with the given encoding
        private string EncodeStr(string val_PostStr, Encoding enc)
        {
            string retn = string.Empty;
            string[] strArr = val_PostStr.Split('&');
            foreach (string str in strArr)
            {
                string[] strArr1 = str.Split('=');
                retn += strArr1[0] + "=" + HttpUtility.UrlEncode(strArr1[1], enc) + "&";
            }
            retn = retn.Substring(0, retn.Length - 1);
            return retn;
        }

        // Close the current response so the connection can be reused
        public void ResponseClose()
        {
            response.Close();
        }

        // Send a GET request to val_url
        public void GetHttp(Uri val_url)
        {
            request = (HttpWebRequest)WebRequest.Create(val_url);
            request.Method = "GET";
            request.Accept = "*/*";
            request.CookieContainer = m_cookieContainer;
            response = (HttpWebResponse)request.GetResponse();
            m_cookieContainer.Add(response.Cookies);
        }
    }
}
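As a quick aside, here is a usage sketch of this wrapper doing a POST, since the demo below only exercises GetHttp. The endpoint, form fields and GB2312 encoding are made-up assumptions purely for illustration.
using System;
using System.IO;
using System.Text;
using HttpWeb;
class PostSample
{
    static void Main()
    {
        Proxy px = new Proxy();
        // The three-argument overload URL-encodes each value with the given encoding before posting
        px.PostHttp(new Uri("http://www.example.com/login.aspx"),
                    "user=test&pwd=123456",
                    Encoding.GetEncoding("gb2312"));
        using (StreamReader sr = new StreamReader(px.Response.GetResponseStream()))
        {
            string html = sr.ReadToEnd();   // cookies from the response stay in the container for later requests
            Console.WriteLine(html.Length);
        }
        px.ResponseClose();
    }
}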
The image download demo code:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using HttpWeb;
using System.IO;
using System.Text.RegularExpressions;
namespace demo
{
    public class DownImage
    {
        private Proxy px = new Proxy();
        private string url = @"http://www.quanjing.com/GetImage.ashx?q=%E8%87%AA%E7%84%B6%E7%95%8C%7C%7C1%7C100%7C1%7C2%7C%7C%7C%7C&Fr=1&CEFlag=1&size=&sortFlag=&isScroll=0&_=1310026216505";
        private Regex reUrl = new Regex("lowsrc=\"(.+?)\"");
        private string path = @"D:\Image";

        // Fetch the page and collect the image URLs into a list
        private List<string> getUrl()
        {
            List<string> retnList = new List<string>();
            Uri urlUri = new Uri(url);
            px.GetHttp(urlUri);
            StreamReader sr = new StreamReader(px.Response.GetResponseStream());
            string pageStr = sr.ReadToEnd();
            sr.Close();
            px.ResponseClose();
            Match maUrl = reUrl.Match(pageStr);
            while (maUrl.Success)
            {
                string urlStr = maUrl.Groups[1].Value.Trim();
                retnList.Add(urlStr);
                maUrl = maUrl.NextMatch();
            }
            return retnList;
        }

        // Download every image in the list
        private void downLoad()
        {
            // Make sure the target directory exists before writing any files
            Directory.CreateDirectory(path);
            List<string> urlList = getUrl();
            foreach (string imageUrl in urlList)
            {
                Uri imageUri = new Uri(imageUrl);
                // Use everything after the last '/' as the file name, or fall back to a GUID
                Regex reFileName = new Regex(".+/(.+)");
                Match maFileName = reFileName.Match(imageUrl);
                string fileName;
                if (maFileName.Success)
                {
                    fileName = maFileName.Groups[1].Value.Trim();
                }
                else
                {
                    fileName = Guid.NewGuid().ToString();
                }
                px.GetHttp(imageUri);
                Stream sr = px.Response.GetResponseStream();
                int bufferSize = 2048;
                int readCount;
                byte[] buffer = new byte[bufferSize];
                readCount = sr.Read(buffer, 0, bufferSize);
                FileStream fs = new FileStream(Path.Combine(path, fileName), FileMode.Create);
                while (readCount > 0)
                {
                    fs.Write(buffer, 0, readCount);
                    readCount = sr.Read(buffer, 0, bufferSize);
                }
                sr.Close();
                fs.Close();
                px.ResponseClose();
            }
        }

        public void run()
        {
            downLoad();
        }
    }
}
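To run the demo, a console entry point along these lines will do; the Program harness below is my assumption and is not part of the original demo code.
using System;
namespace demo
{
    // Hypothetical console harness that drives the crawler
    class Program
    {
        static void Main(string[] args)
        {
            DownImage di = new DownImage();
            di.run();                     // images are saved under D:\Image
            Console.WriteLine("Done.");
        }
    }
}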