网页抓取无外乎 POST 和 GET 两种方式。抓取之前需要先安装 HttpWatch Professional 插件,用来分析目标网页的请求。
/// <summary>
/// Downloads a page by sending an HTTP POST request with a form-encoded body.
/// </summary>
/// <param name="url">Address of the page to request.</param>
/// <param name="param">
/// Form fields to send, as raw (not yet percent-encoded) name/value pairs.
/// May be null, in which case an empty body is posted.
/// </param>
/// <param name="encode">
/// Encoding used both to serialize the request body and to decode the response text.
/// </param>
/// <returns>The response body decoded as text.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="url"/> or <paramref name="encode"/> is null.
/// </exception>
public static string DoPost(string url, IDictionary<string, string> param, Encoding encode)
{
    if (url == null)
    {
        throw new ArgumentNullException("url");
    }
    if (encode == null)
    {
        throw new ArgumentNullException("encode");
    }

    StringBuilder paramBuilder = new StringBuilder();
    if (param != null)
    {
        foreach (KeyValuePair<string, string> pair in param)
        {
            if (paramBuilder.Length > 0)
            {
                paramBuilder.Append('&');
            }
            // The body is application/x-www-form-urlencoded, so names and values must be
            // percent-encoded (URL-encoded), not HTML-encoded; otherwise characters such as
            // '&', '=', '+', spaces and non-ASCII text corrupt the form data.
            paramBuilder.Append(HttpUtility.UrlEncode(pair.Key, encode));
            paramBuilder.Append('=');
            paramBuilder.Append(HttpUtility.UrlEncode(pair.Value, encode));
        }
    }

    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    request.ContentType = "application/x-www-form-urlencoded";
    request.Method = "POST";

    byte[] postData = encode.GetBytes(paramBuilder.ToString());
    request.ContentLength = postData.Length;
    using (Stream requestStream = request.GetRequestStream())
    {
        requestStream.Write(postData, 0, postData.Length);
    }

    // Dispose the response and the reader so the underlying connection is released
    // even when reading the body throws.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream responseStream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(responseStream, encode))
    {
        return reader.ReadToEnd();
    }
}
需要特别注意上述方法中 param 参数的设置:所传的参数名和参数值必须与 HttpWatch 插件 Stream 选项卡中显示的内容一致,否则会出现无法连接的异常。
提前分析网页,主要是为了确认请求(传进去)的编码格式与响应(传出来)的编码格式。