/// <summary>
/// Downloads the page at <paramref name="url"/>, strips the ASP.NET
/// <c>__VIEWSTATE</c> hidden input from the markup, and saves the result to
/// <paramref name="filepath"/> as UTF-8 text.
/// Writes "ok" to the HTTP response once the file has been written; if writing
/// fails, the exception message is written to the response and the response is ended.
/// </summary>
/// <param name="filepath">Path of the file to create or overwrite with the downloaded HTML.</param>
/// <param name="url">Address of the page to download.</param>
/// <remarks>
/// The download runs outside the try block, so download errors propagate to the caller
/// instead of being reported through the response.
/// </remarks>
public void GetRemoteHtmlCode(string filepath, string url)
{
    string myDataBuffer;

    // WebClient is IDisposable; release it as soon as the download is done.
    using (WebClient myWebClient = new WebClient())
    {
        myWebClient.Credentials = CredentialCache.DefaultCredentials;
        byte[] pagedata = myWebClient.DownloadData(url);
        // The downloaded bytes are always decoded as UTF-8.
        myDataBuffer = Encoding.UTF8.GetString(pagedata);
    }

    // Remove the __VIEWSTATE hidden field from the markup.
    string pattern = @"<input type=""hidden"" name=""__VIEWSTATE"" id=[^>][^>]*>";
    System.Text.RegularExpressions.Regex reg = new System.Text.RegularExpressions.Regex(pattern, System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    myDataBuffer = reg.Replace(myDataBuffer, "");

    try
    {
        // Overwrite (append: false) the target file; the using statement
        // flushes and closes the writer even when WriteLine throws.
        using (StreamWriter sw = new StreamWriter(filepath, false, Encoding.UTF8))
        {
            sw.WriteLine(myDataBuffer);
        }

        // Report success only after the file has been flushed and closed.
        Response.Write("ok");
    }
    catch (Exception ex)
    {
        HttpContext.Current.Response.Write(ex.Message);
        // Response.End() normally aborts the current request thread, so the
        // Response.Write("no") that used to follow it never ran; it was removed
        // to keep the observable output (the exception message) unchanged.
        HttpContext.Current.Response.End();
    }
}

本文介绍了一个抓取网页 HTML 代码并保存为本地文件的方法:使用 WebClient 下载指定 URL 的内容,用正则表达式去除不需要的部分(如 __VIEWSTATE 隐藏字段),最后将处理后的 HTML 字符串以 UTF-8 编码写入指定路径。
719

被折叠的 条评论
为什么被折叠?



