GetPageHtml.aspx
<%@ Page language="c#" validateRequest = "false" Codebehind="GetPageHtml.aspx.cs" AutoEventWireup="false" Inherits="eMeng.Exam.GetPageHtml" %> <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" > <HTML> <HEAD> <title>得到网页源代码</title> <meta name="GENERATOR" Content="Microsoft Visual Studio 7.0"> <meta name="CODE_LANGUAGE" Content="C#"> <meta name="vs_defaultClientScript" content="JavaScript"> <meta name="vs_targetSchema" content="http://schemas.microsoft.com/intellisense/ie5"> </HEAD> <body MS_POSITIONING="GridLayout"> <form id="aspNetBuffer" method="post" runat="server"> <div align="center" style="FONT-WEIGHT: bold">得到任意网页源代码</div> <asp:TextBox id="UrlText" runat="server" Width="400px">http://dotnet.aspx.cc/content.aspx </asp:TextBox> <asp:Button id="WebClientButton" Runat="server" Text="用WebClient得到"></asp:Button> <asp:Button id="WebRequestButton" runat="server" Text="用WebRequest得到"></asp:Button> <br> <asp:TextBox id="ContentHtml" runat="server" Width="100%" Height="360px" TextMode="MultiLine"> </asp:TextBox> </form> </body> </HTML>
GetPageHtml.aspx.cs
using System;
using System.Collections;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Web;
using System.Web.SessionState;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.HtmlControls;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
namespace eMeng.Exam
{
/// <summary>
/// GetPageHtml 的摘要说明。
/// </summary>
public class GetPageHtml : System.Web.UI.Page
{
protected System.Web.UI.WebControls.Button WebClientButton;
protected System.Web.UI.WebControls.Button WebRequestButton;
protected System.Web.UI.WebControls.TextBox ContentHtml;
protected System.Web.UI.WebControls.TextBox UrlText;
protected System.Web.UI.WebControls.Button GetText;
private string PageUrl = "";
private void Page_Load(object sender, System.EventArgs e)
{}
#region Web Form Designer generated code
override protected void OnInit(EventArgs e)
{
InitializeComponent();
base.OnInit(e);
}
/// <summary>
/// 设计器支持所需的方法 - 不要使用代码编辑器修改
/// 此方法的内容。
/// </summary>
private void InitializeComponent()
{
this.WebClientButton.Click += new System.EventHandler(this.WebClientButton_Click);
this.WebRequestButton.Click += new System.EventHandler(this.WebRequestButton_Click);
this.GetText.Click += new System.EventHandler(this.GetText_Click);
this.Load += new System.EventHandler(this.Page_Load);
}
#endregion
private void WebClientButton_Click(object sender, System.EventArgs e)
{
PageUrl = UrlText.Text;
WebClient wc = new WebClient();
wc.Credentials = CredentialCache.DefaultCredentials;
///方法一:
Byte[] pageData = wc.DownloadData(PageUrl);
ContentHtml.Text = Encoding.Default.GetString(pageData);
/// 方法二:
/// ***************代码开始**********
/// Stream resStream = wc.OpenRead(PageUrl);
/// StreamReader sr = new StreamReader(resStream,System.Text.Encoding.Default);
/// ContentHtml.Text = sr.ReadToEnd();
/// resStream.Close();
/// **************代码结束********
///
wc.Dispose();
}
private void WebRequestButton_Click(object sender, System.EventArgs e)
{
PageUrl = UrlText.Text;
WebRequest request = WebRequest.Create(PageUrl);
WebResponse response = request.GetResponse();
Stream resStream = response.GetResponseStream();
StreamReader sr = new StreamReader(resStream, System.Text.Encoding.Default);
ContentHtml.Text = sr.ReadToEnd();
resStream.Close();
sr.Close();
}
private void GetText_Click(object sender, System.EventArgs e)
{
PageUrl = UrlText.Text;
WebRequest request = WebRequest.Create(PageUrl);
WebResponse response = request.GetResponse();
Stream resStream = response.GetResponseStream();
StreamReader sr = new StreamReader(resStream, System.Text.Encoding.Default);
ContentHtml.Text = sr.ReadToEnd();
resStream.Close();
sr.Close();
ContentHtml.Text = Regex.Replace(ContentHtml.Text,"<[^>]*>", "");
//替换空格
ContentHtml.Text = Regex.Replace(ContentHtml.Text,"//s+", " ");
}
}
}
某些网站的修正代码
string PageUrl = "http://www.blogcn.com/rss2/06sg.xml";
System.Net.HttpWebRequest request = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(PageUrl);
request.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.1) Web-Sniffer/1.0.24";
System.Net.WebResponse response = request.GetResponse();
System.IO.Stream resStream = response.GetResponseStream();
System.IO.StreamReader sr = new System.IO.StreamReader(resStream, System.Text.Encoding.Default);
ContentHtml.Text = sr.ReadToEnd();
resStream.Close();
sr.Close();
string PageUrl = "http://www.blogcn.com/rss2/06sg.xml";
System.Net.HttpWebRequest request = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(PageUrl);
request.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.1) Web-Sniffer/1.0.24";
System.Net.WebResponse response = request.GetResponse();
System.IO.Stream resStream = response.GetResponseStream();
System.IO.StreamReader sr = new System.IO.StreamReader(resStream, System.Text.Encoding.Default);
ContentHtml.Text = sr.ReadToEnd();
resStream.Close();
sr.Close();
关于协议冲突:在ASP.NET 2.0里使用web.config设置:
<system.net>
<settings>
<httpWebRequest useUnsafeHeaderParsing = "true"/>
</settings>
</system.net>
<system.net>
<settings>
<httpWebRequest useUnsafeHeaderParsing = "true"/>
</settings>
</system.net>
本文介绍了一种使用ASP.NET实现的简单方法来获取指定URL的网页源代码。通过两种方式实现:利用WebClient类和WebRequest类。同时展示了如何解析并显示这些内容。
2336

被折叠的 条评论
为什么被折叠?



