采集网页的代码我是这么写的:
// Downloads the page at urlstr into rePageInfo, decoding it with the system
// default encoding. On any failure rePageInfo stays empty and the error is
// written to the console instead of being silently discarded.
string rePageInfo=String.Empty;
HttpWebResponse res=null;
Stream responseStream=null;
StreamReader sr=null;
try
{
    HttpWebRequest myWebRequest=(HttpWebRequest)WebRequest.Create(urlstr);
    // A timeout surfaces as a WebException; with only 1 second allowed, slow
    // servers end up in the catch block and leave rePageInfo empty.
    myWebRequest.Timeout=1000;
    // The method must be a bare token: a trailing space ("GET ") is rejected.
    myWebRequest.Method="GET";
    res=(HttpWebResponse)myWebRequest.GetResponse();
    if(res.StatusCode==HttpStatusCode.OK)
    {
        responseStream=res.GetResponseStream();
        sr=new StreamReader(responseStream,System.Text.Encoding.Default);
        rePageInfo=sr.ReadToEnd();
    }
}
catch(Exception ex)
{
    // Still best-effort (nothing is rethrown), but the reason is now visible.
    System.Console.WriteLine("Download failed: "+ex);
}
finally
{
    // Release the reader, the stream and the response. Setting the variables
    // to null does not close the underlying connection.
    if(sr!=null)
    {
        sr.Close();
    }
    if(responseStream!=null)
    {
        responseStream.Close();
    }
    if(res!=null)
    {
        res.Close();
    }
}
采集到网页后进行分析
现在问题是,当采集的页面使用utf-8、gb2312、big5等不同的编码时,采集程序要么采集的是一片空白(没东西),要么是乱码
然后我在读取流的时候加了个识别代码,如下:
// Revised read path: choose the decoder from a charset name instead of always
// using Encoding.Default.
if(res.StatusCode==HttpStatusCode.OK)
{
// Raw Content-Type response header.
// NOTE(review): the header key literal ends with a space, which looks like a paste
// artifact; the indexer presumably returns null for an unknown header name - confirm.
string conntype=res.Headers[ "Content-Type "];
// (Code elided by the author.) Extract the content value of
// <meta http-equiv="Content-Type" content="text/html; charset=GB2312" />, then the
// charset inside it, and assign that charset to conntype.
// NOTE(review): the meta tag is part of the body, which has not been read yet at this
// point; the extraction code is not shown - verify where it gets the HTML from.
responseStream=res.GetResponseStream();
// NOTE(review): GetEncoding expects a bare charset name. If conntype is still a full
// header value such as "text/html" this presumably throws; a null conntype would fail
// on Trim(). Confirm and add a fallback encoding.
sr=new StreamReader(responseStream,System.Text.Encoding.GetEncoding(conntype.Trim()));
rePageInfo=sr.ReadToEnd();
......
......
但这样好像可以同时采集 utf-8 和 gb2312 的网页了,却不稳定:有的页面可以,有的又不行。尤其当服务器是 unix 或 linux 时,采集到的数据里的换行符都变成了小黑块。
还有就是 res.Headers[ "Content-Type "]获取的值很多都是只有“text/html”,而没有后边的charset=.+
请问哪位高手有这方面的经验?采集网页的时候应该怎么进行页面的编码转换呢?
/// <summary>
/// Downloads the resource at <c>m_uri</c> and returns its text.
/// </summary>
/// <remarks>
/// The body is fetched once and kept as raw bytes. It is first decoded with
/// <c>Encoding.Default</c> to find the charset the page declares (via
/// <c>Tools.GetEncoding</c>). When that resolves to a different encoding the
/// same bytes are decoded again, so no second HTTP request is made.
/// Responses whose content type does not start with "text/" are handed to
/// <c>SaveBinaryFile</c> instead.
/// As in the original, text decoded with the default encoding has every line
/// terminated with CRLF, while text decoded with a detected encoding keeps the
/// line endings sent by the server.
/// </remarks>
/// <returns>
/// The page text, or <c>null</c> when the response is not text or when a
/// <c>WebException</c> or <c>IOException</c> occurs (the error is written to
/// the console).
/// </returns>
private string GetPage()
{
    WebResponse response = null;
    Stream stream = null;
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(m_uri);
        response = request.GetResponse();

        string contentType = response.ContentType ?? "";
        if (!contentType.StartsWith("text/", System.StringComparison.OrdinalIgnoreCase))
        {
            SaveBinaryFile(response);
            return null;
        }

        // Keep the raw bytes so the page can be re-decoded without downloading it again.
        stream = response.GetResponseStream();
        byte[] raw = ReadResponseBytes(stream);

        // First pass: decode with the default encoding. The charset declaration is
        // plain ASCII, so it is readable even when the rest of the text is garbled.
        string buffer = DecodeAsCrLfLines(raw, Encoding.Default);

        Encoding encoding = ResolvePageEncoding(Tools.GetEncoding(buffer));
        if (encoding != Encoding.Default)
        {
            using (StreamReader reader = new StreamReader(new MemoryStream(raw), encoding))
            {
                buffer = reader.ReadToEnd();
            }
        }
        return buffer;
    }
    catch (WebException e)
    {
        System.Console.WriteLine("Can't download: " + e);
        return null;
    }
    catch (IOException e)
    {
        System.Console.WriteLine("Can't download: " + e);
        return null;
    }
    finally
    {
        if (stream != null)
        {
            stream.Close();
        }
        if (response != null)
        {
            response.Close();
        }
    }
}

/// <summary>
/// Reads <paramref name="source"/> to its end and returns everything read.
/// </summary>
private static byte[] ReadResponseBytes(Stream source)
{
    using (MemoryStream copy = new MemoryStream())
    {
        byte[] chunk = new byte[8192];
        int read;
        while ((read = source.Read(chunk, 0, chunk.Length)) > 0)
        {
            copy.Write(chunk, 0, read);
        }
        return copy.ToArray();
    }
}

/// <summary>
/// Decodes <paramref name="raw"/> line by line, terminating every line with CRLF.
/// </summary>
private static string DecodeAsCrLfLines(byte[] raw, Encoding encoding)
{
    System.Text.StringBuilder text = new System.Text.StringBuilder();
    using (StreamReader reader = new StreamReader(new MemoryStream(raw), encoding))
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            text.Append(line).Append("\r\n");
        }
    }
    return text.ToString();
}

/// <summary>
/// Maps a charset name to an <c>Encoding</c>, falling back to
/// <c>Encoding.Default</c> when the name is empty or not supported.
/// </summary>
private static Encoding ResolvePageEncoding(string charsetName)
{
    string name = (charsetName ?? "").Trim();
    if (name.Length == 0)
    {
        return Encoding.Default;
    }
    if (string.Equals(name, "UTF-8", System.StringComparison.OrdinalIgnoreCase))
    {
        return Encoding.UTF8;
    }
    if (string.Equals(name, "UTF-7", System.StringComparison.OrdinalIgnoreCase))
    {
        return Encoding.UTF7;
    }
    if (string.Equals(name, "UNICODE", System.StringComparison.OrdinalIgnoreCase))
    {
        return Encoding.Unicode;
    }
    try
    {
        // Covers gb2312, big5 and any other charset the platform knows about.
        return Encoding.GetEncoding(name);
    }
    catch (System.ArgumentException)
    {
        // Unknown or unsupported charset name: keep the previous behaviour.
        return Encoding.Default;
    }
}
// Matches charset=NAME, charset="NAME" or charset='NAME' and captures NAME in group 1.
// The name ends at whitespace, a quote, '/', '>' or ';', so markup that follows the
// value (for example the closing "/> of a meta tag) is never captured.
private static readonly Regex CharsetPattern = new Regex(
    @"charset\s*=\s*[""']?(?<1>[^\s""'/>;]+)",
    RegexOptions.IgnoreCase | RegexOptions.CultureInvariant | RegexOptions.Compiled);

/// <summary>
/// Finds the character set declared in an HTML document or a Content-Type value.
/// </summary>
/// <param name="inputString">HTML text or header value to search; may be null or empty.</param>
/// <returns>
/// The first declared charset name in upper case (for example "UTF-8" or "GB2312"),
/// or an empty string when no charset declaration is found.
/// </returns>
public static string GetEncoding(string inputString)
{
    if (string.IsNullOrEmpty(inputString))
    {
        return "";
    }

    Match m = CharsetPattern.Match(inputString);
    if (!m.Success)
    {
        return "";
    }

    // Invariant upper-casing keeps names such as "iso-8859-1" stable under every
    // culture (the Turkish culture upper-cases 'i' to a dotted capital I).
    return m.Groups[1].Value.ToUpperInvariant();
}