抓取网页数据

本文介绍了一个从中信银行、工商银行、中国银行、建设银行四家银行网站抓取外汇牌价(英镑、美元、日元、欧元的汇买价与汇卖价)的程序,程序会计算各银行的中间价以及四家银行的平均价,并通过网页表格展示这些数据。程序使用C#(ASP.NET)实现,涉及网页内容抓取、正则表达式匹配等技术。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

using System;
using System.Data;
using System.Configuration;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.IO;
using System.Collections;

/// <summary>
/// Code-behind page that downloads the foreign-exchange quotation pages of four banks
/// (China CITIC Bank, ICBC, Bank of China, China Construction Bank), extracts the
/// buy/sell quotes for GBP, USD, JPY and EUR with regular expressions, and writes an
/// HTML table containing each bank's quotes plus a row averaged over the four banks.
/// </summary>
/// <remarks>
/// Every quote list produced by the <c>Get*Data</c> methods has the same layout:
/// indexes 0/1 = GBP buy/sell, 2/3 = USD buy/sell, 4/5 = JPY buy/sell, 6/7 = EUR buy/sell.
/// </remarks>
public partial class _Default : System.Web.UI.Page
{
    /// <summary>Number of currencies shown in the table (GBP, USD, JPY, EUR).</summary>
    private const int CurrencyCount = 4;

    /// <summary>Number of values in one bank's quote list: a buy and a sell price per currency.</summary>
    private const int QuotesPerBank = CurrencyCount * 2;

    /// <summary>Number of banks whose quotes are summed for the averaged row.</summary>
    private const int BankCount = 4;

    /// <summary>Regex options shared by all scraping patterns.</summary>
    private const RegexOptions ScrapeOptions = RegexOptions.IgnoreCase | RegexOptions.Multiline;

    /// <summary>
    /// Lazy "one or more of any character, including newlines". Matches the same text as
    /// the "(.|\n)+?" form without the capturing alternation.
    /// </summary>
    private const string AnyLazy = "[\\s\\S]+?";

    /// <summary>Page load handler: renders the quotation table on every request.</summary>
    protected void Page_Load(object sender, EventArgs e)
    {
        ShowInfo();
    }

    #region Rendering

    /// <summary>
    /// Downloads the four quotation pages, parses them, computes the per-currency
    /// averages over the four banks and writes the resulting HTML table to the response.
    /// Any failure is reported to the response as an error message instead of the table.
    /// </summary>
    private void ShowInfo()
    {
        string sEciticUrl = "http://www.ecitic.com/paijia.jsp";
        string sICBCUrl = "http://www.icbc.com.cn/ICBCDynamicSite/Optimize/Quotation/QuotationListIframe.aspx";
        string sBOCUrl = "http://www.boc.cn/sourcedb/whpj/";
        string sCCBUrl = "https://ibsbjstar.ccb.com.cn/app/B2CMainPlat?TXCODE=R00002";

        string[] sMoneyTypeList = new string[] { "英镑", "美元", "日元", "欧元" };
        string[] sBankList = new string[] { "中信银行", "中国工商银行", "中国银行", "中国建设银行", "财务公司" };

        try
        {
            // Download everything first, then parse, so that the order of network
            // requests does not depend on parsing results.
            string sGetEciticHtml = GetPageData(sEciticUrl, null);
            string sGetICBCHtml = GetPageData(sICBCUrl, null);
            string sGetBOCHtml = GetPageData(sBOCUrl, null);
            string sGetCCBHtml = GetPageData(sCCBUrl);

            ArrayList arylstEcitic = GetEciticData(sGetEciticHtml); // China CITIC Bank
            ArrayList arylstICBC = GetICBCData(sGetICBCHtml);       // ICBC
            ArrayList arylstBOC = GetBOCData(sGetBOCHtml);          // Bank of China
            ArrayList arylstCCB = GetCCBData(sGetCCBHtml);          // China Construction Bank

            // Fail with a message naming the bank when a page could not be parsed,
            // rather than with a bare index-out-of-range error further down.
            EnsureQuoteCount(arylstEcitic, sBankList[0]);
            EnsureQuoteCount(arylstICBC, sBankList[1]);
            EnsureQuoteCount(arylstBOC, sBankList[2]);
            EnsureQuoteCount(arylstCCB, sBankList[3]);

            // Per-slot sum over the four banks.
            double[] dSums = new double[QuotesPerBank];
            for (int m = 0; m < QuotesPerBank; m++)
            {
                dSums[m] = ParseQuote(arylstEcitic[m]) + ParseQuote(arylstICBC[m])
                         + ParseQuote(arylstBOC[m]) + ParseQuote(arylstCCB[m]);
            }

            ArrayList arylstAverage = new ArrayList();     // averaged middle price
            ArrayList arylstBuyAverage = new ArrayList();  // averaged buy price
            ArrayList arylstSellAverage = new ArrayList(); // averaged sell price

            // Slots come in (buy, sell) pairs, one pair per currency.
            for (int n = 0; n + 1 < QuotesPerBank; n += 2)
            {
                double dBuyAverage = dSums[n] / BankCount;
                double dSellAverage = dSums[n + 1] / BankCount;
                arylstBuyAverage.Add(dBuyAverage);
                arylstSellAverage.Add(dSellAverage);
                arylstAverage.Add((dBuyAverage + dSellAverage) / 2);
            }

            StringBuilder sb = new StringBuilder();
            sb.Append("<table cellpadding=\"0px\" cellspacing=\"0px\"><tr><td rowspan=\"2\">银行/币种</td>");
            for (int j = 0; j < sMoneyTypeList.Length; j++)
            {
                sb.Append("<td colspan=\"3\" align=\"center\">" + sMoneyTypeList[j] + "</td>");
            }
            sb.Append("</tr><tr>");
            for (int x = 0; x < sMoneyTypeList.Length; x++)
            {
                sb.Append("<td>汇买价</td>");
                sb.Append("<td>汇卖价</td>");
                sb.Append("<td>中间价</td>");
            }
            sb.Append("</tr>");

            sb.Append(CreateTD(sBankList[0], arylstEcitic));
            sb.Append(CreateTD(sBankList[1], arylstICBC));
            sb.Append(CreateTD(sBankList[2], arylstBOC));
            sb.Append(CreateTD(sBankList[3], arylstCCB));

            // Last data row: averages over the four banks.
            sb.Append("<tr><td>" + sBankList[4] + "</td>");
            for (int k = 0; k < CurrencyCount; k++)
            {
                sb.Append("<td>" + arylstBuyAverage[k].ToString() + "</td>");
                sb.Append("<td>" + arylstSellAverage[k].ToString() + "</td>");
                sb.Append("<td>" + arylstAverage[k].ToString() + "</td>");
            }
            sb.Append("</tr>");

            // The published listing had editor residue ("mce_style") inside this literal
            // that broke the string; this is the repaired attribute.
            sb.Append("<tr><td colspan=\"13\" style=\"text-align: right;\">人民币/100外币</td></tr></table>");
            Response.Write(sb.ToString());
        }
        catch (Exception ex)
        {
            Response.Write("错误提示:" + ex.Message);
        }
    }

    /// <summary>
    /// Builds one table row: a label cell followed by buy, sell and middle price
    /// (arithmetic mean of buy and sell) for each currency.
    /// </summary>
    /// <param name="sMoneyType">Text of the first cell; callers pass the bank name.</param>
    /// <param name="arylst">Quote list with <see cref="QuotesPerBank"/> entries in (buy, sell) pairs.</param>
    /// <returns>The HTML of the row, from &lt;tr&gt; to &lt;/tr&gt;.</returns>
    private string CreateTD(string sMoneyType, ArrayList arylst)
    {
        StringBuilder sb = new StringBuilder();
        sb.Append("<tr><td>" + sMoneyType + "</td>");
        for (int i = 0; i + 1 < QuotesPerBank; i += 2)
        {
            sb.Append("<td>" + arylst[i].ToString() + "</td>");
            sb.Append("<td>" + arylst[i + 1].ToString() + "</td>");
            sb.Append("<td>" + (ParseQuote(arylst[i]) + ParseQuote(arylst[i + 1])) / 2 + "</td>");
        }
        sb.Append("</tr>");
        return sb.ToString();
    }

    /// <summary>
    /// Converts a quote (a scraped string or an already-numeric value) to a double using
    /// the invariant culture, so that "." is read as the decimal separator regardless of
    /// the server's regional settings.
    /// </summary>
    private static double ParseQuote(object value)
    {
        return Convert.ToDouble(value, System.Globalization.CultureInfo.InvariantCulture);
    }

    /// <summary>
    /// Throws when a bank's quote list has fewer than <see cref="QuotesPerBank"/> entries.
    /// </summary>
    /// <exception cref="InvalidOperationException">The list is null or too short.</exception>
    private static void EnsureQuoteCount(ArrayList arylst, string sBankName)
    {
        if (arylst == null || arylst.Count < QuotesPerBank)
        {
            throw new InvalidOperationException(sBankName + "牌价数据不完整,页面结构可能已变化");
        }
    }

    /// <summary>
    /// Splits <paramref name="sJoined"/> on '|' and appends the fields at the two given
    /// indexes (in ascending order) to <paramref name="target"/>. A missing field is
    /// skipped; <see cref="EnsureQuoteCount"/> reports the shortfall later.
    /// </summary>
    private static void AppendFields(ArrayList target, string sJoined, int iFirst, int iSecond)
    {
        string[] sData = sJoined.Split('|');
        if (iFirst < sData.Length)
        {
            target.Add(sData[iFirst]);
        }
        if (iSecond < sData.Length)
        {
            target.Add(sData[iSecond]);
        }
    }

    #endregion

    #region Helper methods

    /// <summary>Counts href/src/action attributes in the page and returns a status line.</summary>
    private string GetHrefCount(string strResponse)
    {
        // Not every form of link is recognised by this pattern (noted by the original author).
        string strRef = @"(href|HREF|src|SRC|action|ACTION|Action)[ ]*=[ ]*[""'][^""'#>]+[""']";
        MatchCollection matches = new Regex(strRef).Matches(strResponse);
        return "找到: " + matches.Count + " 个链接\r\n";
    }

    /// <summary>Returns the text of the page's &lt;title&gt; element, or an empty string when none matches.</summary>
    private string GetHtmlTitle(string strResponse)
    {
        Match TitleMatch = Regex.Match(strResponse, "<title>([^<]*)</title>", ScrapeOptions);
        return TitleMatch.Groups[1].Value;
    }

    /// <summary>Returns the content of the DESCRIPTION meta tag, or an empty string when none matches.</summary>
    private string GetMetaInfo(string strResponse)
    {
        Match Desc = Regex.Match(strResponse, "<Meta name=\"DESCRIPTION\" content=\"([^<]*)\">", ScrapeOptions);
        return Desc.Groups[1].Value;
    }

    /// <summary>
    /// Downloads <paramref name="url"/> and decodes the bytes with the encoding returned
    /// by <c>Encoding.GetEncoding(0)</c>. Returns null for a null or blank URL.
    /// </summary>
    private string GetEncodInfo(string url)
    {
        if (url == null || url.Trim() == "")
        {
            return null;
        }

        using (WebClient wc = new WebClient())
        {
            wc.Credentials = CredentialCache.DefaultCredentials;
            Byte[] pageData = wc.DownloadData(url);
            return Encoding.GetEncoding(0).GetString(pageData);
        }
    }

    /// <summary>Removes HTML tags from <paramref name="strHtml"/>.</summary>
    private string StripHtml(string strHtml)
    {
        Regex objRegExp = new Regex("<" + AnyLazy + ">");
        string strOutput = objRegExp.Replace(strHtml, "");

        // NOTE(review): as published, the next two calls replace a character with itself
        // and therefore do nothing. One side of each pair was presumably an HTML entity
        // ("&lt;" / "&gt;") that was decoded when the listing was posted; whether the
        // intent was to encode or decode cannot be determined here, so the statements are
        // kept unchanged - confirm against the original project before relying on them.
        strOutput = strOutput.Replace("<", "<");
        strOutput = strOutput.Replace(">", ">");
        return strOutput;
    }

    #endregion

    #region China CITIC Bank quotes

    /// <summary>
    /// Extracts the quotation cells of one currency from the CITIC quotation page.
    /// </summary>
    /// <param name="strResponse">HTML of the quotation page.</param>
    /// <param name="sMoneyType">Currency name as shown in the first cell of its row.</param>
    /// <returns>The matched cells, each followed by "&lt;br/&gt;".</returns>
    private string GetEciticHtml(string strResponse, string sMoneyType)
    {
        // Step 1: collect every data row, terminating each with a <br/> marker.
        StringBuilder sbRows = new StringBuilder();
        foreach (Match mRow in Regex.Matches(strResponse, "<tr class=\"01\">" + AnyLazy + "</tr>", ScrapeOptions))
        {
            sbRows.Append(mRow.Value).Append("<br/>");
        }

        // Step 2: keep the row(s) whose first cell starts with the currency name.
        // The name is matched literally; wrapping it in [...] would accept any single
        // character of the name instead.
        StringBuilder sbCurrencyRows = new StringBuilder();
        string sRowPattern = "<td width=\"80\" height=\"20\" class=\"command\">" + Regex.Escape(sMoneyType) + AnyLazy + "<br/>";
        foreach (Match mHtml in Regex.Matches(sbRows.ToString(), sRowPattern, ScrapeOptions))
        {
            sbCurrencyRows.Append(mHtml.Value);
        }

        // Step 3: pull out the centred value cells.
        StringBuilder sbCells = new StringBuilder();
        foreach (Match mCell in Regex.Matches(sbCurrencyRows.ToString(), "<td align=\"center\" class=\"command\">" + AnyLazy + "</td>", ScrapeOptions))
        {
            sbCells.Append(mCell.Value).Append("<br/>");
        }
        return sbCells.ToString();
    }

    /// <summary>
    /// Parses the CITIC quotation page into a quote list (see the class remarks for the layout).
    /// </summary>
    /// <param name="strResponse">HTML of the quotation page.</param>
    /// <returns>The extracted values as strings; fewer than eight when a row was not found.</returns>
    private ArrayList GetEciticData(string strResponse)
    {
        ArrayList arylst = new ArrayList();
        string[] sMoneyTypeList = new string[] { "英镑", "美元", "日元", "欧元" };
        for (int m = 0; m < sMoneyTypeList.Length; m++)
        {
            string sGetHtml = GetEciticHtml(strResponse, sMoneyTypeList[m])
                .Replace("<td align=\"center\" class=\"command\">", "")
                .Replace("</td><br/>", "|")
                .TrimEnd('|');
            // Cells 1 and 4 of the row are the ones used as buy and sell price.
            AppendFields(arylst, sGetHtml, 1, 4);
        }
        return arylst;
    }

    #endregion

    #region ICBC quotes

    /// <summary>
    /// Extracts the quotation cells of one currency from the ICBC quotation page.
    /// </summary>
    /// <param name="strResponse">HTML of the quotation page.</param>
    /// <param name="sMoneyType">Currency code used inside a regex after a word boundary (e.g. "GBP").</param>
    /// <returns>The concatenated matched cell fragments.</returns>
    private string GetICBCHtml(string strResponse, string sMoneyType)
    {
        // Normalise the page: drop tabs and semicolons and rewrite "16%" as "14%"
        // so that all value cells can be found with the single "14%" pattern below.
        string sNormalized = strResponse.Replace("\t", "").Replace("16%", "14%").Replace(";", "");

        StringBuilder sbRows = new StringBuilder();
        foreach (Match mRow in Regex.Matches(sNormalized, "\\b" + sMoneyType + AnyLazy + "</tr>", ScrapeOptions))
        {
            sbRows.Append(mRow.Value);
        }

        StringBuilder sbCells = new StringBuilder();
        foreach (Match mCell in Regex.Matches(sbRows.ToString(), "\\b14%\">" + AnyLazy + "</td>", ScrapeOptions))
        {
            sbCells.Append(mCell.Value);
        }
        return sbCells.ToString();
    }

    /// <summary>
    /// Parses the ICBC quotation page into a quote list (see the class remarks for the layout).
    /// </summary>
    /// <param name="strResponse">HTML of the quotation page.</param>
    /// <returns>The extracted values as strings; fewer than eight when a row was not found.</returns>
    private ArrayList GetICBCData(string strResponse)
    {
        ArrayList arylst = new ArrayList();
        string[] sMoneyTypeList = new string[] { "GBP", "USD", "JPY", "EUR" };
        for (int m = 0; m < sMoneyTypeList.Length; m++)
        {
            string sGetHtml = GetICBCHtml(strResponse, sMoneyTypeList[m])
                .Replace("14%\">", "")
                .Replace("</td>", "|")
                .TrimEnd('|');
            // Cells 1 and 3 are the ones used as buy and sell price.
            AppendFields(arylst, sGetHtml, 1, 3);
        }
        return arylst;
    }

    #endregion

    #region Bank of China quotes

    /// <summary>
    /// Extracts the table row of one currency from the Bank of China quotation page.
    /// </summary>
    /// <param name="strResponse">HTML of the quotation page.</param>
    /// <param name="sMoneyType">Currency name as shown in the first cell of its row.</param>
    /// <returns>The matched row(s) with spaces, newlines and tabs removed.</returns>
    private string GetBOCHtml(string strResponse, string sMoneyType)
    {
        // Whitespace is stripped from every row, which is why the follow-up pattern
        // looks for "<tdbgcolor" rather than "<td bgcolor".
        StringBuilder sbRows = new StringBuilder();
        foreach (Match mRow in Regex.Matches(strResponse, "<tr align=\"center\">" + AnyLazy + "</tr>", ScrapeOptions))
        {
            sbRows.Append(mRow.Value.Replace(" ", "").Replace("\n", "").Replace("\t", "").Trim());
        }

        // The currency name is matched literally (see GetEciticHtml for the reason).
        StringBuilder sbCurrencyRows = new StringBuilder();
        string sRowPattern = "<tdbgcolor=\"#FFFFFF\">" + Regex.Escape(sMoneyType) + AnyLazy + "</td></tr>";
        foreach (Match mHtml in Regex.Matches(sbRows.ToString(), sRowPattern, ScrapeOptions))
        {
            sbCurrencyRows.Append(mHtml.Value);
        }
        return sbCurrencyRows.ToString();
    }

    /// <summary>
    /// Parses the Bank of China quotation page into a quote list (see the class remarks for the layout).
    /// </summary>
    /// <param name="strResponse">HTML of the quotation page.</param>
    /// <returns>The extracted values as strings; fewer than eight when a row was not found.</returns>
    private ArrayList GetBOCData(string strResponse)
    {
        ArrayList arylst = new ArrayList();
        string[] sMoneyTypeList = new string[] { "英镑", "美元", "日元", "欧元" };
        for (int m = 0; m < sMoneyTypeList.Length; m++)
        {
            string sGetHtml = GetBOCHtml(strResponse, sMoneyTypeList[m])
                .Replace("<tdbgcolor=\"#FFFFFF\">", "")
                .Replace("</td>", "|")
                .Replace("</tr>", "")
                .TrimEnd('|');
            // Cells 1 and 3 are the ones used as buy and sell price.
            AppendFields(arylst, sGetHtml, 1, 3);
        }
        return arylst;
    }

    #endregion

    #region China Construction Bank quotes

    /// <summary>
    /// Extracts the quotation cells of one currency from the CCB quotation page.
    /// </summary>
    /// <param name="strResponse">HTML of the quotation page.</param>
    /// <param name="sMoneyType">
    /// A regular-expression fragment (not a literal) identifying the currency's row;
    /// it is inserted into the pattern unescaped on purpose.
    /// </param>
    /// <returns>The concatenated matched cell fragments.</returns>
    private string GetCCBHtml(string strResponse, string sMoneyType)
    {
        StringBuilder sbRows = new StringBuilder();
        foreach (Match mRow in Regex.Matches(strResponse, "\\b" + sMoneyType + AnyLazy + "</tr>", ScrapeOptions))
        {
            sbRows.Append(mRow.Value.Replace("\t", "").Trim());
        }

        StringBuilder sbCells = new StringBuilder();
        foreach (Match mCell in Regex.Matches(sbRows.ToString(), "\\bVerdana\">" + AnyLazy + "</span></div>", ScrapeOptions))
        {
            sbCells.Append(mCell.Value);
        }
        return sbCells.ToString();
    }

    /// <summary>
    /// Parses the CCB quotation page into a quote list (see the class remarks for the layout).
    /// Each scraped value is multiplied by 100 before being stored.
    /// </summary>
    /// <param name="strResponse">HTML of the quotation page.</param>
    /// <returns>The extracted values as boxed doubles.</returns>
    /// <exception cref="FormatException">A scraped cell is not a number (including when nothing matched).</exception>
    private ArrayList GetCCBData(string strResponse)
    {
        ArrayList arylst = new ArrayList();
        // Regex fragments matching tranCode("826") etc.; presumably the numeric currency
        // codes for GBP, USD, JPY and EUR in that order - verify against the page.
        string[] sMoneyTypeList = new string[] { "tranCode\\(\"826\"\\)", "tranCode\\(\"840\"\\)", "tranCode\\(\"392\"\\)", "tranCode\\(\"978\"\\)" };
        for (int m = 0; m < sMoneyTypeList.Length; m++)
        {
            string sGetHtml = GetCCBHtml(strResponse, sMoneyTypeList[m])
                .Replace("Verdana\">", "")
                .Replace("</span></div>", "|")
                .TrimEnd('|');
            string[] sData = sGetHtml.Split('|');
            // The first two cells are used as buy and sell price. The factor 100 brings
            // them in line with the table footer ("人民币/100外币").
            for (int n = 0; n < sData.Length && n < 2; n++)
            {
                arylst.Add(ParseQuote(sData[n]) * 100);
            }
        }
        return arylst;
    }

    #endregion

    #region Page download

    /// <summary>
    /// Downloads a page and decodes it to text.
    /// </summary>
    /// <param name="url">Address of the page. A null or blank value yields an empty string.</param>
    /// <param name="charSet">
    /// Name of the page's character set. When null or empty, the charset declared in the
    /// page's meta tag is used, or "UTF-8" when the page declares none.
    /// </param>
    /// <returns>The decoded page text.</returns>
    /// <exception cref="ArgumentException">A caller-supplied <paramref name="charSet"/> is not a known encoding name.</exception>
    private string GetPageData(string url, string charSet)
    {
        if (url == null || url.Trim() == "")
        {
            return "";
        }

        // Some pages may need cookies or other headers to download; add them to the
        // WebClient here if a site requires it.
        byte[] myDataBuffer;
        using (WebClient myWebClient = new WebClient())
        {
            myWebClient.Credentials = CredentialCache.DefaultCredentials;
            myDataBuffer = myWebClient.DownloadData(url);
        }

        // First pass with the system default encoding: enough to read the ASCII meta tag.
        string strWebData = Encoding.Default.GetString(myDataBuffer);

        bool bSniffed = string.IsNullOrEmpty(charSet);
        if (bSniffed)
        {
            // Capture only the charset token itself, so that surrounding quotes or
            // following attributes never end up in the encoding name.
            Match charSetMatch = Regex.Match(strWebData, "<meta[^<]*charset\\s*=\\s*[\"']?([\\w\\-]+)", ScrapeOptions);
            string webCharSet = charSetMatch.Groups[1].Value;
            charSet = webCharSet.Length == 0 ? "UTF-8" : webCharSet;
        }

        Encoding pageEncoding;
        try
        {
            pageEncoding = Encoding.GetEncoding(charSet);
        }
        catch (ArgumentException)
        {
            if (!bSniffed)
            {
                throw; // an invalid name passed by the caller is a programming error
            }
            // The page declared a charset this runtime does not know: keep the
            // default-encoding text rather than failing the whole request.
            return strWebData;
        }

        // Decode again only when the real encoding differs from the first pass.
        if (pageEncoding.CodePage != Encoding.Default.CodePage)
        {
            strWebData = pageEncoding.GetString(myDataBuffer);
        }
        return strWebData;
    }

    /// <summary>
    /// Downloads a page with an HTTP request that sends a browser user agent, decoding
    /// the body with the system default encoding.
    /// </summary>
    /// <param name="url">Address of the page. A null or empty value yields an empty string.</param>
    /// <returns>
    /// The page text, or an empty string when the request fails (the failure message is
    /// written to the response).
    /// </returns>
    private string GetPageData(string url)
    {
        string responseData = "";
        if (string.IsNullOrEmpty(url))
        {
            return responseData;
        }

        try
        {
            HttpWebRequest Req = (HttpWebRequest)WebRequest.Create(new System.Uri(url));
            Req.UserAgent = "Mozilla/4.0(compatible;MSIE 6.0;Windows NT 5.0; .NET CLR 1.1.4322)";
            Req.Timeout = 30000;
            // Dispose response and reader even when reading fails half-way.
            using (WebResponse webResponse = Req.GetResponse())
            using (StreamReader responseReader = new StreamReader(webResponse.GetResponseStream(), Encoding.Default))
            {
                responseData = responseReader.ReadToEnd();
            }
        }
        catch (Exception ex)
        {
            // Best effort: report the problem on the page and continue with empty data.
            Response.Write(ex.Message);
        }
        return responseData;
    }

    #endregion
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值