取得搜索引擎综合新闻并分页

本文介绍了一种使用C#从Google和Baidu等搜索引擎抓取新闻内容的方法,并通过过滤HTML和CSS代码来提取关键信息。实现了新闻内容的获取、过滤及翻页功能。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

先看效果:

取得搜索引擎的分类新闻

      左边为GOOGLE的关于“上海”的新闻,右边为Baidu中关于“上海”的新闻。由于GOOGLE的新闻有图片及相关新闻摘要,因此,左边十条,右边列出八十条与之对应。

      目前已初具雏型,使用C#代码编写。

贴出部分核心代码:

------------------------------------------------------

(以下仅做研究之用,勿用于其他目的。)

------------------------------------------------------

 

// INews.cs

using System;
using System.Collections.Generic;
using System.Text;

namespace NewsFromSearchingEngine
{
    public interface INews
    {
        bool IsError { get ; set ; }
        int PageNo { get ; set ; }
        int PerPageCount { get ; set ; }
        int NewsStartNo { get ; set ; }
    }
}


// INewsFromSearching.cs

using System;
using System.Collections.Generic;
using System.Text;

namespace NewsFromSearchingEngine
{
    public interface INewsFromSearching : INews
    {
        string ErrorMsgForGetNews { get ; set ; }
        bool IsErrorForGetNews { get ; set ; }
        string FromUrl { get; set; }
        string SearchKeywords { get ; set ; }
        string StartFilterHtmlCode { get ; set ; }
        string EndFilterHtmlCode { get ; set ; }
        string StartFilterCssCode { get ; set ; }
        string EndFilterCssCode { get ; set ; }
        string StartFilterFooterCode { get ; set ; }
        string EndFilterFooterCode { get ; set ; }
        bool IsTrimFooterEnd { get; set; }
    }
}

// NewsFromASearching.cs

using System;
using System.Collections.Generic;
using System.Text;

namespace NewsFromSearchingEngine
{
    /// <summary>
    /// 从搜索引擎获取新闻的抽象类
    /// </summary>
    public abstract class NewsFromASearching
    {
        /// <summary>
        /// 给定URL和编码方式,从搜索引擎获取新闻
        /// </summary>
        /// <param name="url">获取新闻的URL</param>
        /// <param name="enc">编码方式</param>
        /// <returns>返回新闻内容</returns>
        public virtual string GetNews(string url, Encoding enc)
        {
            return "GetNews(string url, Encoding enc)";
        }

        /// <summary>
        /// 给定URL,从搜索引擎获取新闻(编码方式为:Encoding.Default)
        /// </summary>
        /// <param name="url">获取新闻的URL</param>
        /// <returns>返回新闻内容</returns>
        public virtual string GetNews(string url)
        {
            return "GetNews(string url)";
        }

        /// <summary>
        /// 给定编码方式,从搜索引擎获取新闻(URL另行指定)
        /// </summary>
        /// <param name="enc">编码方式</param>
        /// <returns>返回新闻内容</returns>
        public virtual string GetNews(Encoding enc)
        {
            return "GetNews(Encoding enc)";
        }

        /// <summary>
        /// 过滤指定新闻内容的HTML代码
        /// </summary>
        /// <param name="newsContent">新闻内容</param>
        /// <returns>返回过滤后的HTML代码</returns>
        public virtual string FilterNews(string newsContent)
        {
            return "FilterNews(string newsContent)";
        }

        /// <summary>
        /// 取得指定新闻内容中的CSS相关代码
        /// </summary>
        /// <param name="newsContent">新闻内容</param>
        /// <returns>返回CSS相关代码</returns>
        public virtual string GetCss(string newsContent)
        {
            return "FilterNews(string newsContent)";
        }

        /// <summary>
        /// 取得指定新闻内容的结尾(Footer)代码
        /// </summary>
        /// <param name="newsContent">新闻内容</param>
        /// <returns>返回尾部代码</returns>
        public virtual string FilterFooter(string newsContent)
        {
            return "FilterFooter(string newsContent)";
        }

        /// <summary>
        /// 从内容中去掉从开始内容到结束内容之间的代码
        /// </summary>
        /// <param name="newsContent">内容</param>
        /// <param name="startFilterHtmlCode">开始代码</param>
        /// <param name="endFilterHtmlCode">结束代码</param>
        /// <param name="isTrimEnd">布尔值,是否去掉结束代码?</param>
        /// <returns>返回过滤后的代码</returns>
        public virtual string FilterStartToEnd(string newsContent, string startFilterHtmlCode, string endFilterHtmlCode, bool isTrimEnd)
        {
            return "FilterStartToEnd(string newsContent, string startFilterHtmlCode, string endFilterHtmlCode, bool isTrimEnd)";
        }
    }
}

// NewsFromSearchingEngine.cs

using System;
using System.Web.UI;
using System.Collections.Generic;
using System.Text;
using System.Web;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;

namespace NewsFromSearchingEngine
{
    public class NewsFromSearchingEngine : NewsFromASearching, INewsFromSearching
    {
        string _newsFromSiteName = "google";
        public string NewsFromSiteName
        {
            get { return _newsFromSiteName; }
            set { _newsFromSiteName = value; }
        }

        int _perPageCount = 10;
        /// <summary>
        /// 每页新闻的显示条数
        /// </summary>
        public int PerPageCount
        {
            get { return _perPageCount; }
            set { _perPageCount = value; }
        }

        int _newsStartNo = 0;
        /// <summary>
        /// 新闻开始序号
        /// </summary>
        public int NewsStartNo
        {
            get { return _newsStartNo; }
            set { _newsStartNo = value; }
        }

        string _errorMsgForGetNews = @"<b>Sorry,发生意外错误!</b><br />可能由于网络通讯不畅而暂时无法访问或者参数有误。<br />";
        /// <summary>
        /// 抓取新闻发生错误时,指定的提示性语言
        /// </summary>
        public string ErrorMsgForGetNews
        {
            get { return _errorMsgForGetNews; }
            set { _errorMsgForGetNews = value; }
        }

        bool _isErrorForGetNews = false;
        /// <summary>
        /// 获取或设置抓取新闻时是否出错
        /// </summary>
        public bool IsErrorForGetNews
        {
            get { return _isErrorForGetNews; }
            set { _isErrorForGetNews = value; }
        }

        string _fromUrl = string.Empty;
        /// <summary>
        /// 获取或设置抓取新闻的URL目标地址
        /// </summary>
        public string FromUrl
        {
            get { return _fromUrl; }
            set { _fromUrl = value; }
        }

        string _searchKeywords = "上海";
        /// <summary>
        /// 获取或设置抓取新闻的关键字
        /// </summary>
        public string SearchKeywords
        {
            get { return _searchKeywords; }
            set { _searchKeywords = value; }
        }

        string _startFilterHtmlCode = string.Empty;
        /// <summary>
        /// 目标内容页开始的HTML代码
        /// </summary>
        public string StartFilterHtmlCode
        {
            get { return _startFilterHtmlCode; }
            set { _startFilterHtmlCode = value; }
        }

        string _endFilterHtmlCode = string.Empty;
        /// <summary>
        /// 目标内容页中实质性新闻内容开始的HTML代码。
        /// 一般以此为分界,在此之前的部分为头部内容,需要去掉除CSS相关代码外的其他内容。
        /// 紧接此分界代码后的代码为“实质性新闻内容”,这部分为新闻的核心,是需要保留的内容。
        /// 在GetNews(...)方法后,还需要去掉尾部(Footer)的内容,这样才能最终得到新闻的核心内容部分。
        /// </summary>
        public string EndFilterHtmlCode
        {
            get { return _endFilterHtmlCode; }
            set { _endFilterHtmlCode = value; }
        }

        string _startFilterCssCode = string.Empty;
        /// <summary>
        /// CSS代码开始的特征代码
        /// </summary>
        public string StartFilterCssCode
        {
            get { return _startFilterCssCode; }
            set { _startFilterCssCode = value; }
        }

        string _endFilterCssCode = string.Empty;
        /// <summary>
        /// CSS代码结束的特征代码
        /// </summary>
        public string EndFilterCssCode
        {
            get { return _endFilterCssCode; }
            set { _endFilterCssCode = value; }
        }

        string _startFilterFooterCode = string.Empty;
        /// <summary>
        /// 尾部开始的特征代码(用来去除搜索引擎网页尾部多余代码)
        /// </summary>
        public string StartFilterFooterCode
        {
            get { return _startFilterFooterCode; }
            set { _startFilterFooterCode = value; }
        }

        string _endFilterFooterCode = string.Empty;
        /// <summary>
        /// 尾部结束的特征代码(用来去除搜索引擎网页尾部多余代码)
        /// </summary>
        public string EndFilterFooterCode
        {
            get { return _endFilterFooterCode; }
            set { _endFilterFooterCode = value; }
        }

        bool _isTrimFooterEnd = true;
        /// <summary>
        /// 获取或设置是否去除尾部特征代码
        /// </summary>
        public bool IsTrimFooterEnd
        {
            get { return _isTrimFooterEnd; }
            set { _isTrimFooterEnd = value; }
        }

        bool _isError = false;
        /// <summary>
        /// 获取或设置抓取新闻是否出错。
        /// </summary>
        public bool IsError
        {
            get
            {
                return _isError;
            }
            set
            {
                _isError = value;
            }
        }

        int _pageNo = 1;
        /// <summary>
        /// 获取或设置页码
        /// </summary>
        public int PageNo
        {
            get { return _pageNo; }
            set { _pageNo = value; }
        }

        /// <summary>
        /// 给定URL和编码方式,从搜索引擎获取新闻
        /// </summary>
        /// <param name="url">获取新闻的URL</param>
        /// <param name="enc">编码方式</param>
        /// <returns>返回新闻内容</returns>
        public override string GetNews(string url, Encoding enc)
        {
            string result;
            WebRequest request = WebRequest.Create(url);
            request.ContentType = "application/x-www-form-urlencoded";
            request.Method = "Get";

            try
            {
                WebResponse response = request.GetResponse();
                Stream resStream = response.GetResponseStream();
                StreamReader sr = new StreamReader(resStream, enc);
                result = sr.ReadToEnd();
                resStream.Close();
                sr.Close();

                _isErrorForGetNews = false;
            }
            catch (WebException exc)
            {
                StringBuilder sbError = new StringBuilder();
                sbError.Append(_errorMsgForGetNews);
                sbError.Append(@"<!--");
                sbError.Append(@"所请求的网址是:<br />");
                sbError.Append(url);
                sbError.Append(@"-->");

                _isErrorForGetNews = true;

                result = sbError.ToString();
            }

            return result;
        }

        /// <summary>
        /// 给定URL,从搜索引擎获取新闻(编码方式为:Encoding.Default)
        /// </summary>
        /// <param name="url">获取新闻的URL</param>
        /// <returns>返回新闻内容</returns>
        public override string GetNews(string url)
        {
            return GetNews(url, Encoding.Default);
        }

        /// <summary>
        /// 给定编码方式,从搜索引擎获取新闻(URL另行指定)
        /// </summary>
        /// <param name="enc">编码方式</param>
        /// <returns>返回新闻内容</returns>
        public override string GetNews(Encoding enc)
        {
            if (string.IsNullOrEmpty(this.FromUrl))
            {
                return string.Empty;
            }
            return GetNews(this.FromUrl, enc);
        }

        /// <summary>
        /// 过滤指定新闻内容的HTML代码
        /// </summary>
        /// <param name="newsContent">新闻内容</param>
        /// <returns>返回过滤后的HTML代码</returns>
        public string FilterNews(string newsContent,bool isLinkCss)
        {
            if (_isErrorForGetNews)
            {
                return newsContent;
            }
            string result;
            try
            {
                int startIndex = newsContent.IndexOf(_startFilterHtmlCode);
                int removeLength = newsContent.IndexOf(_endFilterHtmlCode) - startIndex + _endFilterHtmlCode.Length;
                StringBuilder sb = new StringBuilder();
                StringBuilder sbNews = new StringBuilder(newsContent);
                sbNews.Remove(startIndex, removeLength);
                sb.AppendLine();
                sb.Append(GetCss(newsContent, isLinkCss));
                sb.Append(sbNews);
                result = sb.ToString();

                result = FilterFooter(result);

                return result;
            }
            catch (Exception exc)
            {
                result = "<b>抱歉,发生意外错误!</b><br />可能由于网络通讯不畅,暂时无法访问,或参数有误。";
                _isError = true;
                return result;
            }
        }
        /// <summary>
        /// 过滤指定新闻内容的HTML代码
        /// </summary>
        /// <param name="newsContent">新闻内容</param>
        /// <returns>返回过滤后的HTML代码</returns>
        public override string FilterNews(string newsContent)
        {
            return FilterNews(newsContent, false);
        }
        /// <summary>
        /// 取得指定新闻内容中的CSS相关代码
        /// </summary>
        /// <param name="newsContent">新闻内容</param>
        /// <returns>返回CSS相关代码</returns>
        public override string GetCss(string newsContent)
        {
            return GetCss(newsContent, false);
        }

        /// <summary>
        /// 取得指定新闻内容中的CSS相关代码
        /// </summary>
        /// <param name="newsContent">新闻内容</param>
        /// <returns>返回CSS相关代码</returns>
        public string GetCss(string newsContent, bool isLinkCss)
        {
            string result = string.Empty;
            if (isLinkCss)
            {
                //写成Css文件并加上链接
                if (this.NewsFromSiteName.StartsWith("google"))
                {
                    result = @"<link href=""gNews.css"" rel=""stylesheet"" type=""text/css"" />";
                }
                else if (this.NewsFromSiteName.StartsWith("baidu"))
                {
                    result = @"<link href=""bNews.css"" rel=""stylesheet"" type=""text/css"" />";
                }
                else
                {
                    result = string.Empty;
                }
                return result;
            }

            int startIndex = newsContent.IndexOf(_startFilterCssCode);
            int removeLength = newsContent.Length - newsContent.IndexOf(_endFilterCssCode);
            StringBuilder sb = new StringBuilder(newsContent);
            sb.Remove(newsContent.IndexOf(_endFilterCssCode), removeLength);
            sb.Remove(0, startIndex);
            sb.Append(_endFilterCssCode);
            result = sb.ToString();
            //string result = sb.ToString().Replace("h2{font-size:1.34em}", "h2{font-size:1.2em}");

            return result;
        }

        /// <summary>
        /// 取得指定新闻内容的结尾(Footer)代码
        /// </summary>
        /// <param name="newsContent">新闻内容</param>
        /// <returns>返回尾部代码</returns>
        public override string FilterFooter(string newsContent)
        {
            string result = newsContent;
            result = FilterStartToEnd(result, _startFilterFooterCode, _endFilterFooterCode, true);

            return result;
        }

        /// <summary>
        /// 从内容中去掉从开始内容到结束内容之间的代码
        /// </summary>
        /// <param name="newsContent">内容</param>
        /// <param name="startFilterHtmlCode">开始代码</param>
        /// <param name="endFilterHtmlCode">结束代码</param>
        /// <param name="isTrimEnd">布尔值,是否去掉结束代码?</param>
        /// <returns>返回过滤后的代码</returns>
        public override string FilterStartToEnd(string newsContent, string startFilterHtmlCode, string endFilterHtmlCode, bool isTrimEnd)
        {
            int startIndex = newsContent.IndexOf(startFilterHtmlCode);
            int removeLength = newsContent.IndexOf(endFilterHtmlCode) - startIndex;
            if (isTrimEnd)
            {
                removeLength += endFilterHtmlCode.Length;
            }
            StringBuilder sbNews = new StringBuilder(newsContent);
            sbNews.Remove(startIndex, removeLength);

            string result = sbNews.ToString();

            return result;
        }

        /// <summary>
        /// 增加翻页代码
        /// </summary>
        /// <param name="sessionKeywords">搜索关键字(可以保存到Session中)</param>
        /// <param name="perPageNumberDisplay">显示页码数量</param>
        /// <returns>返回翻页代码</returns>
        public string AddPager(string sessionKeywords, int perPageNumberDisplay)
        {
            return AddPager(sessionKeywords, perPageNumberDisplay, 999);
        }

        /// <summary>
        /// 增加翻页代码
        /// </summary>
        /// <param name="sessionKeywords">搜索关键字(可以保存到Session中)</param>
        /// <param name="perPageNumberDisplay">显示页码数量</param>
        /// <returns>返回翻页代码</returns>
        public string AddPager(string sessionKeywords, int perPageNumberDisplay, int maxPageNumber)
        {
            string currentUrl = VirtualPathUtility.GetFileName(System.Web.HttpContext.Current.Request.FilePath.ToString()).ToString();
            StringBuilder sbPager = new StringBuilder();
            sbPager.Append(@"<div id=""pageNum"" style=""margin-top:16px;"">");
            if (_pageNo > 1)
            {
                sbPager.Append(@"<a href=""" + currentUrl + "?pn=1&kw=" + sessionKeywords + @""">首页</a> &nbsp; ");
            }
            if (_pageNo > 2)
            {
                sbPager.Append(@"<a href=""" + currentUrl + "?pn=");
                sbPager.Append((_pageNo - 1).ToString());
                sbPager.Append(@"&kw=" + sessionKeywords + @""">上一页</a> &nbsp; ");
            }

            for (int i = 0; i < perPageNumberDisplay; i++)
            {
                int tmpPageNo = _pageNo + i;
                if (maxPageNumber < tmpPageNo) break;

                if (tmpPageNo == _pageNo)
                {
                    sbPager.Append("<font color=red size+><b>");
                    sbPager.Append((_pageNo + i).ToString() + "</b></font> &nbsp; ");
                }
                else
                {
                    if (_pageNo < 5)
                    {
                        sbPager.Append(@"<a href=""" + currentUrl + "?pn=");
                        sbPager.Append((_pageNo + i).ToString());
                        sbPager.Append(@"&kw=" + sessionKeywords + @""">[" + (_pageNo + i).ToString() + "]</a> &nbsp; ");
                    }
                    else
                    {
                        sbPager.Append(@"<a href=""" + currentUrl + "?pn=");
                        sbPager.Append((_pageNo + i).ToString());
                        sbPager.Append(@"&kw=" + sessionKeywords + @""">[" + (_pageNo + i).ToString() + "]</a> &nbsp; ");
                    }
                }
            }
            sbPager.Append(@"</div>");

            return sbPager.ToString();
        }
    }
}

 

OK!

 

评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值