采集新闻
方便扩展
存储到xml
步骤:
1、找对象 文章视为对象 每一个网站视为对象
2、为了方便扩展做类似于计算器的操作
把采集的网站视为对象,所有的网站都能够采集 和保存成xml
所以抽象出父类WebSite 抽象类
实现具体的子类cnbeta sina等
3、WebSite 抽象类 { Name(网站名字,只读);Path(xml保存路径);Url(采集的url);抽象方法:Load(采集新闻)、Save(把新闻保存到xml中) }
4、cnbeta、donews 等具体子类继承WebSite{ }
5、窗体加载时候根据反射读取每个继承自WebSite的子类的名字,添加到下拉框中
6、点采集按钮的时候,根据下拉框中的内容创建具体的子类,执行采集方法
7、点保存按钮的时候 把采集到的新闻集合,存储在xml中
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
namespace Caiji
{
/// <summary>
/// Base type for a news site that can be scraped and whose articles can be
/// written to an XML file. Concrete sites derive from this class and supply
/// the scraping and saving logic.
/// </summary>
abstract class WebSite
{
    /// <summary>
    /// Address the articles are scraped from.
    /// </summary>
    public string Url { get; set; }

    /// <summary>
    /// Location of the XML file the scraped articles are saved to.
    /// </summary>
    public string Path { get; set; }

    /// <summary>
    /// Name of the site. Read-only; every concrete site provides its own value.
    /// </summary>
    public abstract string Name { get; }

    /// <summary>
    /// Scrapes the site.
    /// </summary>
    /// <returns>The articles that were collected.</returns>
    public abstract List<Article> Load();

    /// <summary>
    /// Persists the collected articles to XML.
    /// </summary>
    public abstract void Save();
}
}
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
namespace Caiji
{
/// <summary>
/// Plain data holder for one scraped news article.
/// </summary>
class Article
{
    /// <summary>
    /// Author of the article.
    /// </summary>
    public string Author { get; set; }

    /// <summary>
    /// Headline of the article.
    /// </summary>
    public string Title { get; set; }

    /// <summary>
    /// Body text of the article.
    /// </summary>
    public string Content { get; set; }

    /// <summary>
    /// Date and time attached to the article.
    /// </summary>
    public DateTime SendTime { get; set; }
}
}
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net;
using System.Text.RegularExpressions;
using System.IO;
using System.Xml;
namespace Caiji
{
/// <summary>
/// <see cref="WebSite"/> implementation for cnBeta: downloads the page at
/// <c>Url</c>, extracts the articles with a regular expression and appends
/// them to the XML file at <c>Path</c>.
/// </summary>
class CnBeta:WebSite
{
    // One match per article; captures the named groups title, author, time and content.
    // Singleline lets '.' span line breaks because each article covers several lines of HTML.
    private static readonly Regex ArticleRegex = new Regex(
        @"<div\s+class=""title"">.+?[>](?<title>.+?)</a>.+?<div\s+class=""tj"">.+?<span>(?<author>.+?)发布于<em>(?<time>\d{4}\-\d{2}\-\d{2}\s+\d{2}:\d{2}:\d{2}).+?<strong>(?<content>.+?)</p>",
        RegexOptions.Singleline);

    // Matches an HTML tag so it can be stripped from the article body.
    private static readonly Regex HtmlTagRegex = new Regex("<.+?>");

    // Articles produced by the most recent Load(); this is what Save() writes.
    private List<Article> articles = new List<Article>();

    /// <summary>
    /// Name of this site: always "cnbeta".
    /// </summary>
    public override string Name
    {
        get
        {
            return "cnbeta";
        }
    }

    /// <summary>
    /// Downloads the page at <c>Url</c> (decoded as UTF-8) and converts every
    /// article found in it into an <see cref="Article"/>.
    /// </summary>
    /// <returns>
    /// The articles found by this call. Each call starts from a fresh list, so
    /// loading twice does not duplicate articles in the result or in a later
    /// <see cref="Save"/>.
    /// </returns>
    public override List<Article> Load()
    {
        string html;
        // WebClient is IDisposable; release it as soon as the download is done.
        using (WebClient client = new WebClient())
        {
            client.Encoding = Encoding.UTF8;
            html = client.DownloadString(Url);
        }

        List<Article> parsed = new List<Article>();
        // Every Match yielded by a MatchCollection is successful, so no Success check is needed.
        foreach (Match match in ArticleRegex.Matches(html))
        {
            Article article = new Article();
            article.Title = match.Groups["title"].Value;
            article.Author = match.Groups["author"].Value;
            // Strip the HTML tags embedded in the body text.
            article.Content = HtmlTagRegex.Replace(match.Groups["content"].Value, "");
            // The timestamp is a fixed yyyy-MM-dd HH:mm:ss pattern taken from the page,
            // so parse it independently of the user's regional settings.
            article.SendTime = DateTime.Parse(
                match.Groups["time"].Value,
                System.Globalization.CultureInfo.InvariantCulture);
            parsed.Add(article);
        }

        // Replace rather than append: the field must mirror the latest scrape only.
        articles = parsed;
        return articles;
    }

    /// <summary>
    /// Writes the articles from the last <see cref="Load"/> to the XML file at
    /// <c>Path</c>. When the file does not exist a new document with a
    /// <c>News</c> root is created; otherwise the articles are appended to the
    /// root element of the existing document.
    /// </summary>
    public override void Save()
    {
        XmlDocument doc = new XmlDocument();
        XmlElement root;

        if (File.Exists(base.Path))
        {
            doc.Load(base.Path);
            root = doc.DocumentElement;
        }
        else
        {
            doc.AppendChild(doc.CreateXmlDeclaration("1.0", "utf-8", null));
            root = doc.CreateElement("News");
            doc.AppendChild(root);
        }

        AppendArticles(doc, root);
        doc.Save(base.Path);
    }

    // Adds one <New> element (Title, Author, Time, Content children) per loaded article under root.
    private void AppendArticles(XmlDocument doc, XmlElement root)
    {
        foreach (Article article in articles)
        {
            XmlElement item = doc.CreateElement("New");
            root.AppendChild(item);
            CreateNode(doc, article.Title, item, "Title");
            CreateNode(doc, article.Author, item, "Author");
            // NOTE(review): ToString() formats with the current culture; kept as-is so
            // newly written entries match the format of entries already in existing files.
            CreateNode(doc, article.SendTime.ToString(), item, "Time");
            CreateNode(doc, article.Content, item, "Content");
        }
    }

    // Appends a child element named `element` with text `content` to `nw`.
    private static void CreateNode(XmlDocument doc, string content, XmlElement nw, string element)
    {
        XmlElement child = doc.CreateElement(element);
        child.InnerText = content;
        nw.AppendChild(child);
    }
}
}
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
namespace Caiji
{
/// <summary>
/// Creates preconfigured <see cref="WebSite"/> instances from a site name.
/// </summary>
class Factory
{
    /// <summary>
    /// Builds the site that corresponds to <paramref name="type"/> and sets its
    /// <c>Path</c> and <c>Url</c>.
    /// </summary>
    /// <param name="type">
    /// Site name, matched case-insensitively ("cnbeta" or "donews").
    /// </param>
    /// <returns>
    /// The configured site, or <c>null</c> when <paramref name="type"/> is null
    /// or does not name a known site.
    /// </returns>
    public static WebSite CreateInstance(string type)
    {
        WebSite site = null;

        // Ordinal, case-insensitive comparison: site names are identifiers, so the
        // match must not depend on the current culture's casing rules, and a null
        // name simply matches nothing instead of throwing.
        if (string.Equals(type, "cnbeta", StringComparison.OrdinalIgnoreCase))
        {
            site = new CnBeta();
            site.Path = "cb.xml";
            site.Url = "cnBeta.COM_中文业界资讯站.htm";
        }
        else if (string.Equals(type, "donews", StringComparison.OrdinalIgnoreCase))
        {
            site = new Donews();
            site.Path = "donews.xml";
            site.Url = "DoNews-IT.htm";
        }

        return site;
    }
}
}
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.Reflection;
namespace Caiji
{
/// <summary>
/// Main window: lists the available sites in a combo box, scrapes the selected
/// site into the list view and saves the scraped articles to XML.
/// </summary>
public partial class Form1 : Form
{
    // Site created by the most recent successful scrape; null until the first one.
    WebSite site;

    public Form1()
    {
        InitializeComponent();
    }

    /// <summary>
    /// Fills the combo box with the Name of every concrete WebSite subclass
    /// found in this form's assembly.
    /// </summary>
    private void Form1_Load(object sender, EventArgs e)
    {
        Assembly assembly = this.GetType().Assembly;
        foreach (Type type in assembly.GetTypes())
        {
            if (!type.IsSubclassOf(typeof(WebSite)) || type.IsAbstract)
            {
                continue;
            }

            // Activator.CreateInstance(Type) needs a public parameterless constructor;
            // skip subclasses without one instead of failing while the form loads.
            if (type.GetConstructor(Type.EmptyTypes) == null)
            {
                continue;
            }

            // An instance is required only to read its Name.
            WebSite instance = (WebSite)Activator.CreateInstance(type);
            comboBox1.Items.Add(instance.Name);
        }
    }

    /// <summary>
    /// Creates the site named in the combo box, scrapes it and shows the
    /// title, author and time of every article in the list view.
    /// </summary>
    private void btnLoad_Click(object sender, EventArgs e)
    {
        // The factory returns null for an empty or unknown name; bail out
        // before touching the current state rather than dereferencing null.
        WebSite selected = Factory.CreateInstance(comboBox1.Text);
        if (selected == null)
        {
            MessageBox.Show("请先选择要采集的网站!");
            return;
        }

        listView1.Items.Clear();
        site = selected;
        List<Article> list = site.Load();

        // Suspend repainting while the rows are added.
        listView1.BeginUpdate();
        try
        {
            foreach (Article item in list)
            {
                ListViewItem row = new ListViewItem(item.Title);
                row.SubItems.Add(item.Author);
                row.SubItems.Add(item.SendTime.ToString());
                listView1.Items.Add(row);
            }
        }
        finally
        {
            listView1.EndUpdate();
        }
    }

    /// <summary>
    /// Saves the articles of the last scraped site to XML.
    /// </summary>
    private void btnSave_Click(object sender, EventArgs e)
    {
        // Nothing has been scraped yet, so there is nothing to save.
        if (site == null)
        {
            MessageBox.Show("请先采集新闻再保存!");
            return;
        }

        site.Save();
        MessageBox.Show("保存成功!");
    }

    /// <summary>
    /// Shows the text of the first column (the title) of the selected row.
    /// </summary>
    private void listView1_DoubleClick(object sender, EventArgs e)
    {
        if (listView1.SelectedItems.Count > 0)
        {
            MessageBox.Show(listView1.SelectedItems[0].SubItems[0].Text);
        }
    }

    // Intentionally empty handler, kept so any existing event subscription still compiles.
    private void listView1_SelectedIndexChanged(object sender, EventArgs e)
    {
    }
}
}