C# 采集

本文介绍了一个简单的网页爬虫实现过程,包括使用C#进行页面抓取、正则表达式匹配内容、多线程处理及数据解析入库等步骤,展示了如何自动化抓取网页信息并进行分析。

using System;
using System.Collections;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;
using System.Threading;
using System.Windows.Forms;

namespace WindowsApplication1
{
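    /// <summary>
    /// Main form of the collector: fetches a list page, extracts article links and
    /// titles from it, then downloads each article and posts it to a local endpoint.
    /// </summary>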
    public class Form1 : System.Windows.Forms.Form
    {
        private System.Windows.Forms.TextBox HttpUrl;
        private System.Windows.Forms.Button button1;
        private System.Windows.Forms.TextBox textBox2;
        private System.Windows.Forms.Button button2;
        private System.Windows.Forms.ListBox listBox1;
        private System.Windows.Forms.ListBox listBox2;
        private System.Windows.Forms.TextBox textBox1;
        private System.Windows.Forms.Button button3;
        private System.Windows.Forms.Button button4;
        private System.Windows.Forms.ProgressBar progressBar1;
        private System.Windows.Forms.TextBox textBox3;
        private System.Windows.Forms.TextBox textBox4;
        private System.Windows.Forms.TextBox textBox5;
        /// <summary>
        /// Required designer variable.
        /// </summary>
        private System.ComponentModel.Container components = null;

        public Form1()
        {
            //
            // Windows 窗体设计器支持所必需的
            //
            InitializeComponent();

            //
            // TODO: 在 InitializeComponent 调用后添加任何构造函数代m
            //
        }

        /**//// <summary>
        /// 清理所有正在使用的资源。
        /// </summary>
        protected override void Dispose( bool disposing )
        {
            if( disposing )
            {
                if (components != null)
                {
                    components.Dispose();
                }
            }
            base.Dispose( disposing );
        }

        Windows 窗体设计器生成的代m#region Windows 窗体设计器生成的代m
        /**//// <summary>
        /// 设计器支持所需的方法 - 不要使用代m编辑器修改
        /// 此方法的内容。
        /// </summary>
        private void InitializeComponent()
        {
            this.HttpUrl = new System.Windows.Forms.TextBox();
            this.button1 = new System.Windows.Forms.Button();
            this.textBox2 = new System.Windows.Forms.TextBox();
            this.button2 = new System.Windows.Forms.Button();
            this.listBox1 = new System.Windows.Forms.ListBox();
            this.listBox2 = new System.Windows.Forms.ListBox();
            this.textBox1 = new System.Windows.Forms.TextBox();
            this.button3 = new System.Windows.Forms.Button();
            this.button4 = new System.Windows.Forms.Button();
            this.progressBar1 = new System.Windows.Forms.ProgressBar();
            this.textBox3 = new System.Windows.Forms.TextBox();
            this.textBox4 = new System.Windows.Forms.TextBox();
            this.textBox5 = new System.Windows.Forms.TextBox();
            this.SuspendLayout();
            //
            // HttpUrl
            //
            this.HttpUrl.Location = new System.Drawing.Point(16, 16);
            this.HttpUrl.Name = "HttpUrl";
            this.HttpUrl.Size = new System.Drawing.Size(280, 21);
            this.HttpUrl.TabIndex = 0;
            this.HttpUrl.Text = "http://www.playasp.com/article/22/ArticleList22_1.html";
            this.HttpUrl.TextChanged += new System.EventHandler(this.textBox1_TextChanged);
            //
            // button1
            //
            this.button1.Location = new System.Drawing.Point(312, 16);
            this.button1.Name = "button1";
            this.button1.TabIndex = 1;
            this.button1.Text = "读取网站";
            this.button1.Click += new System.EventHandler(this.button1_Click);
            //
            // textBox2
            //
            this.textBox2.Location = new System.Drawing.Point(16, 56);
            this.textBox2.Multiline = true;
            this.textBox2.Name = "textBox2";
            this.textBox2.Size = new System.Drawing.Size(424, 80);
            this.textBox2.TabIndex = 2;
            this.textBox2.Text = "textBox2";
            //
            // button2
            //
            this.button2.Location = new System.Drawing.Point(24, 288);
            this.button2.Name = "button2";
            this.button2.TabIndex = 4;
            this.button2.Text = "配列表";
            this.button2.Click += new System.EventHandler(this.button2_Click);
            //
            // listBox1
            //
            this.listBox1.ItemHeight = 12;
            this.listBox1.Location = new System.Drawing.Point(16, 144);
            this.listBox1.Name = "listBox1";
            this.listBox1.Size = new System.Drawing.Size(424, 64);
            this.listBox1.TabIndex = 5;
            //
            // listBox2
            //
            this.listBox2.ItemHeight = 12;
            this.listBox2.Location = new System.Drawing.Point(16, 216);
            this.listBox2.Name = "listBox2";
            this.listBox2.Size = new System.Drawing.Size(424, 64);
            this.listBox2.TabIndex = 6;
            //
            // textBox1
            //
            this.textBox1.Location = new System.Drawing.Point(16, 328);
            this.textBox1.Multiline = true;
            this.textBox1.Name = "textBox1";
            this.textBox1.Size = new System.Drawing.Size(424, 96);
            this.textBox1.TabIndex = 7;
            this.textBox1.Text = "textBox1";
            //
            // button3
            //
            this.button3.Location = new System.Drawing.Point(240, 520);
            this.button3.Name = "button3";
            this.button3.TabIndex = 8;
            this.button3.Text = "读取内容";
            this.button3.Click += new System.EventHandler(this.button3_Click);
            //
            // button4
            //
            this.button4.Location = new System.Drawing.Point(360, 520);
            this.button4.Name = "button4";
            this.button4.TabIndex = 9;
            this.button4.Text = "清空数据";
            this.button4.Click += new System.EventHandler(this.button4_Click);
            //
            // progressBar1
            //
            this.progressBar1.Location = new System.Drawing.Point(16, 488);
            this.progressBar1.Name = "progressBar1";
            this.progressBar1.Size = new System.Drawing.Size(416, 23);
            this.progressBar1.TabIndex = 10;
            this.progressBar1.Click += new System.EventHandler(this.progressBar1_Click);
            //
            // textBox3
            //
            this.textBox3.Location = new System.Drawing.Point(120, 288);
            this.textBox3.Name = "textBox3";
            this.textBox3.Size = new System.Drawing.Size(320, 21);
            this.textBox3.TabIndex = 11;
            this.textBox3.Text = "textBox3";
            //
            // textBox4
            //
            this.textBox4.Location = new System.Drawing.Point(16, 440);
            this.textBox4.Multiline = true;
            this.textBox4.Name = "textBox4";
            this.textBox4.Size = new System.Drawing.Size(416, 40);
            this.textBox4.TabIndex = 12;
            this.textBox4.Text = "textBox4";
            //
            // textBox5
            //
            this.textBox5.Location = new System.Drawing.Point(472, 64);
            this.textBox5.Multiline = true;
            this.textBox5.Name = "textBox5";
            this.textBox5.Size = new System.Drawing.Size(232, 448);
            this.textBox5.TabIndex = 13;
            this.textBox5.Text = "textBox5";
            //
            // Form1
            //
            this.AutoScaleBaseSize = new System.Drawing.Size(6, 14);
            this.ClientSize = new System.Drawing.Size(712, 549);
            this.Controls.Add(this.textBox5);
            this.Controls.Add(this.textBox4);
            this.Controls.Add(this.textBox3);
            this.Controls.Add(this.progressBar1);
            this.Controls.Add(this.button4);
            this.Controls.Add(this.button3);
            this.Controls.Add(this.textBox1);
            this.Controls.Add(this.listBox2);
            this.Controls.Add(this.listBox1);
            this.Controls.Add(this.button2);
            this.Controls.Add(this.textBox2);
            this.Controls.Add(this.button1);
            this.Controls.Add(this.HttpUrl);
            this.Name = "Form1";
            this.Text = "Form1";
            this.ResumeLayout(false);

        }
        #endregion

        /**//// <summary>
        /// 应用程序的主入口点。
        /// </summary>
        [STAThread]
        static void Main()
        {
            Application.Run(new Form1());
        }

        // TextChanged handler attached to the HttpUrl box by InitializeComponent.
        // It has no body; it exists only because the designer wiring references it.
        private void textBox1_TextChanged(object sender, System.EventArgs e)
        {
       
        }

   
        private void button1_Click(object sender, System.EventArgs e)
        {
            string get_url=HttpUrl.Text;
            HttpWebRequest MyRequest = (HttpWebRequest)WebRequest.Create(get_url);
            HttpWebResponse MyResponse = (HttpWebResponse)MyRequest.GetResponse();
            Stream MyInStream = null;
            MyInStream = MyResponse.GetResponseStream();
            long fileSizeInBytes = MyResponse.ContentLength;
            //创建文件流对象

            int length = 10240;
            byte[] buffer = new byte[10250];
            int bytesread = 0;
            string strtemp = "";
            while ((bytesread = MyInStream.Read(buffer, 0, length)) > 0)
            {    //把数据写入文件

                strtemp += System.Text.Encoding.Default.GetString(buffer, 0, bytesread);
            }

            textBox2.Text=strtemp;
        }

        private void button2_Click(object sender, System.EventArgs e)
        {
            string Match_Url="http://www.playasp.com/article/22/Article"+@"/d{4}_/d{1}.html";

            Regex re = new Regex(Match_Url);
            MatchCollection matches = re.Matches(textBox2.Text);
            System.Collections.IEnumerator enu = matches.GetEnumerator();
                         int j=0;       
           
           
            while (enu.MoveNext() && enu.Current != null)
            {
                Match match = (Match)(enu.Current);
                int kg = 0;
                for (int i = 1; i < listBox1.Items.Count; i++)
                {
                        string mytemp=listBox1.Items[i].ToString();
                    if (match.Value==mytemp )
                    {
                        kg = 1;
                        break;
                    }

                }
   
                if (kg == 0)
                {
                   
                    j++;
                    listBox1.Items.Add(match.Value);
                    Regex rr = new Regex(match.Value + "[^<]+>+([^<]*)</a>", RegexOptions.IgnoreCase);
                    Match mm = rr.Match(textBox2.Text);
                    while (mm.Success)
                    {
                        Group g11 = mm.Groups[1];
                        string mynewstitle = g11.ToString().Trim();
                        mm = mm.NextMatch();
                        listBox2.Items.Add(j.ToString()+";"+mynewstitle);
                    }

                }
            }
   
        int mycount = listBox1.Items.Count ;
        MessageBox.Show("链接地址提取" + mycount + "成功!", "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information);

        }

        private void button3_Click(object sender, System.EventArgs e)
        {
            //listBox1.Items.Clear();
            //listBox2.Items.Clear();
            Thread t = new Thread(new ThreadStart(geturl));
            t.Start();
        }

        private void button4_Click(object sender, System.EventArgs e)
        {
            textBox1.Text="";
            textBox2.Text="";
            listBox1.Items.Clear();
            listBox2.Items.Clear();

        }

 

            int myint;
        string newstitle="";
        string newcontent;

        public void geturl()//新闻入库
        {

           
            this.progressBar1.Maximum=listBox1.Items.Count;
            for (int i = 0; i < listBox1.Items.Count; i++)

            {
                textBox1.Text = listBox1.Items[i].ToString();
                string URL = this.textBox1.Text.Trim();
                //加"http://"标志
                if (URL.IndexOf(@"http://") == -1)
                {
                    URL = @"http://" + URL;
                }
                HttpWebRequest MyRequest = (HttpWebRequest)WebRequest.Create(URL);
                //发送请求,获取响应
                Stream MyInStream = null;
                try
                {
                    HttpWebResponse MyResponse = (HttpWebResponse)MyRequest.GetResponse();             
                 
                    MyInStream = MyResponse.GetResponseStream();
                    long fileSizeInBytes = MyResponse.ContentLength;
                    //创建文件流对象

                    int length = 10240;
                    byte[] buffer = new byte[10250];
                    int bytesread = 0;
                    string strtemp = "";
                    //从网络读取数据
                    while ((bytesread = MyInStream.Read(buffer, 0, length)) > 0)
                    {    //把数据写入文件

                        strtemp += System.Text.Encoding.Default.GetString(buffer, 0, bytesread);
                        textBox1.Text = strtemp;
                    }
               
                }
                catch (Exception Err)
                {
                    MessageBox.Show("读取网页失败!错误是:" + Err.Message, "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information);
                }
                finally
                {
                    //关闭流
                    if (MyInStream != null)
                    {
                        MyInStream.Close();
                    }

                }
                //分析所抓的数据
                //分析标题
   
                Regex rr = new Regex("<title>([^<]*)</title>", RegexOptions.IgnoreCase);
                Match mm = rr.Match(textBox1.Text.Replace("<title>'+document.title+'</title>",""));
                while (mm.Success)
                {
                    Group g11 = mm.Groups[1];
                    newstitle = g11.ToString().Trim();
                    mm = mm.NextMatch();
                    newstitle=newstitle.Replace("、", "");
                    newstitle = newstitle.Replace("、", "");
                    newstitle = newstitle.Replace(@"""", """);

                    textBox3.Text = newstitle;


                }
                //MessageBox.Show("到这里来", "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information);
               
                try
                {
                    string mystring = textBox1.Text;
                     myint = mystring.IndexOf(@"相关文章") - mystring.IndexOf(@"<TD vAlign=top>");
                    newcontent = mystring.Substring(mystring.IndexOf(@"<TD vAlign=top>"), myint);

                    textBox4.Text=newcontent;
                }
                catch
                {
                    string mystring = textBox1.Text;
                    textBox4.Text=Convert.ToString(mystring.IndexOf(@"</TD></TR></TBODY></TABLE>")+"+"+mystring.IndexOf(@"<TD vAlign=top>"));
                }
                finally
                {

                }

 

                //分析完,开始提交数据


                //ASCIIEncoding encoding = new ASCIIEncoding();
                //if(newcontent=="error")
                //{
                //    newstitle = "error";
                          
                //}
                string postData = "newstitle="+newstitle;
                postData += "&newscontent="+newcontent;
               
                byte[] data = System.Text.Encoding.GetEncoding("GB2312").GetBytes(postData);


                // Prepare web request
                HttpWebRequest myRequest = (HttpWebRequest)WebRequest.Create("http://localhost/5jiuye/post.asp");

                myRequest.Method = "POST";
                myRequest.ContentType = "application/x-www-form-urlencoded";
                myRequest.ContentLength = data.Length;
                Stream newStream = myRequest.GetRequestStream();

                // Send the data.
                newStream.Write(data, 0, data.Length);
                newStream.Close();

                // Get response
               
                         
                      
                try
                {
                    HttpWebResponse myResponse = (HttpWebResponse)myRequest.GetResponse();
                    StreamReader reader = new StreamReader(myResponse.GetResponseStream(),System.Text.Encoding.Default);
                    string content = reader.ReadToEnd();
                    textBox5.Text = content;
                }
                catch (Exception Err)
                {
                    textBox1.Text = "错误" + Err;
                }

                textBox2.Text = i.ToString();
                this.progressBar1.Value = this.progressBar1.Value + 1;
                //提交完成
                if (this.progressBar1.Value == listBox1.Items.Count)
                {
                    MessageBox.Show("完成", "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information);
                }

 

           
            }       

        }

        // Click handler attached to progressBar1 by InitializeComponent.
        // It has no body; it exists only because the designer wiring references it.
        private void progressBar1_Click(object sender, System.EventArgs e)
        {
       
        }


}
}

简单C#信息采集工具实现 http://blog.youkuaiyun.com/xiaoxiao108/archive/2011/06/01/6458367.aspx 最近想整只爬虫玩玩,顺便熟悉下正则表达式。 开发环境 vs2008 sql2000 实现方法如下 1.先抓取网页代码 2.通过正则匹配出你需要的内容 比如http://www.soso.com/q?w=%C4%E3%BA%C3&pg=1 页面中 搜索结果的标题跟连接地址。具体可以根据你的需要填写合适的地址跟正则。 3.把匹配出的内容保存到数据库中。对其中的数据可以根据需要自己进行处理 具体实现代码 1.读取网页的代码 public static string GetDataFromUrl(string url) { string str = string.Empty; HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url); //设置Http头; request.AllowAutoRedirect = true; request.AllowWriteStreamBuffering = true; request.Referer = ""; request.Timeout = 10 * 1000; //request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)"; HttpWebResponse response = null; try { response = (HttpWebResponse)request.GetResponse(); if (response.StatusCode == HttpStatusCode.OK) { //根据http应答头来判别编码 string Characterset = response.CharacterSet; Encoding encode; if (Characterset != "") { if (Characterset == "ISO-8859-1") { Characterset = "gb2312"; } encode = Encoding.GetEncoding(Characterset); } else { encode = Encoding.Default; } //声明一个内存流来贮存http应答流 Stream Receivestream = response.GetResponseStream(); MemoryStream mstream = new MemoryStream(); byte[] bf = new byte[255]; int count = Receivestream.Read(bf, 0, 255); while (count > 0) { mstream.Write(bf, 0, count); count = Receivestream.Read(bf, 0, 255); } Receivestream.Close(); mstream.Seek(0, SeekOrigin.Begin); //从内存流里读取字符串这里涉及到了编码方案 StreamReader reader = new StreamReader(mstream, encode); char[] buf = new char[1024]; count = reader.Read(buf, 0, 1024); while (count > 0) { str += new string(buf, 0, 1024); count = reader.Read(buf, 0, 1024); } reader.Close(); mstream.Close(); } } catch (Exception ex) { GetDataFromUrl(url); } finally { if (response != null) response.Close(); } return str; } 2.正则匹配的代码 public static ArrayList GetString(string reg, string content) { Regex r = new Regex(reg, RegexOptions.Compiled); MatchCollection matches = 
r.Matches(content); ArrayList a = new ArrayList(); foreach (Match m in matches) { string[] arr = new string[10]; arr[0] = m.Groups[1].Value; arr[1] = m.Groups[2].Value; arr[2] = m.Groups[3].Value; arr[3] = m.Groups[4].Value; arr[4] = m.Groups[5].Value; arr[5] = m.Groups[6].Value; arr[6] = m.Groups[7].Value; arr[7] = m.Groups[8].Value; arr[8] = m.Groups[9].Value; arr[9] = m.Groups[10].Value; a.Add(arr); } return a; } 3.如果抓取的页面很多 ,可以把多线程跟队列应用过来,提高抓取效率 Queue numbers = new Queue(); const int MaxCount = 5;//同时运行的最多线程数 private static object _lock = new object(); private void Test() { while (true) { int i = 0; lock (_lock) { if (numbers.Count == 0) { flag = false; return; } i = numbers.Dequeue(); } f(i); } } void Ssss() { for (int i = 1; i <= 100; i++)//处理的页面参数 从http://www.soso.com/q?w=你好&pg=1 到http://www.soso.com/q?w=你好&pg=100 { numbers.Enqueue(i); } for (int i = 0; i < MaxCount; i++) { Thread thread = new Thread(new ThreadStart(Test)); thread.Name = "T" + i.ToString(); thread.Start(); } } private void f(int num) { string str = ClassLibrary1.Class1.GetDataFromUrl("http://www.soso.com/q?w=%C4%E3%BA%C3&pg="+num); string reg = "]+? target=\"_blank\">([\\s\\S]+?)"; ArrayList a = ClassLibrary1.Class1.GetString(reg, str); for (int i = 0; i ] 除了>以为的字符 [\u4e00-\u9fa5] 汉字 6.代码只是实现了信息采集的主要功能,根据你自己的需要更换采集页面,跟合适的正则表达式后,可以根据你的需要自动进行采集,对采集到的数据,再根据你的需要自己进行处理。 7.数据库操作部分用的3层代码生成器连接地址 在 app.config中 如果你发现有什么不合理的,需要改进的地方,联系328452421@qq.com 朱晓 。相互交流 谢谢 顺便问下 有家是新泰的没,搞软件开发 地
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值