C#程序设计之多线程爬虫程序

最新推荐文章于 2025-07-08 10:47:27 发布

马衍硕

最新推荐文章于 2025-07-08 10:47:27 发布

阅读量4.5k

点赞数 2

CC 4.0 BY-SA版权

分类专栏： c# 文章标签：多线程爬虫正则表达式开发语言设计

本文链接：https://blog.youkuaiyun.com/mmayanshuo/article/details/78511330

c# 专栏收录该内容

1 篇文章

订阅专栏

一、简单介绍：
技术方面主要包括：
（1）技术选型：
1）课程设计使用的开发语言是C#。
2）课程设计选用了文件流方式获取网站数据。
3）课程设计使用多线程抓取网页代码。
4）课程设计使用了正则表达式对源码进行解析处理。

(2)程序运行流程：
通过图示可以更形象地了解程序运行的整个流程：
1）程序首先下载网站首页的源代码
2）对首页源代码进行分析，提取出网站建设类目下的链接并存储到队列中。
3）运用多线程，分别同时下载队列中的链接。
4）利用正则表达式对下载的链接源码进行分析。提取图片的URL并下载图片。截取需要的文本信息。
5）保存下载的文本和图片。把下载分析链接的信息显示在操作界面上。

程序的整体流程：
这里写图片描述

二、再贴代码：

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;
using System.Collections;
using System.Threading;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using System.Diagnostics;

namespace crawlWebsiteAndExtractInfo
{

    /// <summary>
    /// Main window of the crawler. It downloads a start page, extracts the article
    /// links found in it, and then processes those links on several worker threads:
    /// each worker downloads one article, saves its images and text to disk, and
    /// appends a progress line to the result box.
    /// </summary>
    /// <remarks>
    /// The controls referenced here (txbUrlToCrawl, rtbExtractedHtml, richTextBox1,
    /// textBox1) and InitializeComponent are declared in the other part of this
    /// partial class.
    /// </remarks>
    public partial class frmCrawlWebsite : Form
    {
        // Root of the crawled site; article links and image paths are resolved against it.
        private const string SiteRoot = "http://www.hyzbi.com";

        // Encoding assumed for every downloaded page.
        private const string PageCharset = "utf-8";

        // Output locations for extracted text and downloaded images.
        private const string TextOutputPath = "e:/b.txt";
        private const string ImageOutputDir = @"e:/c/";

        // Regex instances are immutable and safe to share between threads, so they are
        // built once instead of on every call.
        private static readonly Regex ArticleLinkRegex = new Regex(@"/portal/article/index/cid/19/id/(\d+)");
        private static readonly Regex ImageSrcRegex = new Regex(@"src\s*=\s*[""']?([^'"" >]+?)[ '""][^>]*?>");
        private static readonly Regex TitleRegex = new Regex(@"<h2>(.*)</h2>");
        private static readonly Regex ParagraphRegex = new Regex(@"<p.*>(<span .*>)?(.*)(</span>)?</p>");
        private static readonly Regex TagRegex = new Regex("<.+?>", RegexOptions.IgnoreCase);

        // Pending article URLs. Every access must hold locker1.
        public static Queue<String> q = new Queue<string>();
        // Image URLs already claimed for download (used to skip duplicates). Guarded by locker3.
        public static string[] surl = new string[100];
        // Number of entries used in surl; also the number of the most recent image file. Guarded by locker3.
        public static int a = 0;
        // Number of URLs taken from the queue so far. Guarded by locker1.
        public static int j = 0;
        // Not used by the code in this part of the class; kept for compatibility.
        public static bool b = false;
        // Serializes appends to the text output file.
        public static object locker = new object();
        // Guards q and j.
        public static object locker1 = new object();
        // No longer taken by this class (Control.Invoke already serializes UI updates); kept for compatibility.
        public static object locker2 = new object();
        // Guards surl and a.
        public static object locker3 = new object();
        // Started when the workers are launched; nothing in this part of the class stops or reads it.
        public static Stopwatch watch = new Stopwatch();
        // Not used by the code in this part of the class; kept for compatibility.
        public static bool flag=true;
        // Last raw text entered for the worker count.
        public static string textbox = string.Empty;
        // Number of worker threads started by btnExtractInfo_Click.
        public int Num=5;

        public frmCrawlWebsite()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Downloads the page whose address is typed into txbUrlToCrawl and shows its
        /// HTML in rtbExtractedHtml. Failures are reported in a message box instead of
        /// crashing the UI thread.
        /// </summary>
        private void btnCrawlAndExtract_Click(object sender, EventArgs e)
        {
            string urlToCrawl = txbUrlToCrawl.Text;

            try
            {
                rtbExtractedHtml.Text = FetchHtml(urlToCrawl);
            }
            catch (Exception ex)
            {
                // Top-level UI handler: a malformed URL, an unsupported scheme or a
                // network error must not take the application down.
                MessageBox.Show("下载页面失败：" + ex.Message);
            }
        }

        /// <summary>
        /// Extracts the article links from the HTML shown in rtbExtractedHtml, queues
        /// them, and starts <see cref="Num"/> worker threads running <see cref="DownLoad"/>.
        /// </summary>
        private void btnExtractInfo_Click(object sender, EventArgs e)
        {
            MatchCollection links = ArticleLinkRegex.Matches(rtbExtractedHtml.Text);

            // Workers from a previous click may still be dequeuing, and Queue<T> is
            // not safe for concurrent use, so enqueue under the same lock they take.
            lock (locker1)
            {
                foreach (Match link in links)
                {
                    q.Enqueue(SiteRoot + link.Value);
                }
            }

            watch.Start();

            // The thread objects are never needed after Start, so no array is kept.
            // (The old fixed 21-element array overflowed whenever Num exceeded 21.)
            int workerCount = Num;
            for (int i = 0; i < workerCount; i++)
            {
                Thread worker = new Thread(new ThreadStart(DownLoad));
                // Background threads do not keep the process alive after the form closes.
                worker.IsBackground = true;
                worker.Start();
            }
        }

        public delegate void ProcessDelegate();

        /// <summary>Replaces the content of the result box with <paramref name="ss"/>.</summary>
        public void richTextShow(string ss)
        {
            richTextBox1.Text = ss;
        }

        /// <summary>
        /// Finds every <c>src</c> attribute in <paramref name="rrh"/>, and for each
        /// target not seen before downloads it as an image and saves it to
        /// e:/c/&lt;n&gt;.jpg. A failure on one image is logged and does not stop the others.
        /// </summary>
        /// <param name="rrh">HTML source of a downloaded page.</param>
        public void picture(string rrh)
        {
            Uri siteRoot = new Uri(SiteRoot);

            foreach (Match srcMatch in ImageSrcRegex.Matches(rrh))
            {
                // Resolve against the site root so that both "/a/b.jpg" and absolute
                // URLs work; plain concatenation produced invalid URLs for the latter.
                Uri resolved;
                if (!Uri.TryCreate(siteRoot, srcMatch.Groups[1].Value, out resolved))
                {
                    continue;
                }
                if (resolved.Scheme != Uri.UriSchemeHttp && resolved.Scheme != Uri.UriSchemeHttps)
                {
                    continue;
                }

                string imageUrl = resolved.AbsoluteUri;
                int fileNumber = ReserveImageSlot(imageUrl);
                if (fileNumber == -1)
                {
                    continue;
                }

                try
                {
                    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(imageUrl);
                    request.Method = "GET";
                    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                    using (Stream body = response.GetResponseStream())
                    using (Bitmap image = new Bitmap(body))
                    {
                        Directory.CreateDirectory(ImageOutputDir);
                        // fileNumber is unique per image, so concurrent saves never
                        // target the same file.
                        image.Save(ImageOutputDir + fileNumber + ".jpg");
                    }
                }
                catch (Exception ex)
                {
                    // Best effort: many src targets are not images at all (scripts,
                    // frames). Record the failure and carry on with the next one.
                    Debug.WriteLine("Image download failed for " + imageUrl + ": " + ex.Message);
                }
            }
        }

        /// <summary>
        /// Atomically records <paramref name="imageUrl"/> as seen and returns the file
        /// number reserved for it, or -1 when the URL was already recorded.
        /// </summary>
        private static int ReserveImageSlot(string imageUrl)
        {
            lock (locker3)
            {
                if (Array.IndexOf(surl, imageUrl) != -1)
                {
                    return -1;
                }

                // Grow instead of overflowing once more than the initial capacity is seen.
                if (a >= surl.Length)
                {
                    Array.Resize(ref surl, Math.Max(surl.Length * 2, a + 1));
                }

                surl[a] = imageUrl;
                a++;
                return a;
            }
        }

        /// <summary>
        /// Extracts the first <c>&lt;h2&gt;</c> title and all <c>&lt;p&gt;</c> paragraphs
        /// from <paramref name="rrh"/>, strips their markup, and appends them to e:/b.txt.
        /// </summary>
        /// <param name="rrh">HTML source of a downloaded page.</param>
        /// <returns>The cleaned title text; empty when the page has no matching title.</returns>
        public string wenben(string rrh)
        {
            string title = StripMarkup(TitleRegex.Match(rrh).Value);
            MatchCollection paragraphs = ParagraphRegex.Matches(rrh);

            // One writer at a time; "using" closes the file even if a write throws.
            lock (locker)
            {
                using (StreamWriter writer = new StreamWriter(TextOutputPath, true))
                {
                    writer.WriteLine(title);
                    writer.WriteLine("\n");

                    foreach (Match paragraph in paragraphs)
                    {
                        writer.WriteLine(StripMarkup(paragraph.Value));
                        writer.WriteLine("\n");
                    }
                }
            }

            return title;
        }

        /// <summary>
        /// Removes tags from an HTML fragment, then drops leftover '&lt;', turns
        /// leftover '&gt;' into a carriage return, and removes spaces.
        /// </summary>
        private static string StripMarkup(string fragment)
        {
            string text = TagRegex.Replace(fragment, "");
            text = text.Replace("<", "");
            text = text.Replace(">", "\r");
            text = text.Replace(" ", "");
            return text;
        }

        /// <summary>
        /// Worker loop: repeatedly takes one URL from the queue, downloads the page,
        /// saves its images and text, and appends a progress line to the result box.
        /// Returns when the queue is empty.
        /// </summary>
        public  void DownLoad()
        {
            while (true)
            {
                string url;

                // The emptiness check and the dequeue must happen under one lock;
                // otherwise two workers can both see a single remaining item and the
                // second Dequeue throws on an empty queue.
                lock (locker1)
                {
                    if (q.Count == 0)
                    {
                        return;
                    }
                    url = q.Dequeue();
                    j++;
                }

                try
                {
                    string html = FetchHtml(url);
                    picture(html);
                    string title = wenben(html);

                    ProcessDelegate showProcess = delegate()
                    {
                        richTextBox1.AppendText(url + title + DateTime.Now.ToString() + "\n");
                    };

                    // Invoke marshals to the UI thread, which already serializes the
                    // updates; no additional lock is needed around it.
                    richTextBox1.Invoke(showProcess);
                }
                catch (Exception ex)
                {
                    // One failing page (network error, closed form, ...) must not
                    // stop the worker; record it and move on to the next URL.
                    Debug.WriteLine("Processing failed for " + url + ": " + ex.Message);
                }
            }
        }

        /// <summary>
        /// Downloads <paramref name="url"/> with an HTTP GET and returns the body
        /// decoded as UTF-8. The response and reader are always released.
        /// </summary>
        private static string FetchHtml(string url)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Method = "GET";

            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding(PageCharset)))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>Opens the image output folder in Explorer.</summary>
        private void button1_Click(object sender, EventArgs e)
        {
            string path2 = @"e:\c";
            System.Diagnostics.Process.Start("explorer", path2);
        }

        /// <summary>Opens the text output file via Explorer.</summary>
        private void button2_Click(object sender, EventArgs e)
        {
            string path1 = @"e:\b.txt";
            System.Diagnostics.Process.Start("explorer", path1);
        }

        /// <summary>
        /// Reads the worker-thread count from textBox1. Text that is not a positive
        /// integer is ignored and the previous value of <see cref="Num"/> is kept.
        /// </summary>
        private void button3_Click(object sender, EventArgs e)
        {
            textbox = this.textBox1.Text;

            int requested;
            if (int.TryParse(textbox, out requested) && requested > 0)
            {
                Num = requested;
            }
        }

    }
}

三、多说一点：
（1）文件流方式：
C#中通常有三种方法获取网页内容。第一种方式为：使用WebClient；第二种方式为：使用WebBrowser；第三种方式为：使用HttpWebRequest/HttpWebResponse。
此次程序设计中选用的是第三种方式，即HttpWebRequest/HttpWebResponse方式。这是一种比较通用的获取方式。
（2）c#中的多线程：
在Visual C#中，System.Threading 命名空间提供了一些可用于多线程编程的类和接口，其中线程的创建有以下三种方法：Thread、ThreadPool、Timer。
本次课程设计中选用的是Thread方式。这也许是最复杂的方法，但它提供了对线程的各种灵活控制。首先你必须使用它的构造函数创建一个线程实例，它的参数比较简单，只有一个ThreadStart 委托：
public Thread(ThreadStart start);
然后调用Start（）启动它。
（3）正则表达式：
在编写处理字符串的程序或网页时，经常会有查找符合某些复杂规则的字符串的需要。正则表达式就是用于描述这些规则的工具。换句话说，正则表达式就是记录文本规则的代码。
在此次课程设计中，使用了正则表达式对网页的源码进行截取和分析，以获得想要的数据信息。
程序流程详解：
（1）程序使用文件流方式从首页hyzbi.com中下载整个页面的源代码。
（2）通过使用正则表达式，将网站建设类目下的网页URL截取下来，并保存在一个队列中。
（3）启用多线程，每个线程从队列中拿取一个URL。此时使用了互斥锁，避免线程冲突。线程每拿取一个URL就将队列中的该URL记录删除，避免重复下载。线程中同样运用文件流的方式下载该URL对应网页的源代码，并封装了两个函数，分别对应图片和文本的下载分析。
当线程工作完成时，将完成时间以及完成项目显示在操作界面上。
（4）对于图片的解析，使用正则表达式截取网页源码中所有的图片URL。把图片URL保存在数组中，每获得一个URL就和数组中的项比较，有重复项就舍弃，无重复项就下载，这样就实现了图片去重，避免下载重复的图片。
（5）对于文本的解析，则使用了正则表达式的截取和替代。在文本的保存上，也加了一个互斥锁，使同一时刻只有一个线程可以访问本地文件保存文本。
四、程序运行结果：
1、主程序界面
这里写图片描述
2、爬取的网站中的图片信息：

3、爬取的网站中的文本信息：