C# Web Crawler

Using WebClient, WebRequest, and WebResponse in a C# web crawler

Fetching a page's HTML with WebClient

WebClient MyWebClient = new WebClient();
MyWebClient.Credentials = CredentialCache.DefaultCredentials; // network credentials used to authenticate the request to the Internet resource
Byte[] pageData = MyWebClient.DownloadData(url);              // download the raw bytes from the given URL
pageHtml = Encoding.UTF8.GetString(pageData);                 // use this line if the page is encoded as UTF-8
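
The snippet above assumes the page is UTF-8. As a minimal sketch (assuming the target page is served as GB2312/GBK instead, which is common on Chinese sites; the URL is only a stand-in), the only part that changes is the decoding step:

using System;
using System.Net;
using System.Text;

class FetchGbkPage
{
    static void Main()
    {
        WebClient client = new WebClient();
        client.Credentials = CredentialCache.DefaultCredentials;
        byte[] pageData = client.DownloadData("http://example.com"); // stand-in URL
        // Decode with the encoding the page actually uses instead of UTF-8.
        string pageHtml = Encoding.GetEncoding("GB2312").GetString(pageData);
        Console.WriteLine(pageHtml.Length);
    }
}

(On .NET Core / .NET 5+, Encoding.GetEncoding("GB2312") additionally requires registering the System.Text.Encoding.CodePages provider.)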

Requesting data and saving it to a file

try
{
    WebRequest request = WebRequest.Create(urls); // urls holds the image's src value
    WebResponse response = request.GetResponse();
    // read the image through the response stream
    Stream reader = response.GetResponseStream();
    string path = "H://imgs//" + count.ToString() + "//" + aa.ToString() + "//" + i.ToString() + ".jpg"; // build the file path for the image
    FileStream writer = new FileStream(path, FileMode.Create, FileAccess.Write); // Create truncates any existing file
    byte[] buff = new byte[512];
    int c = 0;                                    // number of bytes actually read
    while ((c = reader.Read(buff, 0, buff.Length)) > 0)
    {
        writer.Write(buff, 0, c);
    }
    // release resources
    writer.Close();
    writer.Dispose();
    reader.Close();
    reader.Dispose();
    response.Close();
    // download finished
}
catch (Exception msg)
{
    Console.Write(msg.Message);
    return;
}
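
Closing the streams by hand only happens when no exception is thrown before the Close calls. Here is a minimal sketch of the same download written with using blocks (the URL and path are stand-ins, not from the original), so the streams are disposed in every case:

using System;
using System.IO;
using System.Net;

class DownloadImage
{
    static void Main()
    {
        string url = "https://example.com/sample.jpg"; // stand-in image URL
        string path = @"H:\imgs\sample.jpg";           // stand-in target path
        try
        {
            WebRequest request = WebRequest.Create(url);
            using (WebResponse response = request.GetResponse())
            using (Stream reader = response.GetResponseStream())
            using (FileStream writer = new FileStream(path, FileMode.Create, FileAccess.Write))
            {
                byte[] buff = new byte[512];
                int c; // number of bytes actually read
                while ((c = reader.Read(buff, 0, buff.Length)) > 0)
                {
                    writer.Write(buff, 0, c);
                }
            }
        }
        catch (Exception ex)
        {
            Console.Write(ex.Message);
        }
    }
}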

With regular expressions and a queue you can crawl the images on a web site; see the example below:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.IO;
using System.Text.RegularExpressions;
using System.Net;
using System.Collections;

namespace ConsoleApp16
{
    class Program
    {
        public static string HtmlText(string url)
        {
            string pageHtml = "";
            try
            {
                WebClient MyWebClient = new WebClient();
                MyWebClient.Credentials = CredentialCache.DefaultCredentials; // network credentials used to authenticate the request to the Internet resource
                Byte[] pageData = MyWebClient.DownloadData(url);              // download the raw bytes from the given URL
                pageHtml = Encoding.UTF8.GetString(pageData);                 // use this line if the page is encoded as UTF-8
            }
            catch (WebException webEx)
            {
                Console.WriteLine(webEx.Message.ToString());
            }
            return pageHtml;
        }

        public static void save(int i, int count, int aa, string urls)
        {
            try
            {
                WebRequest request = WebRequest.Create(urls); // urls holds the image's src value
                WebResponse response = request.GetResponse();
                // read the image through the response stream
                Stream reader = response.GetResponseStream();
                string path = "H://qimg//" + count.ToString() + "//" + aa.ToString() + "//" + i.ToString() + ".jpg"; // build the file path for the image
                FileStream writer = new FileStream(path, FileMode.Create, FileAccess.Write); // Create truncates any existing file
                byte[] buff = new byte[512];
                int c = 0;                                    // number of bytes actually read
                while ((c = reader.Read(buff, 0, buff.Length)) > 0)
                {
                    writer.Write(buff, 0, c);
                }
                // release resources
                writer.Close();
                writer.Dispose();
                reader.Close();
                reader.Dispose();
                response.Close();
                // download finished

            }
            catch (Exception msg)
            {
                Console.Write(msg.Message);
                return;
            }
        }

        static void Main(string[] args)
        {
            Queue html = new Queue(); // queue of page links to visit
            Queue img = new Queue();  // queue of image addresses
            string urls = "...";      // home page URL of the target site (fill in)
            string pageHtml = HtmlText(urls);
            Regex re = new Regex("..."); // regex matching links to next-level pages (fill in, site-specific)
            MatchCollection mc = re.Matches(pageHtml);
            foreach (Match ma in mc)
            {
                string tmp = urls + ma.Value.ToString();
                html.Enqueue(tmp); // enqueue the page link
            }
            }
            int i = 1;
            while (html.Count > 0)
            {
                if (i < 30)
                {
                    if (i % 2 == 0)
                    {
                        i++;
                        html.Dequeue();
                        continue;
                    }
                }
                else
                    Directory.CreateDirectory("H://qimg//" + i.ToString()); // create the directory for this page
                string ss = (string)html.Dequeue(); // take the current page link off the queue
                for (int aa = 1; aa < 4; aa++)
                {
                    string path = ss + "?" + "sub=" + aa.ToString();
                    string Html = HtmlText(path);
                    Regex res = new Regex("...", RegexOptions.None); // regex matching image addresses (fill in, site-specific)
                    MatchCollection mcs = res.Matches(Html);
                    foreach (Match mas in mcs)
                    {
                        string a = "https://" + mas.Value.ToString();
                        img.Enqueue(a);
                    }
                    Regex ree = new Regex("...", RegexOptions.None); // regex matching further pages that contain images (fill in); matches are enqueued below
                    MatchCollection mce = ree.Matches(Html);
                    foreach (Match mae in mce)
                    {
                        string tmp = "https://" + mae.Value.ToString().Replace("\"", "").Replace(">", ""); // strip stray quote and '>' characters
                        html.Enqueue(tmp);
                    }
                    Directory.CreateDirectory("H://qimg//" + i.ToString() + "//" + aa.ToString());
                    for (int s = 1; img.Count > 0; s++)
                    {
                        if (s <= 8) // save the first eight images found
                            save(s, i, aa, (string)img.Dequeue());
                        else
                            img.Dequeue();
                    }
                    Console.WriteLine("ok");
                }
                i++;
            }
        }
    }
}

Once the regular expressions are written, the program keeps crawling images from the site on its own and saves them locally. Running it as a console application makes it execute faster. I am only a beginner who recently needed this and tried it out, and I am recording it here so I don't forget it later.
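
The patterns themselves depend entirely on the target site's HTML, which is why they are left as placeholders in the listing above. Purely as an illustration (the HTML fragment and the pattern below are made up, not taken from any particular site), extracting img src values could look like this:

using System;
using System.Text.RegularExpressions;

class RegexDemo
{
    static void Main()
    {
        // Made-up HTML fragment standing in for a downloaded page.
        string html = "<div><img src=\"img.example.com/a/1.jpg\"><img src=\"img.example.com/a/2.jpg\"></div>";
        // Illustrative pattern: capture everything after <img src=" up to the closing quote.
        Regex res = new Regex("(?<=<img src=\")[^\"]+", RegexOptions.None);
        foreach (Match m in res.Matches(html))
        {
            Console.WriteLine("https://" + m.Value); // same "https://" prefixing step as in the crawler above
        }
    }
}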
