Scraping JD product images and prices with C#

I recently needed to do some market research for work, and I was too lazy to enter the data by hand, so I decided to write a small crawler in C# to search JD for product data (Taobao requires a login, and I'd rather not get my account banned, ha) and organize the results into an Excel sheet. This post covers the crawler part; I'll update the Excel part when I have time.
I didn't bother with JD's official API; the crawler just issues HTTP requests directly.
First, the C# HttpWebRequest:

// Issue a plain GET request and read the response HTML into a string
// (needs System.Net, System.IO, System.Text)
HttpWebRequest webRequest = (HttpWebRequest)WebRequest.CreateHttp(url);
webRequest.Method = "GET";
webRequest.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0";
string str;
using (WebResponse webResponse = webRequest.GetResponse())
using (StreamReader streamReader = new StreamReader(webResponse.GetResponseStream(), Encoding.UTF8))
{
    str = streamReader.ReadToEnd();
}
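The same fetch pattern is reused further down through a call to HttpCrawelHelper.GetData. That helper is never shown in this post, so here is a minimal sketch of what it might look like, assuming it is just the GET-and-read code above wrapped in a static method:

using System.IO;
using System.Net;
using System.Text;

public static class HttpCrawelHelper
{
    // Hypothetical reconstruction: a plain GET that returns the response body as text.
    // The real helper in the original project is not shown and may differ.
    public static string GetData(string url, Encoding encoding)
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.CreateHttp(url);
        request.Method = "GET";
        request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0";
        using (WebResponse response = request.GetResponse())
        using (StreamReader reader = new StreamReader(response.GetResponseStream(), encoding))
        {
            return reader.ReadToEnd();
        }
    }
}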

The response comes back as HTML; regular expressions are then used to pull out the pieces we need.

// Each merchant block in the search-result HTML
Regex regex_Business = new Regex("(?<=style=\"display:block;\">)([\\S\\s]*?)(?=</li>)");
MatchCollection match_Business = regex_Business.Matches(str);

// Merchant page link
Regex regex_BusinessURL = new Regex("(?<=<a href=\")([\\S\\s]*?)(?=\")");
MatchCollection match_BusinessURL;

// Merchant logo image link
Regex regex_BusinessLogoURL = new Regex("(?<=<img src=\")([\\S\\s]*?)(?=\")");
MatchCollection match_BusinessLogoURL;

// Merchant name
Regex regex_BusinessName = new Regex("(?<=title=\")([\\S\\s]*?)(?=\">)");
MatchCollection match_BusinessName;

Now call these regexes to separate out each field.
First, define a WebClient (it will be used to download images):

WebClient client = new WebClient();

// The following runs inside a loop over the merchant blocks,
// e.g. foreach (Match match in match_Business) { ... }
match_BusinessURL = regex_BusinessURL.Matches(match.Groups[1].Value);
match_BusinessLogoURL = regex_BusinessLogoURL.Matches(match.Groups[1].Value);
match_BusinessName = regex_BusinessName.Matches(match.Groups[1].Value);

// Merchant name
foreach (Match match1 in match_BusinessName)
{
    merchandiseValue.MerchantsName = match1.Groups[1].Value;
    if (merchandiseValue.MerchantsName == "")
    {
        break;
    }
}

// Merchant logo link; the logo image is saved to the desktop
foreach (Match match1 in match_BusinessLogoURL)
{
    string src = "http:" + match1.Groups[1].Value;
    if (src.Contains("http") && !src.Contains(".svg"))
    {
        if (File.Exists(@"C:\Users\Administrator\Desktop\爬虫学习\logo.jpg"))
        {
            File.Delete(@"C:\Users\Administrator\Desktop\爬虫学习\logo.jpg");
        }
        client.DownloadFile(src, @"C:\Users\Administrator\Desktop\爬虫学习\logo.jpg");
    }
}

// Merchant page link
foreach (Match match1 in match_BusinessURL)
{
    merchandiseValue.MerchantsLine = "https://search.jd.com/" + match1.Groups[1].Value;
}
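The merchandiseValue object that these loops fill in is never defined in the post. Judging only from the properties used here and further down (MerchantsName, MerchantsLine, FristID, CommodityLine, ModelNumber, ID, MarketPrice) and the Clear() call, a minimal data holder could look like this; treat it as an assumption about the original class:

// Hypothetical data holder inferred from the properties used in this post;
// the original class definition is not shown.
public class MerchandiseValue
{
    public string MerchantsName { get; set; } = "";  // merchant name
    public string MerchantsLine { get; set; } = "";  // merchant page URL
    public string FristID { get; set; } = "";        // first product SKU id
    public string CommodityLine { get; set; } = "";  // product page URL
    public string ModelNumber { get; set; } = "";    // product model number
    public string ID { get; set; } = "";             // SKU id of the current product
    public string MarketPrice { get; set; } = "";    // price returned by the price API

    // Reset all fields before moving on to the next product
    public void Clear()
    {
        MerchantsName = MerchantsLine = FristID = CommodityLine = "";
        ModelNumber = ID = MarketPrice = "";
    }
}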

If you are processing several products, it is best to pause between them; go too fast and JD will block your IP.
System.Threading.Thread.Sleep(5000); // milliseconds; delay to mimic a human user
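A fixed 5-second pause works, but the timing looks a bit more natural with some random jitter. A small variation (not from the original code):

// Sketch: randomize the delay between requests so the timing is less robotic
Random random = new Random();
int delayMs = 3000 + random.Next(0, 4000); // somewhere between 3 and 7 seconds
System.Threading.Thread.Sleep(delayMs);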
The code above collects the merchant's information; next comes the handling of that merchant's individual products.

// Extract the product (SKU) IDs
Regex regex_ID = new Regex("(?<=<li data-sku=\")([\\S\\s]*?)(?=\")");
MatchCollection match_ID = regex_ID.Matches(str); // str is the HTML of the merchant's product-list page, fetched the same way as the HttpWebRequest call above
try
{
    string sts = "";
    foreach (Match match in match_ID)
    {
        merchandiseValue.FristID = match.Groups[1].Value.Trim(); // trim whitespace
        sts = "https://item.jd.com/" + match.Groups[1].Value.Trim() + ".html";
        merchandiseValue.CommodityLine = sts;
        HttpGetHandle_Business_SingleItemContent(sts, wSheet); // scrapes one product page; wSheet is an Excel worksheet object
        merchandiseValue.Clear();
        System.Threading.Thread.Sleep(2000); // 2-second delay to mimic a human user
        break;
    }
}
catch (Exception ex)
{
    Console.WriteLine("-------------" + ex);
}

Next, fetch the details of a specific product.

// Extract the block containing the product name, spec info, and images
Regex regex_ModelNumberFrist = new Regex("(?<=sku-name\")([\\S\\s]*?)(?=class=\"news\")");
MatchCollection match_ModelNumberFrist = regex_ModelNumberFrist.Matches(str); // str, as above, is the HTML of the specific product page

Regex regex_ModelNumber = new Regex("(?<=>)([\\S\\s]*?)(?=</div>)");
MatchCollection match_ModelNumber;
RegexHelper.match_ColorSize = RegexHelper.regex_ColorSize.Matches(str);
RegexHelper.match_PictureFrist = RegexHelper.regex_PictureFrist.Matches(str);
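RegexHelper is a static helper used from here on but never shown in the post. Based purely on how its fields are used below (regex_PictureFrist/regex_Picture for the product images, regex_ID for the SKU, regex_Price for the price API response), a rough reconstruction might look like the following; the actual patterns in the original project are unknown and almost certainly differ:

using System.Text.RegularExpressions;

// Hypothetical reconstruction of RegexHelper; the original patterns are not shown.
public static class RegexHelper
{
    // Outer block that contains the product image list
    public static Regex regex_PictureFrist = new Regex("(?<=spec-items\">)([\\S\\s]*?)(?=</ul>)");
    // Individual image src attributes inside that block
    public static Regex regex_Picture = new Regex("(?<=src=\")([\\S\\s]*?)(?=\")");
    // SKU id on the product page
    public static Regex regex_ID = new Regex("(?<=data-sku=\")([\\S\\s]*?)(?=\")");
    // "p" (price) field in the JSON returned by the price API
    public static Regex regex_Price = new Regex("(?<=\"p\":\")([\\S\\s]*?)(?=\")");
    // Colour / size block on the product page
    public static Regex regex_ColorSize = new Regex("(?<=choose-attrs\">)([\\S\\s]*?)(?=</div>)");

    public static MatchCollection match_PictureFrist, match_Picture,
                                   match_ID, match_Price, match_ColorSize;
}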

Getting the product model number:

foreach (Match match in match_ModelNumberFrist)
{
    match_ModelNumber = regex_ModelNumber.Matches(match.Groups[1].Value);
    foreach (Match match1 in match_ModelNumber)
    {
        string[] sArray = match1.Groups[1].Value.Split(' ');
        foreach (string sts in sArray)
        {
            if (sts.Contains("-"))
            {
                // Strip Chinese characters, convert full-width parentheses to ASCII,
                // then drop any parenthesized remark to keep only the model number
                Regex reg = new Regex(@"[\u4e00-\u9fa5]");
                merchandiseValue.ModelNumber = reg.Replace(sts, "");
                merchandiseValue.ModelNumber = merchandiseValue.ModelNumber.Replace("（", "(").Replace("）", ")");
                merchandiseValue.ModelNumber = Regex.Replace(merchandiseValue.ModelNumber.Replace("（", "(").Replace("）", ")"), @"\([^\(]*\)", "");
                break;
            }
        }
    }
}

Getting the product image (the earlier download was the merchant's logo):

foreach (Match match in RegexHelper.match_PictureFrist)
{
    RegexHelper.match_Picture = RegexHelper.regex_Picture.Matches(match.Groups[1].Value);
    if (count == 2) // count (not defined in this excerpt) selects which picture block gets downloaded
        foreach (Match match1 in RegexHelper.match_Picture)
        {
            string src = "http:" + match1.Groups[1].Value;
            if (src.Contains("http") && !src.Contains(".svg"))
            {
                if (File.Exists(@"C:\Users\Administrator\Desktop\爬虫学习\CommodityImage.jpg"))
                {
                    File.Delete(@"C:\Users\Administrator\Desktop\爬虫学习\CommodityImage.jpg");
                }
                client.DownloadFile(src, @"C:\Users\Administrator\Desktop\爬虫学习\CommodityImage.jpg");
            }
        }
}
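Note that the snippet above always overwrites CommodityImage.jpg, so only the last product's picture survives. A small variation (not in the original post) that names the file after the SKU keeps one image per product:

// Variation (not in the original code): save each product image under its SKU id
// so earlier downloads are not overwritten. Needs System.IO and System.Net.
static void DownloadProductImage(WebClient client, string src, string skuId)
{
    string fileName = @"C:\Users\Administrator\Desktop\爬虫学习\" + skuId + ".jpg";
    if (File.Exists(fileName))
    {
        File.Delete(fileName);
    }
    client.DownloadFile(src, fileName);
}

Inside the loop above, the client.DownloadFile(...) call would then become DownloadProductImage(client, src, merchandiseValue.ID);.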

Finally, the product price:

foreach (Match match2 in RegexHelper.match_ID)
{
    merchandiseValue.ID = match2.Groups[1].Value;
    // Query JD's price API for this SKU and pull the price out of the JSON response
    RegexHelper.match_Price = RegexHelper.regex_Price.Matches(HttpCrawelHelper.GetData("https://p.3.cn/prices/mgets?skuIds=J_" + merchandiseValue.ID, Encoding.Default));
    foreach (Match match3 in RegexHelper.match_Price)
    {
        merchandiseValue.MarketPrice = match3.Groups[1].Value;
    }
}
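For context, the price endpoint used here (https://p.3.cn/prices/mgets?skuIds=J_<sku>) historically returned a small JSON array in which "p" is the selling price and "m" the list price; the endpoint and format may have changed since this was written. A self-contained sketch of pulling the price out of such a response:

using System;
using System.Text.RegularExpressions;

// Sketch, assuming the response still looks like the sample below; the real format may differ.
string sampleJson = "[{\"id\":\"J_1234567\",\"p\":\"1999.00\",\"m\":\"2999.00\"}]";
Match priceMatch = Regex.Match(sampleJson, "\"p\":\"([^\"]*)\"");
if (priceMatch.Success)
{
    Console.WriteLine(priceMatch.Groups[1].Value); // prints 1999.00
}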

That is most of the technique. Exactly what you extract and how you filter it is up to you to adjust in the code logic.
The Excel part will be covered in the next post.
