c#爬取某东商品图片和价格

最新推荐文章于 2024-05-07 12:57:25 发布

惊艳了旧时光

最新推荐文章于 2024-05-07 12:57:25 发布

阅读量603

点赞数

分类专栏：爬虫文章标签：大数据 c# visual studio

本文链接：https://blog.youkuaiyun.com/m0_38046697/article/details/113742733

版权

爬虫专栏收录该内容

1 篇文章

订阅专栏

c#爬取某东商品图片和价格

最近因业务需要做一份市场调研，又懒得一条一条手动导入，于是灵机一动，打算用 C# 写一个爬虫，帮我搜索某东（tb 需要登录，怕被封号，哈哈）的商品数据，并整理成 Excel 表。本篇主要讲爬虫部分，有时间再更新 Excel 部分。
这里没有调用某东的 API，而是直接发起 HTTP 请求。
首先是 C# 的 HttpWebRequest 请求：

// Fetch the page at `url` with a plain HTTP GET and read the whole response body as UTF-8 text into `str`.
HttpWebRequest webRequest = WebRequest.CreateHttp(url);
webRequest.Method = "GET";
// A browser User-Agent string is sent with the request.
webRequest.UserAgent = " Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0";
string str;
// Both the response and the reader are disposed even if reading throws,
// so the underlying connection is always released.
using (WebResponse webResponse = webRequest.GetResponse())
using (StreamReader streamReader = new StreamReader(webResponse.GetResponseStream(), Encoding.UTF8))
{
    str = streamReader.ReadToEnd();
}

获取到的数据是html格式，然后用正则表达式分离出需要的数据

// Outer pattern: captures everything between style="display:block;"> and the next </li> in the fetched HTML.
var regex_Business = new Regex("(?<=style=\"display:block;\">)([\\S\\s]*?)(?=</li>)");
var match_Business = regex_Business.Matches(str);

// Inner patterns, applied later to each captured entry:
// the link address (href value), the merchant image link (img src value) and the merchant name (title value).
var regex_BusinessURL = new Regex("(?<=<a href=\")([\\S\\s]*?)(?=\")");
var regex_BusinessLogoURL = new Regex("(?<=<img src=\")([\\S\\s]*?)(?=\")");
var regex_BusinessName = new Regex("(?<=title=\")([\\S\\s]*?)(?=\">)");

// Result holders for the inner patterns; they are assigned when an entry is processed.
MatchCollection match_BusinessURL;
MatchCollection match_BusinessLogoURL;
MatchCollection match_BusinessName;

调用分离各项数据
先定义一个client
// WebClient used by the snippets below to download images via client.DownloadFile.
WebClient client = new WebClient();

// Apply the three inner patterns to the current entry (capture group 1 of `match`).
string entryHtml = match.Groups[1].Value;
match_BusinessURL = regex_BusinessURL.Matches(entryHtml);
match_BusinessLogoURL = regex_BusinessLogoURL.Matches(entryHtml);
match_BusinessName = regex_BusinessName.Matches(entryHtml);

// Merchant name: every title match overwrites the previous one, and the scan stops
// as soon as an empty title has been stored.
// NOTE(review): if the first non-empty title is what is wanted, this condition looks inverted - confirm.
foreach (Match nameMatch in match_BusinessName)
{
    merchandiseValue.MerchantsName = nameMatch.Groups[1].Value;
    if (merchandiseValue.MerchantsName == "")
    {
        break;
    }
}

// Merchant logo: written to a fixed file on the desktop; each qualifying image replaces the previous file.
string logoPath = @"C:\Users\Administrator\Desktop\爬虫学习\logo.jpg";
foreach (Match logoMatch in match_BusinessLogoURL)
{
    string rawSrc = logoMatch.Groups[1].Value;
    // Only protocol-relative URLs ("//host/path") need a scheme prepended;
    // an absolute URL must not receive a second one ("http:https://...").
    string src = rawSrc.StartsWith("//", StringComparison.Ordinal) ? "http:" + rawSrc : rawSrc;

    // Skip anything that is still not an http(s) URL, and skip SVG images.
    if (src.StartsWith("http", StringComparison.OrdinalIgnoreCase) && !src.Contains(".svg"))
    {
        if (File.Exists(logoPath))
        {
            File.Delete(logoPath);
        }
        client.DownloadFile(src, logoPath);
    }
}

// Merchant link: the captured href is appended to the search site root; the last match wins.
foreach (Match linkMatch in match_BusinessURL)
{
    merchandiseValue.MerchantsLine = "https://search.jd.com/" + linkMatch.Groups[1].Value;
}

如果要处理多个商品，最好在每次处理之间延时一下，否则请求太快，某东会屏蔽 IP：
System.Threading.Thread.Sleep(5000); // delay in milliseconds, to imitate a human operating the site
上面的代码获取的是商家信息，接下来是针对商家具体商品的操作：

// Extract product IDs: the value of the data-sku attribute of each <li>.
// `str` is the HTML fetched from one of the merchant's product pages, obtained like the earlier HTTP request.
Regex regex_ID = new Regex("(?<=<li data-sku=\")([\\S\\s]*?)(?=\")");
MatchCollection match_ID = regex_ID.Matches(str);
try
{
    // Only the first matched ID is processed.
    if (match_ID.Count > 0)
    {
        string firstId = match_ID[0].Groups[1].Value.Trim(); // strip surrounding whitespace
        merchandiseValue.FristID = firstId;

        string itemUrl = "https://item.jd.com/" + firstId + ".html";
        merchandiseValue.CommodityLine = itemUrl;

        // Extracts the data of the individual product; wSheet is a sheet object of the Excel workbook.
        HttpGetHandle_Business_SingleItemContent(itemUrl, wSheet);
        merchandiseValue.Clear();

        System.Threading.Thread.Sleep(2000); // 2000 ms delay, to imitate a human operating the site
    }
}
catch (Exception ex)
{
    Console.WriteLine("-------------" + ex);
}

然后是获取具体商品的信息

// Outer pattern: the page region that follows sku-name" up to class="news"
// (per the original note, it contains the product ID, interface info and image info).
var regex_ModelNumberFrist = new Regex("(?<=sku-name\")([\\S\\s]*?)(?=class=\"news\")");
// Inner pattern: the text between a '>' and the next </div>.
var regex_ModelNumber = new Regex("(?<=>)([\\S\\s]*?)(?=</div>)");

// `str` is the HTML of one specific product page.
var match_ModelNumberFrist = regex_ModelNumberFrist.Matches(str);
MatchCollection match_ModelNumber;

// Run the shared RegexHelper patterns over the same page and store the results on RegexHelper.
RegexHelper.match_ColorSize = RegexHelper.regex_ColorSize.Matches(str);
RegexHelper.match_PictureFrist = RegexHelper.regex_PictureFrist.Matches(str);

获取商品型号

// Model number: within each captured region, split every fragment on spaces and take the first
// token containing '-' as the model number, after cleaning it up.
// Matches the CJK range \u4e00-\u9fa5; built once here instead of once per token inside the loops.
Regex chineseChars = new Regex(@"[\u4e00-\u9fa5]");
foreach (Match match in match_ModelNumberFrist)
{
    match_ModelNumber = regex_ModelNumber.Matches(match.Groups[1].Value);
    foreach (Match match1 in match_ModelNumber)
    {
        foreach (string token in match1.Groups[1].Value.Split(' '))
        {
            if (!token.Contains("-"))
            {
                continue;
            }

            // Drop Chinese characters, then normalise full-width parentheses to ASCII ones
            // (done once; the original repeated this replacement a second time to no effect).
            string model = chineseChars.Replace(token, "").Replace("（", "(").Replace("）", ")");
            // Remove parenthesised parts from the remaining text.
            merchandiseValue.ModelNumber = Regex.Replace(model, @"\([^\(]*\)", "");

            // Leaves only the token loop: later fragments and regions can still overwrite the value.
            break;
        }
    }
}

获取商品的图片，前面是商家logo图片

// Product image (as opposed to the merchant logo handled earlier): written to a fixed file on the desktop;
// each qualifying image replaces the previous file.
string imagePath = @"C:\Users\Administrator\Desktop\爬虫学习\CommodityImage.jpg";
foreach (Match match in RegexHelper.match_PictureFrist)
{
    RegexHelper.match_Picture = RegexHelper.regex_Picture.Matches(match.Groups[1].Value);

    // `count` is maintained outside this snippet; images are downloaded only while it equals 2.
    if (count != 2)
    {
        continue;
    }

    foreach (Match pictureMatch in RegexHelper.match_Picture)
    {
        string rawSrc = pictureMatch.Groups[1].Value;
        // Only protocol-relative URLs ("//host/path") need a scheme prepended;
        // an absolute URL must not receive a second one ("http:https://...").
        string src = rawSrc.StartsWith("//", StringComparison.Ordinal) ? "http:" + rawSrc : rawSrc;

        // Skip anything that is still not an http(s) URL, and skip SVG images.
        if (src.StartsWith("http", StringComparison.OrdinalIgnoreCase) && !src.Contains(".svg"))
        {
            if (File.Exists(imagePath))
            {
                File.Delete(imagePath);
            }
            client.DownloadFile(src, imagePath);
        }
    }
}

最后是商品价格

// Price lookup: for every ID in RegexHelper.match_ID, request the price data for that SKU
// and store the last price the price pattern matches in the response.
foreach (Match idMatch in RegexHelper.match_ID)
{
    merchandiseValue.ID = idMatch.Groups[1].Value;

    var priceUrl = "https://p.3.cn/prices/mgets?skuIds=J_" + merchandiseValue.ID;
    var priceResponse = HttpCrawelHelper.GetData(priceUrl, Encoding.Default);
    RegexHelper.match_Price = RegexHelper.regex_Price.Matches(priceResponse);

    foreach (Match priceMatch in RegexHelper.match_Price)
    {
        merchandiseValue.MarketPrice = priceMatch.Groups[1].Value;
    }
}