c#爬取某东商品图片和价格
最近因为业务需要做一份市场调研,又懒得一条一条手动录入数据,于是灵机一动,打算用 C# 写一个爬虫,帮我搜索某东的商品数据(tb 需要登录,怕被封号,哈哈),并整理成 Excel 表。本篇主要讲爬虫部分,有时间再更新 Excel 部分。
这里没打算调用某东的 API,而是直接发起 HTTP 请求。
首先是用 C# 的 HttpWebRequest 发起请求:
// Issue a plain HTTP GET for `url` and read the whole response body as UTF-8 text into `str`.
HttpWebRequest webRequest = WebRequest.CreateHttp(url);
webRequest.Method = "GET";
// Send a desktop-browser User-Agent string with the request.
// NOTE(review): the literal starts with a space; kept exactly as in the original.
webRequest.UserAgent = " Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0";
string str;
// The using blocks release the response (and its connection) and the reader
// even when ReadToEnd throws, instead of only on the success path.
using (WebResponse webResponse = webRequest.GetResponse())
using (StreamReader streamReader = new StreamReader(webResponse.GetResponseStream(), Encoding.UTF8))
{
    str = streamReader.ReadToEnd();
}
获取到的数据是html格式,然后用正则表达式分离出需要的数据
// Patterns used to pull merchant data out of the downloaded HTML.
// Each one lazily captures the text between a fixed prefix (look-behind) and a fixed suffix (look-ahead).
// One merchant entry: everything after style="display:block;"> up to the next </li>.
Regex regex_Business = new Regex("(?<=style=\"display:block;\">)([\\S\\s]*?)(?=</li>)");
// Link address: the value of <a href="...">.
Regex regex_BusinessURL = new Regex("(?<=<a href=\")([\\S\\s]*?)(?=\")");
// Merchant picture link: the value of <img src="...">.
Regex regex_BusinessLogoURL = new Regex("(?<=<img src=\")([\\S\\s]*?)(?=\")");
// Merchant name: the value of title="..." when it is immediately followed by ">.
Regex regex_BusinessName = new Regex("(?<=title=\")([\\S\\s]*?)(?=\">)");

// Result holders. Only match_Business is filled here (from the whole page in `str`);
// the other three are declared without a value in this snippet.
MatchCollection match_Business = regex_Business.Matches(str);
MatchCollection match_BusinessURL;
MatchCollection match_BusinessLogoURL;
MatchCollection match_BusinessName;
接下来调用这些正则表达式,分离出各项数据。
先定义一个 WebClient(后面用来下载图片):
// Client used for the image downloads below.
WebClient client = new WebClient();

// Run the three per-field patterns against capture group 1 of `match`.
// NOTE(review): `match` is not declared in this snippet; presumably it is one element of
// match_Business and this code sits inside a loop over it - confirm.
string businessHtml = match.Groups[1].Value;
match_BusinessURL = regex_BusinessURL.Matches(businessHtml);
match_BusinessLogoURL = regex_BusinessLogoURL.Matches(businessHtml);
match_BusinessName = regex_BusinessName.Matches(businessHtml);

// Merchant name.
foreach (Match nameMatch in match_BusinessName)
{
    merchandiseValue.MerchantsName = nameMatch.Groups[1].Value;
    // The loop stops as soon as an EMPTY name has been stored; otherwise the last match wins.
    // NOTE(review): this looks inverted (stopping at the first non-empty name seems more
    // likely to be the intent) - behavior kept as-is, confirm before changing.
    if (merchandiseValue.MerchantsName == "")
    {
        break;
    }
}

// Merchant logo: every downloaded image is written to the same file on the desktop,
// so after the loop the file holds the last non-SVG image.
const string logoPath = @"C:\Users\Administrator\Desktop\爬虫学习\logo.jpg";
foreach (Match logoMatch in match_BusinessLogoURL)
{
    string rawSrc = logoMatch.Groups[1].Value;
    // Only add the scheme when the src does not already carry one. Prepending "http:"
    // unconditionally turned absolute URLs into "http:http://..." and made the former
    // src.Contains("http") check always true.
    string src = rawSrc.StartsWith("http", StringComparison.OrdinalIgnoreCase)
        ? rawSrc
        : "http:" + rawSrc;
    if (src.Contains(".svg"))
    {
        // SVG images are skipped, as before.
        continue;
    }
    if (File.Exists(logoPath))
    {
        File.Delete(logoPath);
    }
    client.DownloadFile(src, logoPath);
}

// Merchant link: the href is appended to the search site root; the last match wins.
foreach (Match linkMatch in match_BusinessURL)
{
    merchandiseValue.MerchantsLine = "https://search.jd.com/" + linkMatch.Groups[1].Value;
}
如果要处理多个商品,每处理一个最好延时一下,不然请求太快,某东会屏蔽 IP:
// Pause for five seconds between requests to imitate a human visitor.
System.Threading.Thread.Sleep(System.TimeSpan.FromSeconds(5));
上面是获取商家信息的部分,接下来是针对商家具体商品的操作:
// Extract product IDs: the value of the data-sku attribute of each <li>.
// `str` here is the HTML fetched from one of the merchant's product pages,
// obtained the same way as the earlier HTTP request.
Regex regex_ID = new Regex("(?<=<li data-sku=\")([\\S\\s]*?)(?=\")");
MatchCollection match_ID = regex_ID.Matches(str);
try
{
    foreach (Match skuMatch in match_ID)
    {
        // Strip surrounding whitespace from the captured ID once and reuse it.
        string sku = skuMatch.Groups[1].Value.Trim();
        string productUrl = "https://item.jd.com/" + sku + ".html";

        merchandiseValue.FristID = sku;
        merchandiseValue.CommodityLine = productUrl;
        // Scrape the individual product page; wSheet is a sheet object of the Excel workbook.
        HttpGetHandle_Business_SingleItemContent(productUrl, wSheet);
        merchandiseValue.Clear();
        // Two-second pause (2000 ms) to imitate a human visitor.
        System.Threading.Thread.Sleep(2000);
        // The unconditional break means only the first matched ID is ever processed.
        // NOTE(review): possibly a leftover from testing - confirm.
        break;
    }
}
catch (Exception ex)
{
    // Any failure is only written to the console; execution continues after the try block.
    Console.WriteLine("-------------" + ex);
}
然后是获取具体商品的信息
// Capture the page section containing the product ID, interface info and pictures:
// everything after sku-name" up to class="news".
Regex regex_ModelNumberFrist = new Regex("(?<=sku-name\")([\\S\\s]*?)(?=class=\"news\")");
MatchCollection match_ModelNumberFrist = regex_ModelNumberFrist.Matches(str);// str, as above, is the HTML of one specific product page
// Within that section: the text after a '>' up to the next </div>.
Regex regex_ModelNumber = new Regex("(?<=>)([\\S\\s]*?)(?=</div>)");
// Declared here without a value; not assigned in these lines.
MatchCollection match_ModelNumber;
// Run the patterns held on RegexHelper against the same page and store the results
// back on RegexHelper (the patterns themselves are not visible in this snippet).
RegexHelper.match_ColorSize = RegexHelper.regex_ColorSize.Matches(str);
RegexHelper.match_PictureFrist = RegexHelper.regex_PictureFrist.Matches(str);
获取商品型号
// Extract the model number: within each captured title segment, take the first
// space-separated token that contains a '-', strip CJK ideographs from it, and
// remove any parenthesised part.
// Both patterns are loop-invariant, so they are built once up front.
Regex cjkIdeographs = new Regex(@"[\u4e00-\u9fa5]");
Regex parenthesised = new Regex(@"\([^\(]*\)");
foreach (Match match in match_ModelNumberFrist)
{
    match_ModelNumber = regex_ModelNumber.Matches(match.Groups[1].Value);
    foreach (Match match1 in match_ModelNumber)
    {
        foreach (string token in match1.Groups[1].Value.Split(' '))
        {
            if (!token.Contains("-"))
            {
                continue;
            }

            string model = cjkIdeographs.Replace(token, "");
            // Normalise full-width parentheses (U+FF08 / U+FF09) to ASCII so the pattern
            // below can match them. The previous Replace("(", "(") / Replace(")", ")")
            // calls replaced a character with itself and therefore did nothing.
            model = model.Replace('\uFF08', '(').Replace('\uFF09', ')');
            merchandiseValue.ModelNumber = parenthesised.Replace(model, "");
            // Leaves only the token loop: the outer loops keep running, so a later
            // segment that also contains a '-' token overwrites this value.
            break;
        }
    }
}
获取商品的图片,前面是商家logo图片
// Product picture: every downloaded image is written to the same file on the desktop,
// so after the loops the file holds the last non-SVG image that was downloaded.
const string commodityImagePath = @"C:\Users\Administrator\Desktop\爬虫学习\CommodityImage.jpg";
foreach (Match match in RegexHelper.match_PictureFrist)
{
    RegexHelper.match_Picture = RegexHelper.regex_Picture.Matches(match.Groups[1].Value);

    // Pictures are only downloaded while count equals 2. The original brace-less `if`
    // governed the whole inner foreach; that is now explicit.
    // NOTE(review): `count` is neither declared nor updated in the visible code - confirm
    // where it is maintained.
    if (count != 2)
    {
        continue;
    }

    foreach (Match pictureMatch in RegexHelper.match_Picture)
    {
        string rawSrc = pictureMatch.Groups[1].Value;
        // Only add the scheme when the src does not already carry one. Prepending "http:"
        // unconditionally turned absolute URLs into "http:http://..." and made the former
        // src.Contains("http") check always true.
        string src = rawSrc.StartsWith("http", StringComparison.OrdinalIgnoreCase)
            ? rawSrc
            : "http:" + rawSrc;
        if (src.Contains(".svg"))
        {
            // SVG images are skipped, as before.
            continue;
        }
        if (File.Exists(commodityImagePath))
        {
            File.Delete(commodityImagePath);
        }
        client.DownloadFile(src, commodityImagePath);
    }
}
最后是商品价格
// Price lookup: for every product ID held in RegexHelper.match_ID, query the price
// endpoint and keep the last value captured by RegexHelper.regex_Price.
foreach (Match skuMatch in RegexHelper.match_ID)
{
    merchandiseValue.ID = skuMatch.Groups[1].Value;

    var priceUrl = "https://p.3.cn/prices/mgets?skuIds=J_" + merchandiseValue.ID;
    var priceResponse = HttpCrawelHelper.GetData(priceUrl, Encoding.Default);
    RegexHelper.match_Price = RegexHelper.regex_Price.Matches(priceResponse);

    foreach (Match priceMatch in RegexHelper.match_Price)
    {
        // Each match overwrites the previous one, so the last match wins.
        merchandiseValue.MarketPrice = priceMatch.Groups[1].Value;
    }
}
技术要点大概就这么多;具体要抓取哪些内容、如何筛选,需要自行修改代码逻辑。
excel的操作下一篇更新