WebMagic爬虫抓取京东数据

本文介绍了一种利用WebMagic框架抓取京东店铺商品信息的方法,包括配置Maven依赖、定义爬虫逻辑及提取商品详情等内容。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

抓取京东店铺商品信息

webmagic使用maven管理

<!-- WebMagic core module (provides Spider, Page, Site and PageProcessor used below) -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.2</version>
</dependency>
<!-- WebMagic extension module; keep its version aligned with webmagic-core -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.2</version>
</dependency>
// Pass in the shop URL; the returned value is the status flag set by the crawler.
flag= WebMagic.running(shopUrl);
// Some JD shop product data can be scraped straight from the page; other data
// has to be obtained by analysing the JS request that actually carries it.
// To analyse the JS: open the browser dev tools (F12) and reload the page (F5).
// Then assemble the JS request URL from values found on the page.
public class WebMagic implements PageProcessor{
	static Integer flag ;
	public static Integer running(String url) {


		Spider.create(new WebMagic())
		.addUrl(url)
		.addPipeline(new ConsolePipeline())
		.thread(5).run();	
		return flag;
	}


	public static final String URL_POST ="(http[s]{0,1})://\\w+\\.jd\\.com/view_search-\\S+\\.html";//正则匹配规则
	public static final String URL_ZIYING ="https://list.jd.com/list.html?\\w";// //自营店铺




	public static final String URL_ADVANCE ="(http[s]{0,1})://mall.jd\\.com/\\S";//匹配正则url	 //


	private Site site = Site.me().setRetryTimes(3).setSleepTime(100);


	public Site getSite() {
		return site;
	}


	public void process(Page page) {
		if(page.getUrl().regex(URL_POST).match()){
			//获取商品类目categoryId和appId
			String categoryId="";
			String appId="";
			String orderBy="";
			String direction="";
			String pageSize="";
			String pageNo="";
			String url=page.getUrl().toString();
			String [] sub_url_array = url.split("-");  
			if (sub_url_array != null && sub_url_array.length >0) {
				
					appId =sub_url_array[1];
					categoryId=sub_url_array[2];
					orderBy=sub_url_array[3];
					direction=sub_url_array[4];
					pageSize =sub_url_array[5];
					//pageNo=sub_url_array[6];
					pageNo=sub_url_array[6].replaceAll(".html", "");
				


			}
			page.putField("pageInstanceId",page.getHtml().xpath("//[@id='pageInstance_id']/@value").all());
			page.putField("venderId",page.getHtml().xpath("//[@id='vender_id']/@value").all());
		
				page.putField("instanceid",page.getHtml().xpath("//div[@class='J_LayoutWrap d-layout-wrap d-enable d-w990']/div/div[2]/div[@class='m_render_structure']/@m_render_instance_id").all());
				page.putField("prototypeid",page.getHtml().xpath("//div[@class='J_LayoutWrap d-layout-wrap d-enable d-w990']/div/div[2]/div[@class='m_render_structure loading']/@m_render_prototype_id").all());
				page.putField("templateId",page.getHtml().xpath("//div[@class='J_LayoutWrap d-layout-wrap d-enable d-w990']/div/div[2]/div[@class='m_render_structure loading']/@m_render_template_id").all());


			


			page.putField("shopId",page.getHtml().xpath("//[@id='shop_id']/@value").all());


			List<String> pageInstanceIds = (List<String>) page.getResultItems().get("pageInstanceId");
			List<String> venderIds = (List<String>) page.getResultItems().get("venderId");


			List<String> instanceids=(List<String>) page.getResultItems().get("instanceid");
			List<String> prototypeids=(List<String>) page.getResultItems().get("prototypeid");
			List<String> templateIds=(List<String>) page.getResultItems().get("templateId");




			List<String> shopIds=(List<String>) page.getResultItems().get("shopId");


			String pageInstanceId="";
			String venderId="";
			String instanceid="";
			String prototypeid="";
			String templateId="";
			String shopId="";


			if (pageInstanceIds != null && pageInstanceIds.size()>0) {
				pageInstanceId=pageInstanceIds.get(0);
				venderId=venderIds.get(0);
				instanceid=instanceids.get(0);
				prototypeid=prototypeids.get(0);
				templateId=templateIds.get(0);
				
				shopId=shopIds.get(0);
			}


			//当前时间戳获取
			String res;
			Date date = new Date();
			long ts = date.getTime();
			res = String.valueOf(ts);


			for (int i = 1; i <5; i++) {
				String surl="";
				
					surl="http://module-jshop.jd.com/module/getModuleHtml.html?appId="+appId+"&orderBy="+orderBy+"&pageNo="+i+"&direction="+direction+"&categoryId="+categoryId+"&pageSize="+pageSize+"&pagePrototypeId=8&pageInstanceId="+pageInstanceId+"&moduleInstanceId="+instanceid+"&prototypeId="+prototypeid+"&templateId="+templateId+"&layoutInstanceId="+instanceid+"&origin=0&shopId="+shopId+"&venderId="+venderId+"&callback=jshop_module_render_callback&_="+res;
				
				JDItemJsonPreocessor.running(surl);
			}
			flag=200; 
		}
	}
	
}
// Fetches product data: price, SKU, name and similar information.
// Static map returned by running(String); being static, it is shared by all calls.
static Map<String,String> maps = new HashMap<String, String>();
		private PageInfo pages = new PageInfo();
		private ShopItem shopitem;
		private ShopInfo shopinfo;
		private List<ShopInfo> shopInfolist;// shop information
		private List<ShopItem> shopItemlist;
		    
	    public static Map<String,String> running(String url) {
	    	
	        Spider.create(new JDItemJsonPreocessor()).addUrl(url).run();
	        return maps;
	    }


	   // Spring context loaded from the classpath XML; the two services are looked up by bean name.
	   // NOTE(review): these are instance initialisers, so a context is presumably built for every
	   // processor instance that is created - confirm whether that is intended.
	   private ApplicationContext a=new ClassPathXmlApplicationContext("spring/applicationContext-db.xml"); 
	    ShopInfoService shopInfoService=(ShopInfoService) a.getBean("ShopInfoServiceImpl");
	    ShopItemService shopItemService=(ShopItemService) a.getBean("ShopItemServiceImpl");


	    // Regex for the module-jshop getModuleHtml.html request URL (with query string).
	    public static final String URL_LIST = "(http[s]{0,1})://module-jshop\\.jd\\.com/module/getModuleHtml\\.html\\?[\\w-_/?&=#%:]*";
	    //public static final String URL_ADVANCE ="(http[s]{0,1})://\\w+\\.jd\\.com/\\S+\\.html";// URL matching regex
	    public static final String URL_ADVANCE ="(http[s]{0,1})://mall.jd\\.com/\\S+\\.html";// URL matching regex (mall.jd.com pages)
	    public static final String URL_ZIYING ="https://list.jd.com/list.html?\\w";// URL matching regex (list.jd.com list pages)

// Static map returned by running(String); being static, it is shared by all calls.
static Map<String,String> maps = new HashMap<String, String>();
		private PageInfo pages = new PageInfo();
		private ShopItem shopitem;
		private ShopInfo shopinfo;
		private List<ShopInfo> shopInfolist;// shop information
		private List<ShopItem> shopItemlist;
		    
	    public static Map<String,String> running(String url) {
	    	
	        Spider.create(new JDItemJsonPreocessor()).addUrl(url).run();
	        return maps;
	    }


	   // Spring context loaded from the classpath XML; the two services are looked up by bean name.
	   // NOTE(review): these are instance initialisers, so a context is presumably built for every
	   // processor instance that is created - confirm whether that is intended.
	   private ApplicationContext a=new ClassPathXmlApplicationContext("spring/applicationContext-db.xml"); 
	    ShopInfoService shopInfoService=(ShopInfoService) a.getBean("ShopInfoServiceImpl");
	    ShopItemService shopItemService=(ShopItemService) a.getBean("ShopItemServiceImpl");


	    // Regex for the module-jshop getModuleHtml.html request URL (with query string).
	    public static final String URL_LIST = "(http[s]{0,1})://module-jshop\\.jd\\.com/module/getModuleHtml\\.html\\?[\\w-_/?&=#%:]*";
	    //public static final String URL_ADVANCE ="(http[s]{0,1})://\\w+\\.jd\\.com/\\S+\\.html";// URL matching regex
	    public static final String URL_ADVANCE ="(http[s]{0,1})://mall.jd\\.com/\\S+\\.html";// URL matching regex (mall.jd.com pages)
	    public static final String URL_ZIYING ="https://list.jd.com/list.html?\\w";// URL matching regex (list.jd.com list pages)

// Excerpt from the item processor's page handling: collect item ids, names and image URLs.
// "id" is read from the jdprice attribute, "name" from the link text, "img" from the img "original" attribute.
page.putField("id",page.getHtml().xpath("//div/div/div/div[2]/ul/li/div/div[3]/div[3]/div/span[2]/@jdprice").all());
							page.putField("name",page.getHtml().xpath("//div/div/div/div[2]/ul/li/div/div[3]/div[2]/a/text()").all());
							page.putField("img",page.getHtml().xpath("//div/div/div/ul/li/div/div[1]/a/img/@original").all());
						
	 		  // Read the three lists back from the result items.
	 		  List<String> ids = (List<String>) page.getResultItems().get("id");
		      List<String> name = (List<String>) page.getResultItems().get("name");
		      List<String> imgs=(List<String>) page.getResultItems().get("img");
		      
 	      // Build the price request URL for all ids and fetch the prices (keyed by "J_" + id).
 	      String makerUrl = makerUrl(ids);
	      Map<String, String> running = JDJsonPreocessor.running(makerUrl);// assemble the price JS request
	      // NOTE(review): the loop indexes ids and imgs by the size of name - presumably the three lists are parallel; confirm.
	      for (int i = 0; i < name.size(); i++) {
	       String price = running.get("J_"+ids.get(i));
	    	  String ItemId=ids.get(i);
	    	  String productname =name.get(i);
	    	  String pImg="";
	    	  // Strip every backslash-quote sequence from the image path and prefix the scheme.
	    	  pImg="http:"+imgs.get(i).replaceAll("\\\\\"", "");
 public String makerUrl(List<String> ids){
	    	  StringBuffer sb = new StringBuffer();
	    	  for (String id : ids) {
	    	   sb.append("J_"+id+",");
	    	  }
	    	  String substring = sb.substring(0, sb.length()-1);
	    	  //获取时间戳
	     	   String res;
	           Date date = new Date();
	           long ts = date.getTime();
	           res = String.valueOf(ts);
	    	  return "http://p.3.cn/prices/mgets?callback=jQuery3944635&skuIds="+substring+"&_="+res;
	    	 }
//获取价格信息
package com.huanovo.fxprice.service.impl;


import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;


import com.huanovo.fxprice.util.JsonUtil;




import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;




public class JDJsonPreocessor implements PageProcessor{
	 static Map<String,String> maps = new HashMap<String, String>();
	    
	    
	    public static Map<String,String> running(String url) {


	        Spider.create(new JDJsonPreocessor()).addUrl(url).run();
	        return maps;
	    }


	    private Site site = Site.me()
	    .setRetryTimes(3)
	    .setSleepTime(100)
	    .addHeader("Accept-Encoding", "/")
	    .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.59 Safari/537.36");


	    public Site getSite() {
	        return site;
	    }


	    public void process(Page page) {
	        page.setSkip(true);
	        String text = page.getRawText();
	        int begin = text.indexOf("[");
	        int end = text.indexOf("]");
	        String substring = text.substring(begin, end + 1);
	        String jsonName = "result";
	        String json = "{\"" + jsonName + "\":" + substring + "}";


	        Map<String, Object> map = JsonUtil.jsonToMap(json);
	        List<Map<String, Object>> list = (List<Map<String, Object>>) map.get(jsonName);
	        for (Map<String, Object> map1 : list) {
	            String key = map1.get("id").toString();
	            String value = map1.get("p").toString();
	            maps.put(key, value);
	        }
	    }
}

主要就是三步:1. 拿到链接并分析页面数据;2. 模拟链接访问;3. 用 XPath 提取页面信息。


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值