使用 WebMagic 抓取京东店铺商品信息
WebMagic 通过 Maven 引入,依赖如下:
<!-- WebMagic core module (artifact webmagic-core), version 0.7.2. -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.2</version>
</dependency>
<!-- WebMagic extension module (artifact webmagic-extension); kept at the same version as the core. -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.2</version>
</dependency>
//传店铺url
flag= WebMagic.running(shopUrl);
//京东店铺的商品信息,有的可以直接从页面 HTML 中抓取,有的需要分析返回数据的 js 请求
//分析 js:在浏览器中按 F12 打开开发者工具,再按 F5 刷新页面,找到返回商品数据的 js 请求
//根据页面中的信息,拼接出该 js 请求的 URL
public class WebMagic implements PageProcessor{
static Integer flag ;
public static Integer running(String url) {
Spider.create(new WebMagic())
.addUrl(url)
.addPipeline(new ConsolePipeline())
.thread(5).run();
return flag;
}
public static final String URL_POST ="(http[s]{0,1})://\\w+\\.jd\\.com/view_search-\\S+\\.html";//正则匹配规则
public static final String URL_ZIYING ="https://list.jd.com/list.html?\\w";// //自营店铺
public static final String URL_ADVANCE ="(http[s]{0,1})://mall.jd\\.com/\\S";//匹配正则url //
private Site site = Site.me().setRetryTimes(3).setSleepTime(100);
public Site getSite() {
return site;
}
public void process(Page page) {
if(page.getUrl().regex(URL_POST).match()){
//获取商品类目categoryId和appId
String categoryId="";
String appId="";
String orderBy="";
String direction="";
String pageSize="";
String pageNo="";
String url=page.getUrl().toString();
String [] sub_url_array = url.split("-");
if (sub_url_array != null && sub_url_array.length >0) {
appId =sub_url_array[1];
categoryId=sub_url_array[2];
orderBy=sub_url_array[3];
direction=sub_url_array[4];
pageSize =sub_url_array[5];
//pageNo=sub_url_array[6];
pageNo=sub_url_array[6].replaceAll(".html", "");
}
page.putField("pageInstanceId",page.getHtml().xpath("//[@id='pageInstance_id']/@value").all());
page.putField("venderId",page.getHtml().xpath("//[@id='vender_id']/@value").all());
page.putField("instanceid",page.getHtml().xpath("//div[@class='J_LayoutWrap d-layout-wrap d-enable d-w990']/div/div[2]/div[@class='m_render_structure']/@m_render_instance_id").all());
page.putField("prototypeid",page.getHtml().xpath("//div[@class='J_LayoutWrap d-layout-wrap d-enable d-w990']/div/div[2]/div[@class='m_render_structure loading']/@m_render_prototype_id").all());
page.putField("templateId",page.getHtml().xpath("//div[@class='J_LayoutWrap d-layout-wrap d-enable d-w990']/div/div[2]/div[@class='m_render_structure loading']/@m_render_template_id").all());
page.putField("shopId",page.getHtml().xpath("//[@id='shop_id']/@value").all());
List<String> pageInstanceIds = (List<String>) page.getResultItems().get("pageInstanceId");
List<String> venderIds = (List<String>) page.getResultItems().get("venderId");
List<String> instanceids=(List<String>) page.getResultItems().get("instanceid");
List<String> prototypeids=(List<String>) page.getResultItems().get("prototypeid");
List<String> templateIds=(List<String>) page.getResultItems().get("templateId");
List<String> shopIds=(List<String>) page.getResultItems().get("shopId");
String pageInstanceId="";
String venderId="";
String instanceid="";
String prototypeid="";
String templateId="";
String shopId="";
if (pageInstanceIds != null && pageInstanceIds.size()>0) {
pageInstanceId=pageInstanceIds.get(0);
venderId=venderIds.get(0);
instanceid=instanceids.get(0);
prototypeid=prototypeids.get(0);
templateId=templateIds.get(0);
shopId=shopIds.get(0);
}
//当前时间戳获取
String res;
Date date = new Date();
long ts = date.getTime();
res = String.valueOf(ts);
for (int i = 1; i <5; i++) {
String surl="";
surl="http://module-jshop.jd.com/module/getModuleHtml.html?appId="+appId+"&orderBy="+orderBy+"&pageNo="+i+"&direction="+direction+"&categoryId="+categoryId+"&pageSize="+pageSize+"&pagePrototypeId=8&pageInstanceId="+pageInstanceId+"&moduleInstanceId="+instanceid+"&prototypeId="+prototypeid+"&templateId="+templateId+"&layoutInstanceId="+instanceid+"&origin=0&shopId="+shopId+"&venderId="+venderId+"&callback=jshop_module_render_callback&_="+res;
JDItemJsonPreocessor.running(surl);
}
flag=200;
}
}
}
//获取商品数据,价格sku,名称等信息
// Static map returned by running(String); shared by every instance of this processor.
static Map<String,String> maps = new HashMap<String, String>();
// Project-declared paging holder; its use is not visible in this fragment.
private PageInfo pages = new PageInfo();
// Working references for a single item / shop record (assigned outside this fragment).
private ShopItem shopitem;
private ShopInfo shopinfo;
private List<ShopInfo> shopInfolist;// shop information
private List<ShopItem> shopItemlist;
/**
 * Runs a spider over the given URL using a new JDItemJsonPreocessor and
 * returns the static {@code maps} field once the run has finished.
 *
 * @param url URL to crawl
 * @return the static {@code maps} map
 */
public static Map<String,String> running(String url) {
    Spider spider = Spider.create(new JDItemJsonPreocessor());
    spider.addUrl(url);
    spider.run();
    return maps;
}
// Spring context loaded from spring/applicationContext-db.xml on the classpath.
// NOTE(review): this is an instance field, so every new processor instance builds
// another context and nothing visible here closes it — consider sharing a single one.
private ApplicationContext a=new ClassPathXmlApplicationContext("spring/applicationContext-db.xml");
// Service beans looked up by name from the context above.
ShopInfoService shopInfoService=(ShopInfoService) a.getBean("ShopInfoServiceImpl");
ShopItemService shopItemService=(ShopItemService) a.getBean("ShopItemServiceImpl");
// Pattern for the module-jshop getModuleHtml.html request URL including its query string.
public static final String URL_LIST = "(http[s]{0,1})://module-jshop\\.jd\\.com/module/getModuleHtml\\.html\\?[\\w-_/?&=#%:]*";
//public static final String URL_ADVANCE ="(http[s]{0,1})://\\w+\\.jd\\.com/\\S+\\.html";// URL matching regex
// NOTE(review): the '.' in "mall.jd" is unescaped and matches any character.
public static final String URL_ADVANCE ="(http[s]{0,1})://mall.jd\\.com/\\S+\\.html";// URL matching regex
// NOTE(review): '.' and '?' are unescaped here, so '?' makes the preceding 'l' optional
// instead of matching a literal question mark.
public static final String URL_ZIYING ="https://list.jd.com/list.html?\\w";// URL matching regex
// NOTE(review): these six declarations repeat, token for token, the ones that open this
// fragment; a class cannot declare the same field twice, so one copy has to go.
// Static map returned by running(String); shared by every instance of this processor.
static Map<String,String> maps = new HashMap<String, String>();
private PageInfo pages = new PageInfo();
private ShopItem shopitem;
private ShopInfo shopinfo;
private List<ShopInfo> shopInfolist;// shop information
private List<ShopItem> shopItemlist;
/**
 * Crawls the given URL with a fresh JDItemJsonPreocessor and, after the
 * spider has run, hands back the static {@code maps} field.
 *
 * @param url URL to crawl
 * @return the static {@code maps} map
 */
public static Map<String,String> running(String url) {
    JDItemJsonPreocessor processor = new JDItemJsonPreocessor();
    Spider crawler = Spider.create(processor).addUrl(url);
    crawler.run();
    return maps;
}
// NOTE(review): second copy of the context, service and URL-pattern declarations that
// already appear earlier in this fragment.
// Spring context loaded from spring/applicationContext-db.xml on the classpath; being an
// instance field, it is built again for every processor instance.
private ApplicationContext a=new ClassPathXmlApplicationContext("spring/applicationContext-db.xml");
// Service beans looked up by name from the context above.
ShopInfoService shopInfoService=(ShopInfoService) a.getBean("ShopInfoServiceImpl");
ShopItemService shopItemService=(ShopItemService) a.getBean("ShopItemServiceImpl");
// Pattern for the module-jshop getModuleHtml.html request URL including its query string.
public static final String URL_LIST = "(http[s]{0,1})://module-jshop\\.jd\\.com/module/getModuleHtml\\.html\\?[\\w-_/?&=#%:]*";
//public static final String URL_ADVANCE ="(http[s]{0,1})://\\w+\\.jd\\.com/\\S+\\.html";// URL matching regex
public static final String URL_ADVANCE ="(http[s]{0,1})://mall.jd\\.com/\\S+\\.html";// URL matching regex
public static final String URL_ZIYING ="https://list.jd.com/list.html?\\w";// URL matching regex
// Fragment of a process(Page) body: the enclosing method header and the end of the
// for-loop below are not part of this excerpt.
// Collect three parallel lists from the module HTML. The "id" field is read from the
// jdprice attribute and is used below as the SKU id.
page.putField("id",page.getHtml().xpath("//div/div/div/div[2]/ul/li/div/div[3]/div[3]/div/span[2]/@jdprice").all());
page.putField("name",page.getHtml().xpath("//div/div/div/div[2]/ul/li/div/div[3]/div[2]/a/text()").all());
page.putField("img",page.getHtml().xpath("//div/div/div/ul/li/div/div[1]/a/img/@original").all());
// Read the lists back out of the result items.
List<String> ids = (List<String>) page.getResultItems().get("id");
List<String> name = (List<String>) page.getResultItems().get("name");
List<String> imgs=(List<String>) page.getResultItems().get("img");
// Build the price request URL for all ids and fetch the prices in one call.
String makerUrl = makerUrl(ids);
Map<String, String> running = JDJsonPreocessor.running(makerUrl);// build and fetch the price js
// NOTE(review): the loop is bounded by name.size() but also indexes ids and imgs;
// if the three XPaths match different numbers of nodes, get(i) will throw.
for (int i = 0; i < name.size(); i++) {
// Prices are looked up under the key "J_" + id.
String price = running.get("J_"+ids.get(i));
String ItemId=ids.get(i);
String productname =name.get(i);
String pImg="";
// Prefix the image path with "http:" and drop every backslash-quote (\") sequence.
pImg="http:"+imgs.get(i).replaceAll("\\\\\"", "");
/**
 * Builds the price request URL for the given SKU ids.
 *
 * <p>Each id is prefixed with {@code J_} and the results are joined with commas
 * into the {@code skuIds} parameter; the current time in milliseconds is
 * appended as the {@code _} parameter.
 *
 * @param ids SKU ids; {@code null} or empty yields a URL with an empty {@code skuIds} value
 * @return the price request URL
 */
public String makerUrl(List<String> ids) {
    StringBuilder skuIds = new StringBuilder();
    if (ids != null) {
        for (String id : ids) {
            // Separator goes in front of every id but the first, so there is no
            // trailing comma to cut off (which used to fail on an empty list).
            if (skuIds.length() > 0) {
                skuIds.append(',');
            }
            skuIds.append("J_").append(id);
        }
    }
    long timestamp = System.currentTimeMillis();
    return "http://p.3.cn/prices/mgets?callback=jQuery3944635&skuIds=" + skuIds + "&_=" + timestamp;
}
//获取价格信息
package com.huanovo.fxprice.service.impl;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.huanovo.fxprice.util.JsonUtil;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
public class JDJsonPreocessor implements PageProcessor{
static Map<String,String> maps = new HashMap<String, String>();
public static Map<String,String> running(String url) {
Spider.create(new JDJsonPreocessor()).addUrl(url).run();
return maps;
}
private Site site = Site.me()
.setRetryTimes(3)
.setSleepTime(100)
.addHeader("Accept-Encoding", "/")
.setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.59 Safari/537.36");
public Site getSite() {
return site;
}
public void process(Page page) {
page.setSkip(true);
String text = page.getRawText();
int begin = text.indexOf("[");
int end = text.indexOf("]");
String substring = text.substring(begin, end + 1);
String jsonName = "result";
String json = "{\"" + jsonName + "\":" + substring + "}";
Map<String, Object> map = JsonUtil.jsonToMap(json);
List<Map<String, Object>> list = (List<Map<String, Object>>) map.get(jsonName);
for (Map<String, Object> map1 : list) {
String key = map1.get("id").toString();
String value = map1.get("p").toString();
maps.put(key, value);
}
}
}
主要流程:1. 拿到链接,分析页面数据;2. 模拟请求访问对应的链接;3. 用 XPath 提取页面信息。