一、引入依赖包（在 pom.xml 中添加）
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.4.3</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.4.3</version>
</dependency>
二、代码
package com.pz998.quartz.spider;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang.StringUtils;
import org.eclipse.jetty.util.MultiMap;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import com.pz998.rpc.model.entity.BdDepartmentDiseaseRelaRpc;
import com.pz998.rpc.model.entity.BdDepartmentRpc;
import com.pz998.rpc.model.entity.BdDiseaseDoctorRelaRpc;
import com.pz998.rpc.model.entity.BdDoctorRpc;
import com.pz998.rpc.model.entity.BdHospitalRpc;
import net.minidev.json.JSONArray;
import net.minidev.json.JSONObject;
import net.minidev.json.parser.JSONParser;
import net.minidev.json.parser.ParseException;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
import us.codecraft.webmagic.selector.JsonPathSelector;
import us.codecraft.xsoup.Xsoup;
public class YiBaiduProcessor implements PageProcessor{
/**
 * Literal (non-regex) URL: page 1 of the hospital list for cityId=371 with pageSize=10.
 * It matches {@link #HOSPITAL_LIST_URL}. Presumably the crawl entry point; the Spider
 * setup is not visible in this excerpt, so confirm against the caller.
 */
private static final String START_URL = "https://yi.baidu.com/pc/hospital/list?cityId=371&pageSize=10&page=1";
/**
 * Regex source for hospital detail pages ({@code /pc/hospital/index}): fixed
 * {@code zt=pcpinzhuan}, empty {@code zt_ext}, numeric {@code pvid}, and a
 * non-whitespace {@code key}.
 */
private static final String HOSPITAL_DETAIL_URL = "https://yi\\.baidu\\.com/pc/hospital/index\\?zt=pcpinzhuan&zt_ext=&pvid=\\d+&key=\\S+";
/**
 * Regex source for hospital list pages ({@code /pc/hospital/list}): any numeric
 * {@code cityId} and {@code page}, with {@code pageSize} fixed at 10.
 * {@code \d++} is a possessive quantifier (one or more digits, no backtracking).
 */
private static final String HOSPITAL_LIST_URL = "https://yi\\.baidu\\.com/pc/hospital/list\\?cityId=\\d++&pageSize=10&page=\\d++";
/** Regex source for hospital info pages ({@code /pc/hospital/info}) with a non-whitespace {@code key}. */
private static final String HOSPITAL_INFO_URL ="https://yi\\.baidu\\.com/pc/hospital/info\\?key=\\S+";
/**
 * Regex source for department detail pages ({@code /pc/admindepartment/detail}):
 * word-character {@code zt}, empty {@code zt_ext}, and numeric {@code pvid},
 * {@code hosId} and {@code adminDepartId}, in exactly that parameter order.
 */
private static final String DEPT_INFO_URL = "https://yi\\.baidu\\.com/pc/admindepartment/detail\\?zt=\\w+&zt_ext=&pvid=\\d+&hosId=\\d+&adminDepartId=\\d+";
/** Regex source for a hospital's all-departments page ({@code /pc/hospital/alldep}) with a non-whitespace {@code key}. */
private static final String HOSPITAL_DEPT_URL ="https://yi\\.baidu\\.com/pc/hospital/alldep\\?key=\\S+";
/**
 * Regex source for the paginated doctor-list endpoint
 * ({@code /pc/admindepartment/doctorlist}): numeric {@code page},
 * {@code adminDepartId} and {@code hosId}; every other query parameter is fixed
 * ({@code pageSize=8}, all filters 0), in exactly this parameter order.
 *
 * The parameter after {@code cityId} is {@code regionId}. An earlier copy of this
 * literal had {@code &reg} collapsed into the (R) sign by HTML-entity decoding,
 * which made the pattern unmatchable against real URLs.
 */
private static final String DOCTOR_LIST_URL = "https://yi\\.baidu\\.com/pc/admindepartment/doctorlist\\?diseaseId=0&medTitle=0&serviceType=0&page=\\d+&pageSize=8&provId=0&cityId=0&regionId=0&adminDepartId=\\d+&hosId=\\d+";
/**
 * Regex source for doctor detail pages ({@code /pc/doctor/detailpage}):
 * word-character {@code zt}, empty {@code zt_ext}, {@code pvid} fixed at 0,
 * and a numeric {@code doctorId}.
 */
private static final String DOCTOR_INFO_URL = "https://yi\\.baidu\\.com/pc/doctor/detailpage\\?zt=\\w+&zt_ext=&pvid=0&doctorId=\\d+";
// Sample hospital-info URL; the key is a percent-encoded UTF-8 hospital name:
//https://yi.baidu.com/pc/hospital/info?key=%E6%AD%A6%E6%B1%89%E5%B8%82%E5%A6%87%E5%A5%B3%E5%84%BF%E7%AB%A5%E5%8C%BB%E7%96%97%E4%BF%9D%E5%81%A5%E4%B8%AD%E5%BF%83
// All-departments URL prefix (key value omitted):
//https://yi.baidu.com/pc/hospital/alldep?key=
/** WebMagic site configuration obtained from {@code Site.me()}; any further customisation is not visible in this excerpt. */
private Site site = Site.me();
/** The string "0". Presumably the status value that marks a successful response; confirm against usage. */
public static final String STATE_SUCCESS = "0";
/**
 * City id (as a string) to Chinese city name, filled in by the static initializer
 * that follows, e.g. "371" maps to Wuhan and "1" to Beijing.
 * NOTE(review): this is a public, mutable HashMap, so any caller can modify it.
 */
public static final Map<String,String> CITY_MAP = new HashMap<String,String>();
static{
CITY_MAP.put("371","武汉");
CITY_MAP.put("1", "北京");
CITY_MAP.put("2",