package com.microbekb.crawler.cnki; import com.microbekb.crawler.cnki.jpa.CnkiSpacePaper; import com.microbekb.crawler.cnki.jpa.CnkiSpacePaperRepository; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Component; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.selector.Selectable; import java.util.ArrayList; import java.util.HashSet; import java.util.List; /** * Created by DELL on 2017/2/17. */ @Component public class CnkiConferencePaperCrawler implements PageProcessor { private Site site = Site.me().setRetryTimes(3).setSleepTime(500); //设置参数 private static HashSet<String> urls=new HashSet<>(); private static final org.slf4j.Logger log = LoggerFactory.getLogger(CnkiConferencePaperCrawler.class); @Autowired private CnkiSpacePaperRepository cnkiSpacePaperRepository; //用于与数据库关联,通过spring注入 @Autowired private CnkiConferencePaperPipeline cnkiJournalPaperPipeline; //用于处理数据,通过spring注入 @Override public void process(Page page) { //需要爬取的字段 String url= page.getUrl().toString(); String title_cn= ""; String title_en= ""; String organization = ""; String abstract_cn= ""; String abstract_en= ""; String proceedings_name= ""; String conference_name= ""; String date= ""; String place= ""; String code= ""; String organizor= ""; String foundation=""; String authors_cn = ""; String authors_en = ""; //#chTitle title_cn = page.getHtml().xpath("//span[@id='chTitle']/text()").toString(); title_en = page.getHtml().xpath("//span[@id='enTitle']/text()").toString(); List<Selectable> pList = page.getHtml().xpath("//div[@class='summary']/p").nodes(); for(Selectable p:pList){ //System.out.println(p.toString()); if(p.toString().contains("作者")){ List<String> author_cns = new ArrayList<>(); author_cns = p.xpath("//a[@class='KnowledgeNetLink']/text()").all(); for(String 
str:author_cns){ authors_cn = authors_cn+str+";"; } } if(p.toString().contains("机构")){ List<String> organizations = new ArrayList<>(); organizations = p.xpath("//a[@class='KnowledgeNetLink']/text()").all(); for(String str:organizations){ organization = organization+str+";"; } } if(p.toString().contains("摘要")){ abstract_cn = p.xpath("//span/text()").toString(); } } //#content > div:nth-child(1) > div:nth-child(5) > ul:nth-child(1) > li > a proceedings_name = page.getHtml().xpath("//div[@class='summary']/ul/li/a/text()").toString(); List<String> itemStrList = page.getHtml().xpath("//div[@class='summary']/ul/li/text()").all(); for(String str:itemStrList){ if(str.contains("会议名称")){ conference_name = str.replaceAll("【会议名称】","").trim(); } if(str.contains("会议时间")){ date = str.replaceAll("【会议时间】","").trim(); } if(str.contains("会议地点")){ place = str.replaceAll("【会议地点】","").trim(); } if(str.contains("分类号")){ code = str.replaceAll("【分类号】","").trim(); } if(str.contains("主办单位")){ organizor = str.replaceAll("【主办单位】","").trim(); } } foundation = page.getHtml().xpath("//div[@class='summary']/div[@class='keywords']/text()").toString(); page.putField("url",url); page.putField("title_cn",title_cn); page.putField("title_en",title_en); page.putField("organization",organization); page.putField("abstract_cn",abstract_cn); page.putField("abstract_en",abstract_en); page.putField("proceedings_name",proceedings_name); page.putField("conference_name",conference_name); page.putField("date",date); page.putField("place",place); page.putField("code",code); page.putField("organizor",organizor); page.putField("foundation",foundation); page.putField("authors_cn",authors_cn); page.putField("authors_en",authors_en); } @Override public Site getSite() { return site; }
//测试url,初始url地址,可根据需要修改 private void addUrlTest(){ urls.add("http://www.cnki.net/KCMS/detail/detail.aspx?dbcode=CPFD&filename=KAXH201012001028"); }
//调用此方法,为爬虫入口 public void start(){ //CnkiJournalPaperCrawler cnkiJournalPaperCrawler = new CnkiJournalPaperCrawler(); addTest(); //addUrlText(); Spider spider = Spider.create(this); for(String url:urls){ spider = spider.addUrl(url); } try{ spider.thread(1).addPipeline(cnkiJournalPaperPipeline).run(); }catch (Exception e){ }
}
package com.microbekb.crawler.cnki; import com.microbekb.crawler.cnki.jpa.CnkiConferencePaper; import com.microbekb.crawler.cnki.jpa.CnkiConferencePaperRepository; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.context.annotation.ComponentScan; import org.springframework.stereotype.Component; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.pipeline.Pipeline; /** * Created by DELL on 2017/2/17. */ @Component @ComponentScan public class CnkiConferencePaperPipeline implements Pipeline { public static int count=0; @Autowired CnkiConferencePaperRepository cnkiConferencePaperRepository; //用于存储数据,通过spring注入 @Override public void process(ResultItems resultItems, Task task) { String url= ""; String title_cn= ""; String title_en= ""; String organization = ""; String abstract_cn= ""; String abstract_en= ""; String proceedings_name= ""; String conference_name= ""; String date= ""; String place= ""; String code= ""; String organizor= ""; String foundation=""; String authors_cn = ""; String authors_en = ""; url = resultItems.get("url"); title_cn = resultItems.get("title_cn"); title_en = resultItems.get("title_en"); organization = resultItems.get("organization"); abstract_cn = resultItems.get("abstract_cn"); abstract_en = resultItems.get("abstract_en"); proceedings_name = resultItems.get("proceedings_name"); conference_name = resultItems.get("conference_name"); date = resultItems.get("date"); place = resultItems.get("place"); code = resultItems.get("code"); organizor = resultItems.get("organizor"); foundation = resultItems.get("foundation"); authors_cn = resultItems.get("authors_cn"); authors_en = resultItems.get("authors_en"); CnkiConferencePaper cnkiConferencePaper = new CnkiConferencePaper(); String cnkiId = ""; String[] strs = url.split("filename"); cnkiId = strs[1].replaceAll("=",""); cnkiConferencePaper.setCnkiId(cnkiId); cnkiConferencePaper.setUrl(url); 
cnkiConferencePaper.setTitleCn(title_cn); cnkiConferencePaper.setTitleEn(title_en); cnkiConferencePaper.setOrganization(organization); cnkiConferencePaper.setAbstractCn(abstract_cn); cnkiConferencePaper.setAbstractEn(abstract_en); cnkiConferencePaper.setProceedingsName(proceedings_name); cnkiConferencePaper.setConferenceName(conference_name); cnkiConferencePaper.setDate(date); cnkiConferencePaper.setPlace(place); cnkiConferencePaper.setCode(code); cnkiConferencePaper.setAssociation(organizor); cnkiConferencePaper.setAuthorsCn(authors_cn); cnkiConferencePaper.setAuthorsEn(authors_en); cnkiConferencePaperRepository.save(cnkiConferencePaper); //保存数据 System.out.println("已完成"+ (count++)+"条" ); } }
package com.microbekb.crawler.cnki.jpa; import javax.persistence.Entity; import javax.persistence.Id; /** * Created by DELL on 2017/2/18. */
// Entity class; corresponds to a database table.
/**
 * JPA entity holding the metadata of one CNKI conference paper. The identifier is
 * {@code cnkiId}; every other property is a plain string with a getter and a setter.
 */
@Entity
public class CnkiConferencePaper {

    @Id
    private String cnkiId;

    private String url;
    private String titleCn;
    private String titleEn;
    private String organization;
    private String abstractCn;
    private String abstractEn;
    private String proceedingsName;
    private String conferenceName;
    private String date;
    private String place;
    private String code;
    private String association;
    private String authorsCn;
    private String authorsEn;
    private String foundation;

    // Accessors are listed in field-declaration order.

    /** @return the entity identifier */
    public String getCnkiId() {
        return this.cnkiId;
    }

    /** @param cnkiId the entity identifier */
    public void setCnkiId(String cnkiId) {
        this.cnkiId = cnkiId;
    }

    /** @return the page URL */
    public String getUrl() {
        return this.url;
    }

    /** @param url the page URL */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the Chinese title */
    public String getTitleCn() {
        return this.titleCn;
    }

    /** @param titleCn the Chinese title */
    public void setTitleCn(String titleCn) {
        this.titleCn = titleCn;
    }

    /** @return the English title */
    public String getTitleEn() {
        return this.titleEn;
    }

    /** @param titleEn the English title */
    public void setTitleEn(String titleEn) {
        this.titleEn = titleEn;
    }

    /** @return the organization value */
    public String getOrganization() {
        return this.organization;
    }

    /** @param organization the organization value */
    public void setOrganization(String organization) {
        this.organization = organization;
    }

    /** @return the Chinese abstract */
    public String getAbstractCn() {
        return this.abstractCn;
    }

    /** @param abstractCn the Chinese abstract */
    public void setAbstractCn(String abstractCn) {
        this.abstractCn = abstractCn;
    }

    /** @return the English abstract */
    public String getAbstractEn() {
        return this.abstractEn;
    }

    /** @param abstractEn the English abstract */
    public void setAbstractEn(String abstractEn) {
        this.abstractEn = abstractEn;
    }

    /** @return the proceedings name */
    public String getProceedingsName() {
        return this.proceedingsName;
    }

    /** @param proceedingsName the proceedings name */
    public void setProceedingsName(String proceedingsName) {
        this.proceedingsName = proceedingsName;
    }

    /** @return the conference name */
    public String getConferenceName() {
        return this.conferenceName;
    }

    /** @param conferenceName the conference name */
    public void setConferenceName(String conferenceName) {
        this.conferenceName = conferenceName;
    }

    /** @return the date value */
    public String getDate() {
        return this.date;
    }

    /** @param date the date value */
    public void setDate(String date) {
        this.date = date;
    }

    /** @return the place value */
    public String getPlace() {
        return this.place;
    }

    /** @param place the place value */
    public void setPlace(String place) {
        this.place = place;
    }

    /** @return the code value */
    public String getCode() {
        return this.code;
    }

    /** @param code the code value */
    public void setCode(String code) {
        this.code = code;
    }

    /** @return the association value */
    public String getAssociation() {
        return this.association;
    }

    /** @param association the association value */
    public void setAssociation(String association) {
        this.association = association;
    }

    /** @return the Chinese author list */
    public String getAuthorsCn() {
        return this.authorsCn;
    }

    /** @param authorsCn the Chinese author list */
    public void setAuthorsCn(String authorsCn) {
        this.authorsCn = authorsCn;
    }

    /** @return the English author list */
    public String getAuthorsEn() {
        return this.authorsEn;
    }

    /** @param authorsEn the English author list */
    public void setAuthorsEn(String authorsEn) {
        this.authorsEn = authorsEn;
    }

    /** @return the foundation value */
    public String getFoundation() {
        return this.foundation;
    }

    /** @param foundation the foundation value */
    public void setFoundation(String foundation) {
        this.foundation = foundation;
    }
}