WebMagic 使用——爬取 CNKI 会议论文

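本文介绍了一个使用 WebMagic 框架爬取中国知网（CNKI）会议论文信息的示例项目。该爬虫能够抓取论文标题、作者、机构、摘要以及会议名称、时间、地点等详细信息，并通过 Pipeline 将数据存储到数据库中。下面依次给出三个源文件：页面处理器（CnkiConferencePaperCrawler）、数据管道（CnkiConferencePaperPipeline）和实体类（CnkiConferencePaper）。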
package com.microbekb.crawler.cnki;

import com.microbekb.crawler.cnki.jpa.CnkiSpacePaper;
import com.microbekb.crawler.cnki.jpa.CnkiSpacePaperRepository;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;

/**
 * Created by DELL on 2017/2/17.
 */
@Component
public class CnkiConferencePaperCrawler implements PageProcessor {

    private Site site = Site.me().setRetryTimes(3).setSleepTime(500);   //设置参数

    private static HashSet<String> urls=new HashSet<>();

    private static final org.slf4j.Logger log = LoggerFactory.getLogger(CnkiConferencePaperCrawler.class);  

    @Autowired
    private CnkiSpacePaperRepository cnkiSpacePaperRepository;  //用于与数据库关联,通过spring注入

    @Autowired
    private CnkiConferencePaperPipeline cnkiJournalPaperPipeline;   //用于处理数据,通过spring注入

    @Override
    public void process(Page page) {

        //需要爬取的字段
        String url= page.getUrl().toString();
        String title_cn= "";
        String title_en= "";
        String organization = "";
        String abstract_cn= "";
        String abstract_en= "";
        String proceedings_name= "";
        String conference_name= "";
        String date= "";
        String place= "";
        String code= "";
        String organizor= "";
        String foundation="";
        String authors_cn = "";
        String authors_en = "";

        //#chTitle
        title_cn = page.getHtml().xpath("//span[@id='chTitle']/text()").toString();

        title_en = page.getHtml().xpath("//span[@id='enTitle']/text()").toString();

        List<Selectable> pList = page.getHtml().xpath("//div[@class='summary']/p").nodes();
        for(Selectable p:pList){
            //System.out.println(p.toString());
            if(p.toString().contains("作者")){
                List<String> author_cns = new ArrayList<>();
                author_cns = p.xpath("//a[@class='KnowledgeNetLink']/text()").all();
                for(String str:author_cns){
                    authors_cn = authors_cn+str+";";
                }
            }
            if(p.toString().contains("机构")){
                List<String> organizations = new ArrayList<>();
                organizations = p.xpath("//a[@class='KnowledgeNetLink']/text()").all();
                for(String str:organizations){
                    organization = organization+str+";";
                }
            }
            if(p.toString().contains("摘要")){
                abstract_cn = p.xpath("//span/text()").toString();
            }

        }
        //#content > div:nth-child(1) > div:nth-child(5) > ul:nth-child(1) > li > a
        proceedings_name = page.getHtml().xpath("//div[@class='summary']/ul/li/a/text()").toString();

        List<String> itemStrList = page.getHtml().xpath("//div[@class='summary']/ul/li/text()").all();
        for(String str:itemStrList){
            if(str.contains("会议名称")){
                conference_name = str.replaceAll("【会议名称】","").trim();
            }
            if(str.contains("会议时间")){
                date = str.replaceAll("【会议时间】","").trim();
            }
            if(str.contains("会议地点")){
                place = str.replaceAll("【会议地点】","").trim();
            }
            if(str.contains("分类号")){
                code = str.replaceAll("【分类号】","").trim();
            }
            if(str.contains("主办单位")){
                organizor = str.replaceAll("【主办单位】","").trim();
            }
        }
        foundation = page.getHtml().xpath("//div[@class='summary']/div[@class='keywords']/text()").toString();

        page.putField("url",url);
        page.putField("title_cn",title_cn);
        page.putField("title_en",title_en);
        page.putField("organization",organization);
        page.putField("abstract_cn",abstract_cn);
        page.putField("abstract_en",abstract_en);
        page.putField("proceedings_name",proceedings_name);
        page.putField("conference_name",conference_name);
        page.putField("date",date);
        page.putField("place",place);
        page.putField("code",code);
        page.putField("organizor",organizor);
        page.putField("foundation",foundation);
        page.putField("authors_cn",authors_cn);
        page.putField("authors_en",authors_en);

    }

    @Override
    public Site getSite() {
        return site;
    }

//测试url,初始url地址,可根据需要修改
private void addUrlTest(){ urls.add("http://www.cnki.net/KCMS/detail/detail.aspx?dbcode=CPFD&filename=KAXH201012001028"); }

//调用此方法,为爬虫入口
public void start(){ //CnkiJournalPaperCrawler cnkiJournalPaperCrawler = new CnkiJournalPaperCrawler(); addTest(); //addUrlText(); Spider spider = Spider.create(this); for(String url:urls){ spider = spider.addUrl(url); } try{ spider.thread(1).addPipeline(cnkiJournalPaperPipeline).run(); }catch (Exception e){ }
 }
package com.microbekb.crawler.cnki;

import com.microbekb.crawler.cnki.jpa.CnkiConferencePaper;
import com.microbekb.crawler.cnki.jpa.CnkiConferencePaperRepository;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.ComponentScan;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

/**
 * WebMagic {@link Pipeline} that turns the fields extracted from a CNKI conference-paper
 * page into a {@link CnkiConferencePaper} and saves it through the injected repository.
 *
 * <p>Created by DELL on 2017/2/17.
 */
@Component
@ComponentScan
public class CnkiConferencePaperPipeline implements Pipeline {

    /** Number of records saved so far. Plain static counter, not synchronized. */
    public static int count=0;

    @Autowired
    CnkiConferencePaperRepository cnkiConferencePaperRepository;  // persists the entities, injected by Spring

    /**
     * Builds one entity from the result fields and saves it.
     *
     * <p>The entity id is the value of the {@code filename} query parameter of the page URL.
     * When the URL is missing or carries no such parameter the record is skipped, because it
     * cannot be given an id.
     *
     * @param resultItems fields published by the page processor
     * @param task the spider task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        String url = resultItems.get("url");
        String cnkiId = extractCnkiId(url);
        if (cnkiId == null) {
            System.err.println("Skipping record without a CNKI filename id, url=" + url);
            return;
        }

        CnkiConferencePaper paper = new CnkiConferencePaper();
        paper.setCnkiId(cnkiId);
        paper.setUrl(url);
        paper.setTitleCn(resultItems.<String>get("title_cn"));
        paper.setTitleEn(resultItems.<String>get("title_en"));
        paper.setOrganization(resultItems.<String>get("organization"));
        paper.setAbstractCn(resultItems.<String>get("abstract_cn"));
        paper.setAbstractEn(resultItems.<String>get("abstract_en"));
        paper.setProceedingsName(resultItems.<String>get("proceedings_name"));
        paper.setConferenceName(resultItems.<String>get("conference_name"));
        paper.setDate(resultItems.<String>get("date"));
        paper.setPlace(resultItems.<String>get("place"));
        paper.setCode(resultItems.<String>get("code"));
        // The "organizor" result field is stored in the entity's "association" column.
        paper.setAssociation(resultItems.<String>get("organizor"));
        // Previously read but never copied to the entity, so it was never persisted.
        paper.setFoundation(resultItems.<String>get("foundation"));
        paper.setAuthorsCn(resultItems.<String>get("authors_cn"));
        paper.setAuthorsEn(resultItems.<String>get("authors_en"));

        cnkiConferencePaperRepository.save(paper);    // save the record
        // Pre-increment so the message reports the number of records completed so far.
        System.out.println("已完成"+ (++count)+"条" );
    }

    /**
     * Returns the value of the {@code filename} query parameter of {@code url}, or
     * {@code null} when the URL is {@code null} or has no non-empty {@code filename} value.
     * The value ends at the next {@code '&'} or {@code '#'}, so parameters that follow
     * {@code filename} do not leak into the id.
     */
    private static String extractCnkiId(String url) {
        if (url == null) {
            return null;
        }
        final String key = "filename=";
        int from = 0;
        while (true) {
            int at = url.indexOf(key, from);
            if (at < 0) {
                return null;
            }
            // Accept only a real parameter name, not the tail of a longer one.
            boolean atParamStart = at == 0 || url.charAt(at - 1) == '?' || url.charAt(at - 1) == '&';
            if (atParamStart) {
                int start = at + key.length();
                int end = url.length();
                int amp = url.indexOf('&', start);
                if (amp >= 0) {
                    end = amp;
                }
                int hash = url.indexOf('#', start);
                if (hash >= 0 && hash < end) {
                    end = hash;
                }
                String id = url.substring(start, end).trim();
                return id.isEmpty() ? null : id;
            }
            from = at + key.length();
        }
    }
}
package com.microbekb.crawler.cnki.jpa;

import javax.persistence.Entity;
import javax.persistence.Id;

/**
 * JPA entity holding one CNKI conference paper; {@code cnkiId} is the primary key.
 *
 * <p>Created by DELL on 2017/2/18.
 */
@Entity
public class CnkiConferencePaper {

    @Id
    private String cnkiId;
    private String url;
    private String titleCn;
    private String titleEn;
    private String organization;
    private String abstractCn;
    private String abstractEn;
    private String proceedingsName;
    private String conferenceName;
    private String date;
    private String place;
    private String code;
    private String association;
    private String authorsCn;
    private String authorsEn;
    private String foundation;

    // Accessors, listed in field order.

    public String getCnkiId() {
        return cnkiId;
    }

    public void setCnkiId(String cnkiId) {
        this.cnkiId = cnkiId;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getTitleCn() {
        return titleCn;
    }

    public void setTitleCn(String titleCn) {
        this.titleCn = titleCn;
    }

    public String getTitleEn() {
        return titleEn;
    }

    public void setTitleEn(String titleEn) {
        this.titleEn = titleEn;
    }

    public String getOrganization() {
        return organization;
    }

    public void setOrganization(String organization) {
        this.organization = organization;
    }

    public String getAbstractCn() {
        return abstractCn;
    }

    public void setAbstractCn(String abstractCn) {
        this.abstractCn = abstractCn;
    }

    public String getAbstractEn() {
        return abstractEn;
    }

    public void setAbstractEn(String abstractEn) {
        this.abstractEn = abstractEn;
    }

    public String getProceedingsName() {
        return proceedingsName;
    }

    public void setProceedingsName(String proceedingsName) {
        this.proceedingsName = proceedingsName;
    }

    public String getConferenceName() {
        return conferenceName;
    }

    public void setConferenceName(String conferenceName) {
        this.conferenceName = conferenceName;
    }

    public String getDate() {
        return date;
    }

    public void setDate(String date) {
        this.date = date;
    }

    public String getPlace() {
        return place;
    }

    public void setPlace(String place) {
        this.place = place;
    }

    public String getCode() {
        return code;
    }

    public void setCode(String code) {
        this.code = code;
    }

    public String getAssociation() {
        return association;
    }

    public void setAssociation(String association) {
        this.association = association;
    }

    public String getAuthorsCn() {
        return authorsCn;
    }

    public void setAuthorsCn(String authorsCn) {
        this.authorsCn = authorsCn;
    }

    public String getAuthorsEn() {
        return authorsEn;
    }

    public void setAuthorsEn(String authorsEn) {
        this.authorsEn = authorsEn;
    }

    public String getFoundation() {
        return foundation;
    }

    public void setFoundation(String foundation) {
        this.foundation = foundation;
    }
}

 

posted on 2017-03-17 19:33 hykd 阅读( ...) 评论( ...) 编辑 收藏

转载于:https://www.cnblogs.com/yzwhykd/p/6568022.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值