思路:
首先爬取一个用户的个人信息,然后根据该用户的好友关系去爬取好友信息,依次类推,爬取所有用户。
根据获取的用户id,访问blog主页获取个性签名。
package com.cuihs.mySpider;
import java.util.List;
import org.jsoup.select.Elements;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText;
/**
 * WebMagic page processor that walks user profile pages and collects a line of text from each
 * user's blog home page.
 *
 * <p>Profile pages (URLs matching {@code http://my.youkuaiyun.com/\w+}) are used only for
 * discovery: every link found in the {@code mod_relations} section is queued, the user's blog
 * home page is queued, and the profile page itself is skipped so it produces no output.
 *
 * <p>Blog pages (URLs matching {@code http://blog.youkuaiyun.com}) produce one result field taken
 * from the {@code blog_title} heading or, when that selection is empty, from the
 * {@code person-sign} element. Blog pages with no usable text are skipped.
 */
public class CsdnUserPageProcessor implements PageProcessor {

    /** URL pattern identifying a user profile page. */
    private static final String PROFILE_URL_REGEX = "http://my.youkuaiyun.com/\\w+";

    /** URL pattern identifying a blog page. */
    private static final String BLOG_URL_REGEX = "http://blog.youkuaiyun.com";

    /** Prefix to which a user id is appended to form that user's blog home URL. */
    private static final String BLOG_URL_PREFIX = "http://blog.youkuaiyun.com/";

    /** Crawl settings: 3 retries and a sleep time of 1000 between requests. */
    private final Site site = Site
            .me()
            .setRetryTimes(3)
            .setSleepTime(1000);

    /**
     * Dispatches the downloaded page to the profile or blog handler according to its URL.
     * Pages matching neither pattern are left untouched.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        if (page.getUrl().regex(PROFILE_URL_REGEX).match()) {
            processProfilePage(page);
        } else if (page.getUrl().regex(BLOG_URL_REGEX).match()) {
            processBlogPage(page);
        }
    }

    /**
     * Queues the related-user links and the user's blog home page, then skips the profile page.
     *
     * <p>Navigation is position based: the children of the second {@code div} of the document
     * are treated as the main sections, the second section supplies the user's link and the
     * third supplies the relations markup. NOTE(review): the index lookups will throw if a page
     * does not have this layout; that behavior is unchanged from the original.
     */
    private void processProfilePage(Page page) {
        Elements mainElements = page.getHtml().getDocument()
                .getElementsByTag("div").get(1).children();

        Elements relationElements = mainElements.get(2).getElementsByTag("div");
        String relationHtml = relationElements.get(0).html();

        Elements skillElements = mainElements.get(1).getElementsByTag("div");
        String userHref = skillElements.get(0)
                .getElementsByTag("a")
                .get(0)
                .attr("href");
        String userId = getLastSlantContent(userHref);

        List<String> relationLinks = new Html(relationHtml)
                .xpath("//div[@class='mod_relations']")
                .links()
                .all();
        page.addTargetRequests(relationLinks);

        // Without a usable id the concatenation would yield a bogus ".../null" or bare-prefix URL.
        if (userId != null && !userId.isEmpty()) {
            page.addTargetRequest(new Request(BLOG_URL_PREFIX + userId));
        }

        // The profile page is only a source of links; it contributes no result fields.
        page.setSkip(true);
    }

    /**
     * Extracts the blog title text, falling back to the personal signature when the title
     * selection is empty, and stores it as a result field. Skips the page when neither
     * selection yields a non-empty first value.
     */
    private void processBlogPage(Page page) {
        // Attribute values are quoted, matching the other selectors in this class.
        Object word = page.getHtml().xpath("//div[@id='blog_title']/h3/allText()");
        if (word instanceof PlainText && ((PlainText) word).all().isEmpty()) {
            word = page.getHtml().xpath("//div[@class='person-sign']/span/allText()");
        }

        if (hasNonEmptyFirstValue(word)) {
            // NOTE(review): the key looks like a typo for the word meaning "motto"; it is left
            // unchanged because it is part of the emitted output.
            page.putField("左右铭", word);
        } else {
            page.setSkip(true);
        }
    }

    /**
     * Tells whether {@code selected} is a {@link PlainText} whose first value is a non-empty
     * string.
     */
    private static boolean hasNonEmptyFirstValue(Object selected) {
        if (!(selected instanceof PlainText)) {
            return false;
        }
        List<String> values = ((PlainText) selected).all();
        if (values.isEmpty()) {
            return false;
        }
        String first = values.get(0);
        return first != null && !first.isEmpty();
    }

    /**
     * Returns the crawl settings used by this processor.
     *
     * @return the site configuration
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Starts the crawl from a single seed profile with 10 threads, writing results through a
     * {@link FilePipeline} rooted at {@code D:\webmagic\}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Spider.create(new CsdnUserPageProcessor())
                .addUrl("http://my.youkuaiyun.com/CHS007chs")
                .thread(10)
                .addPipeline(new FilePipeline("D:\\webmagic\\"))
                .run();
    }

    /**
     * Returns the content after the last {@code '/'} in the given path.
     *
     * @param fullPath the path or URL to inspect; may be {@code null}
     * @return the text following the last {@code '/'} (possibly empty), or {@code null} when
     *     {@code fullPath} is {@code null} or contains no {@code '/'}
     */
    public static String getLastSlantContent(String fullPath) {
        if (fullPath == null) {
            return null;
        }
        int pos = fullPath.lastIndexOf('/');
        if (pos == -1) {
            return null;
        }
        return fullPath.substring(pos + 1);
    }
}
最后欢迎大家访问我的个人网站:1024s