本篇博客撰写说明:
①时代在变,楼主的需求也发生了一丁点的更新:自入围《CSDN 2012 博客之星》评选以来,楼主幸运挤进前20名,但是与第10名的票数还有一定的差距,故更新程序,用于查看楼主自己与第十名的票数差距、排名差距。
②有CSDN的朋友对前几天楼主写的《自己写的实时爬取 CSDN 2012 博客之星 88位候选人排名》程序有点兴趣,故将程序进行优化和重构,并加入了相应的注释,使程序更加具有可读性。
末:由于楼主能力有限,对于原先发现的该程序爬取88个网页速度过慢的问题,主因系网速,故不再优化。楼主也发现解析各个网页中【用户名、票数、排名】的部分有很大的优化空间,如有感兴趣的网友,请提供解析部分的优化方案,共同学习哦,亲!
如果觉得我的技术文章还有点让各位看官汲取之处,
请给我投上宝贵的一票,以资鼓励呵,多谢,多谢!!
本人ID:m13666368773
投票地址:http://vote.blog.youkuaiyun.com/item/blogstar/m13666368773
凡投票的朋友,
请第一时间在文章下方评论:“当前票数:XXX+已投票+邮箱:XXX@XXX.com”
稍后会将 <Web应用界面设计规范>PPT版本,发给您。
该博客地址:http://blog.youkuaiyun.com/m13666368773/article/details/8276810
请稍花点时间,为我投上您手中宝贵的一票,
敬告:我这能看到您的投票“用户名”,请勿虚报!多谢,多谢!!
截止时间:2012-12-30
废话不多说:上代码
package com.aptech;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
/**
 * Crawls the voting page of every hand-listed "blog star 2012" candidate, prints the candidates
 * ordered by rank, and finally compares the post author's standing with the candidate that is
 * currently ranked tenth.
 *
 * <p>Each candidate travels through the program as a compact message string of the form
 * {@code userName-votes=rank}; that is the value returned by {@link #test(URL)} and accepted by
 * {@link #getRankMessage(String)}.
 */
public class TestPachongUrl {

    /**
     * Common prefix of every candidate's voting page; the user name is appended to it.
     *
     * NOTE(review): the previous parser skipped a hard-coded 35 characters for the blog-address
     * marker, which corresponds to a 21-character blog URL prefix, whereas the literal used in
     * that marker is 27 characters long. The host name may have been altered at some point;
     * confirm it before running.
     */
    private static final String BASE_URL = "http://vote.blog.youkuaiyun.com/item/blogstar/";

    /** Hand-entered user names of the candidates. */
    private static final String[] USERS = { "Testing_is_believing", "t0nsha", "iukey", "yjflinchong", "taomanman", "chinafe", "hliq5399", "dog250", "qinjuning", "cheny_com", "v_JULY_v",
            "zhmxy555", "Purpleendurer", "iihero", "yming0221", "ccanan", "tigerjb", "cheungmine", "hawksoft", "sheismylife", "hfahe", "cyq1984", "littletigerat", "kmyhy", "caimouse", "manoel",
            "xyz_lmn", "hunkcai", "yiyaaixuexi", "norains", "clever101", "leftfist", "xiaominghimi", "niyi0318", "yanghuiliu", "abandonship", "mapdigit", "bill_man", "Augusdi", "LoveLion",
            "sunboy_2050", "kongxx", "21aspnet", "chszs", "thl789", "mylxiaoyi", "akof1314", "yincheng01", "keyboardOTA", "pan_tian", "downmoon", "wangkuifeng0118", "robinson_0612", "bluishglc",
            "coolbacon", "tangcheng_ok", "tianxiaode", "cjjky", "MoreWindows", "mr_raptor", "dojotoolkit", "chelsea", "chgaowei", "teamlet", "IBM_hoojo", "iefreer", "lee576", "jaminwm", "xuhuojun",
            "linghe301", "caolaosanahnu", "ricohzhanglong", "totogo2010", "axman", "ce123", "rabbit729", "nkmnkm", "superdont", "m13666368773", "aomandeshangxiao", "hitlion2008", "siren0203",
            "feixiaoxing", "Poechant", "cloudhsu", "Innost", "yanghua_kobe", "tianlesoftware" };

    /** User name of the post author; used to pick the author's entry out of the results. */
    private static final String MASTER = "m13666368773";

    /** Rank whose holder the author is compared against. */
    private static final int TARGET_RANK = 10;

    /** Width, in characters, that user names are padded to in the result rows. */
    private static final int NAME_COLUMN_WIDTH = 30;

    /** Width budget of a header column; every header character counts as two units. */
    private static final int HEADER_COLUMN_WIDTH = 70;

    /** Upper bounds so that one stalled request cannot hang the whole crawl. */
    private static final int CONNECT_TIMEOUT_MS = 15000;
    private static final int READ_TIMEOUT_MS = 30000;

    // Text fragments that surround the three values on a candidate's voting page.
    private static final String NAME_START = "博客地址:<a href=\"http://blog.youkuaiyun.com/";
    private static final String NAME_END = "\" class=\"red\" target=\"_blank\">";
    private static final String VOTES_START = "票数:<span class=\"red\">";
    private static final String VOTES_END = "</span> 票</li>";
    private static final String RANK_START = "当前排名:<span class=\"red\">";
    private static final String RANK_END = "</span> 名</li>";

    /** Orders candidate messages by ascending rank. */
    private static final Comparator<String> BY_RANK = new Comparator<String>() {
        @Override
        public int compare(String left, String right) {
            int leftRank = rankOf(left);
            int rightRank = rankOf(right);
            return leftRank < rightRank ? -1 : (leftRank > rightRank ? 1 : 0);
        }
    };

    /**
     * Fetches one candidate's voting page and extracts user name, vote count and rank from it.
     *
     * @param url address of the candidate's voting page; must be an HTTP(S) URL
     * @return the candidate message {@code userName-votes=rank}
     * @throws IllegalStateException if one of the expected page markers is missing
     * @throws Exception if the page cannot be requested or read
     */
    public static String test(URL url) throws Exception {
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        try {
            connection.setConnectTimeout(CONNECT_TIMEOUT_MS);
            connection.setReadTimeout(READ_TIMEOUT_MS);
            // The page is requested with an empty POST body, exactly as before.
            connection.setDoOutput(true);
            connection.setRequestMethod("POST");
            connection.setRequestProperty("user-agent", "mozilla/4.7 [en] (win98; i)");
            connection.connect();
            OutputStreamWriter out = new OutputStreamWriter(connection.getOutputStream(), "UTF-8");
            try {
                out.flush();
            } finally {
                out.close();
            }

            String page = readBody(connection);
            return between(page, NAME_START, NAME_END, "user name")
                    + "-" + between(page, VOTES_START, VOTES_END, "vote count")
                    + "=" + between(page, RANK_START, RANK_END, "rank");
        } finally {
            connection.disconnect();
        }
    }

    /** Reads the whole response body, joining its lines with CRLF. */
    private static String readBody(HttpURLConnection connection) throws IOException {
        InputStream stream = connection.getInputStream();
        try {
            // Decode with the charset the server advertises; without one, fall back to the
            // platform default, which is what this program has always used.
            String charset = charsetOf(connection.getContentType());
            InputStreamReader decoder = charset == null
                    ? new InputStreamReader(stream)
                    : new InputStreamReader(stream, charset);
            BufferedReader reader = new BufferedReader(decoder);
            StringBuilder page = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                page.append(line).append("\r\n");
            }
            return page.toString();
        } finally {
            stream.close();
        }
    }

    /** Returns the charset named in a Content-Type header value, or null if there is none. */
    private static String charsetOf(String contentType) {
        if (contentType == null) {
            return null;
        }
        String key = "charset=";
        int at = contentType.toLowerCase(Locale.ENGLISH).indexOf(key);
        if (at < 0) {
            return null;
        }
        String name = contentType.substring(at + key.length());
        int semicolon = name.indexOf(';');
        if (semicolon >= 0) {
            name = name.substring(0, semicolon);
        }
        name = name.trim();
        if (name.length() >= 2 && name.startsWith("\"") && name.endsWith("\"")) {
            name = name.substring(1, name.length() - 1);
        }
        return name.length() == 0 ? null : name;
    }

    /** Returns the trimmed text between the first startMarker and the next endMarker. */
    private static String between(String page, String startMarker, String endMarker, String what) {
        int start = page.indexOf(startMarker);
        if (start < 0) {
            throw new IllegalStateException("Start marker for " + what + " not found in page");
        }
        // Skip exactly the marker, whatever its length, instead of a hand-counted offset.
        start += startMarker.length();
        int end = page.indexOf(endMarker, start);
        if (end < 0) {
            throw new IllegalStateException("End marker for " + what + " not found in page");
        }
        return page.substring(start, end).trim();
    }

    /**
     * Pads a user name with trailing spaces up to 30 characters so result rows line up.
     *
     * @param user the user name; returned unchanged when it already has 30 or more characters
     * @return the padded user name
     */
    public static String addBlank(String user) {
        StringBuilder padded = new StringBuilder(user);
        while (padded.length() < NAME_COLUMN_WIDTH) {
            padded.append(' ');
        }
        return padded.toString();
    }

    /**
     * Pads a header cell with trailing spaces. Every character of the header counts as two
     * units, and spaces are appended until the budget of 70 units is used up.
     *
     * @param message the header text
     * @return the padded header text
     */
    public static String addChinaBlank(String message) {
        int padding = HEADER_COLUMN_WIDTH - message.length() * 2;
        StringBuilder padded = new StringBuilder(message);
        for (int i = 0; i < padding; i++) {
            padded.append(' ');
        }
        return padded.toString();
    }

    /**
     * Formats one candidate message as an aligned output row: padded user name, votes, rank.
     *
     * <p>The message is split at the last {@code =} and at the last {@code -} before it, so a
     * user name that itself contains {@code -} is kept intact.
     *
     * @param message a candidate message of the form {@code userName-votes=rank}
     * @return the formatted row
     * @throws IllegalArgumentException if the message does not have that form
     */
    public static String getRankMessage(String message) {
        int dash = dashIndex(message);
        int equals = message.lastIndexOf('=');
        return addBlank(message.substring(0, dash)) + message.substring(dash + 1, equals) + " "
                + message.substring(equals + 1);
    }

    /** Returns the index of the '-' that separates user name and votes, validating the form. */
    private static int dashIndex(String message) {
        int equals = message.lastIndexOf('=');
        int dash = equals < 0 ? -1 : message.lastIndexOf('-', equals);
        if (dash < 0) {
            throw new IllegalArgumentException("Not a userName-votes=rank message: " + message);
        }
        return dash;
    }

    /** Returns the user name part of a candidate message. */
    private static String nameOf(String message) {
        return message.substring(0, dashIndex(message));
    }

    /** Returns the vote count of a candidate message. */
    private static int votesOf(String message) {
        return Integer.parseInt(message.substring(dashIndex(message) + 1, message.lastIndexOf('=')).trim());
    }

    /** Returns the rank of a candidate message. */
    private static int rankOf(String message) {
        dashIndex(message); // validates the overall form first
        return Integer.parseInt(message.substring(message.lastIndexOf('=') + 1).trim());
    }

    /**
     * Fetches every candidate, prints them ordered by rank, and prints the vote and rank gap
     * between the author and the candidate ranked tenth.
     *
     * @param args unused
     * @throws Exception if a candidate page cannot be fetched or parsed
     */
    public static void main(String[] args) throws Exception {
        SimpleDateFormat dateformat = new SimpleDateFormat("yyyy年MM月dd日 HH时mm分ss秒 E ");
        System.out.println("统计时间:" + dateformat.format(new Date()));
        System.out.println("候选人数量:" + USERS.length);
        System.out.println(addChinaBlank("用户名") + addChinaBlank("票数") + "排名");

        List<String> messages = new ArrayList<String>(USERS.length);
        for (int i = 0; i < USERS.length; i++) {
            messages.add(test(new URL(BASE_URL + USERS[i])));
        }
        // Ranks can shift while the pages are fetched one by one, so sort what was actually
        // received instead of assuming every rank from 1 to N shows up exactly once.
        Collections.sort(messages, BY_RANK);

        String masterMessage = null;
        String tenthMessage = null;
        for (int i = 0; i < messages.size(); i++) {
            String message = messages.get(i);
            System.out.println(getRankMessage(message));
            if (MASTER.equals(nameOf(message))) {
                masterMessage = message;
            }
            if (rankOf(message) == TARGET_RANK) {
                tenthMessage = message;
            }
        }

        if (masterMessage == null || tenthMessage == null) {
            System.err.println("Cannot compare: no entry found for " + (masterMessage == null ? MASTER : "rank " + TARGET_RANK));
            return;
        }

        int piaoshuGap = votesOf(tenthMessage) - votesOf(masterMessage); // votes the author trails by
        int paimingGap = rankOf(masterMessage) - rankOf(tenthMessage); // places the author trails by
        System.out.println("=============以下对比楼主与第十名用户的信息===============================");
        System.out.println(getRankMessage(tenthMessage));
        System.out.println(getRankMessage(masterMessage));
        System.out.println("========================================================================");
        System.out.println(addBlank("difference tenthUer VS master") + piaoshuGap + " " + paimingGap);
    }
}
运行一下:
统计时间:2012年12月19日 17时16分34秒 星期三
候选人数量:88
用户名 票数 排名
v_JULY_v 1347 1
MoreWindows 583 2
yiyaaixuexi 476 3
mr_raptor 435 4
xiaominghimi 410 5
yincheng01 395 6
zhmxy555 391 7
yming0221 379 8
Poechant 358 9
ricohzhanglong 346 10
LoveLion 322 11
tianlesoftware 286 12
taomanman 282 13
m13666368773 217 14
aomandeshangxiao 216 15
cheny_com 176 16
linghe301 160 17
dojotoolkit 149 18
hawksoft 141 19
cjjky 123 20
akof1314 122 21
nkmnkm 120 22
clever101 116 23
yanghuiliu 103 24
cyq1984 103 25
niyi0318 101 26
sheismylife 96 27
cloudhsu 87 28
coolbacon 76 29
Testing_is_believing 71 30
cheungmine 56 31
bill_man 55 32
tangcheng_ok 55 33
21aspnet 53 34
lee576 53 35
norains 51 36
teamlet 50 37
manoel 48 38
hfahe 48 39
sunboy_2050 47 40
yjflinchong 47 41
tigerjb 43 42
mapdigit 43 43
axman 42 44
Augusdi 39 45
pan_tian 39 46
feixiaoxing 38 47
mylxiaoyi 37 48
t0nsha 35 49
thl789 35 50
qinjuning 35 51
kongxx 34 52
caimouse 32 53
chgaowei 32 54
dog250 31 55
ce123 31 56
downmoon 30 57
xyz_lmn 29 58
littletigerat 28 59
robinson_0612 28 60
iihero 28 61
siren0203 28 62
Purpleendurer 28 63
iukey 27 64
tianxiaode 27 65
abandonship 27 66
Innost 27 67
wangkuifeng0118 26 68
iefreer 26 69
caolaosanahnu 26 70
hunkcai 25 71
chelsea 25 72
totogo2010 24 73
leftfist 24 74
IBM_hoojo 24 75
hitlion2008 24 76
jaminwm 23 77
rabbit729 23 78
yanghua_kobe 23 79
keyboardOTA 22 80
ccanan 20 81
hliq5399 20 82
kmyhy 20 83
superdont 19 84
xuhuojun 19 85
chszs 18 86
chinafe 17 87
bluishglc 14 88
=============以下对比楼主与第十名用户的信息===============================
ricohzhanglong 346 10
m13666368773 217 14
========================================================================
difference tenthUer VS master 129 4