PageRank介绍
我的环境是MyEclipse7.0,jdk为1.7
PageRank算法包括两个类HtmlEntity和HtmlPageRank,其中HtmlPageRank需要用到htmlParser.jar和htmllexer.jar这两个包。本文是根据http://duyunfei.iteye.com/blog/1532798的说明调试的.
首先,我们准备了7个测试网页,这几个网页的链接情况如下:
i\j | test1 | test2 | test3 | test4 | test5 | test6 | test7 |
test1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 |
test2 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
test3 | 0 | 0 | 0 | 1 | 1 | 1 | 0 |
test4 | 0 | 1 | 0 | 0 | 1 | 0 | 1 |
test5 | 0 | 0 | 1 | 1 | 0 | 0 | 0 |
test6 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
test7 | 0 | 1 | 0 | 1 | 0 | 0 | 1 |
表格的意思是:test1链接到test2和test3,其余依次类推。PageRank有两个基本原则:被越多网页链接的网页越重要;被重要网页链接的网页也更重要。根据这两个原则,我们可以大致猜一下:哪个网页会排名第一?哪个最不重要?
貌似分别是test4(最重要)和test6(最不重要)?
Html文件我都放在E:/MyEclipse/workspace/PageRank/WebRoot/htmlDoc目录下,各文件中的链接代码如下:
Test1.html链接代码
<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test2.html">test2</a>
<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test3.html">test3</a>
Test2.html
<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test1.html">test1</a>
<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test4.html">test4</a>
Test3.html
<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test4.html">test4</a>
<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test5.html">test5</a>
<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test6.html">test6</a>
Test4.html
<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test2.html">test2</a>
<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test5.html">test5</a>
<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test7.html">test7</a>
Test5.html
<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test3.html">test3</a>
<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test4.html">test4</a>
Test6.html
<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test1.html">test1</a>
<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test5.html">test5</a>
Test7.html
<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test2.html">test2</a>
<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test4.html">test4</a>
<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test7.html">test7</a>
运行结果见文末。下面先给出两个类的完整代码:
HtmlPageRank类
import java.io.*;
import java.util.*;
import org.htmlparser.*;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.visitors.HtmlPage;
/**
*pagerank算法实现
*
*@authorafei
*
*/
publicclassHtmlPageRank{
/*阀值 */
publicstaticdoubleMAX=0.00000000001;
/*阻尼系数 */
publicstaticdoublealpha=0.85;
publicstaticStringhtmldoc="E:/MyEclipse/workspace/PageRank/WebRoot/htmlDoc";
publicstaticMap<String,HtmlEntity>map=newHashMap<String,HtmlEntity>();
publicstaticList<HtmlEntity>list=newArrayList<HtmlEntity>();
publicstaticdouble[]init;
publicstaticdouble[]pr;
publicstaticvoidmain(String[]args)throws Exception {
loadHtml();
pr=doPageRank();
while(!(checkMax())){
System.arraycopy(pr,0,init,0,init.length);
pr=doPageRank();
}
for(inti=0;i<pr.length;i++){
HtmlEntityhe=list.get(i);
he.setPr(pr[i]);
}
List<HtmlEntity>finalList=newArrayList<HtmlEntity>();
Collections.sort(list,newComparator(){
publicintcompare(Objecto1,Objecto2){
HtmlEntityh1=(HtmlEntity)o1;
HtmlEntityh2=(HtmlEntity)o2;
intem=0;
if(h1.getPr()>h2.getPr()){
em=-1;
}else{
em=1;
}
returnem;
}
});
for(HtmlEntityhe:list){
System.out.println(he.getPath()+" : "+he.getPr());
}
}
/**
*加载文件夹下的网页文件,并且初始化pr值(即init数组),计算每个网页的外链和内链
*/
publicstaticvoidloadHtml()throws Exception {
Filefile=newFile(htmldoc);
File[]htmlfiles=file.listFiles(newFileFilter(){
publicbooleanaccept(Filepathname){
if(pathname.getPath().endsWith(".html")){
return true;
}
return false;
}
});
init=newdouble[htmlfiles.length];
for(inti=0;i<htmlfiles.length;i++){
Filef=htmlfiles[i];
BufferedReaderbr=newBufferedReader(newInputStreamReader(
newFileInputStream(f)));
Stringline=br.readLine();
StringBufferhtml=newStringBuffer();
while(line!=null){
line=br.readLine();
html.append(line);
}
HtmlEntityhe=newHtmlEntity();
he.setPath(f.getAbsolutePath());
he.setContent(html.toString());
Parserparser=Parser.createParser(html.toString(),"gb2312");
HtmlPagepage=newHtmlPage(parser);
parser.visitAllNodesWith(page);
NodeListnodelist=page.getBody();
nodelist=nodelist.extractAllNodesThatMatch(
newTagNameFilter("A"),true);
for(intj=0;j<nodelist.size();j++){
LinkTagoutlink=(LinkTag)nodelist.elementAt(j);
he.getOutLinks().add(outlink.getAttribute("href"));
}
map.put(he.getPath(),he);
list.add(he);
init[i]=0.0;
}
for(inti=0;i<list.size();i++){
HtmlEntityhe=list.get(i);
List<String>outlink=he.getOutLinks();
for(Stringol:outlink){
HtmlEntityhe0=map.get(ol);
try{
he0.getInLinks().add(he.getPath());
}catch(NullPointerExceptione){
//如果网页的链接路径不正确,则报NullPointerException错误,并且你会发现heo=null,也就是说map.get(ol)取到的值为null,但是事实上map不为null,ol的值在map中不存在导致的,这是由于html中路径设置不正确
e.printStackTrace();
}
}
}
}
/**
*计算pagerank
*
*@paraminit
*@paramalpho
*@return
*/
privatestaticdouble[]doPageRank(){
double[]pr=newdouble[init.length];
for(inti=0;i<init.length;i++){
doubletemp=0;
HtmlEntityhe0=list.get(i);
for(intj=0;j<init.length;j++){
HtmlEntityhe=list.get(j);
//计算对本页面链接相关总值
if(i!=j&&he.getOutLinks().size()!=0&&he.getOutLinks().contains(he0.getPath())){
temp=temp+init[j]/he.getOutLinks().size();
}
}
//经典的pr公式
pr[i]=alpha+(1-alpha)*temp;
}
returnpr;
}
/**
*判断前后两次的pr数组之间的差别是否大于我们定义的阀值假如大于,那么返回false,继续迭代计算pr
*
*@parampr
*@paraminit
*@parammax
*@return
*/
privatestaticbooleancheckMax(){
booleanflag=true;
for(inti=0;i<pr.length;i++){
if(Math.abs(pr[i]-init[i])>MAX){
flag=false;
break;
}
}
return flag;
}
}
HtmlEntity类
import java.util.*;
/**
 * Web page entity: one HTML file together with its link information and computed rank.
 *
 * @author afei
 */
class HtmlEntity {

    /** Path that identifies the page. */
    private String path;

    /** Text content of the page. */
    private String content;

    /** Out-links: pages this page links to. */
    private List<String> outLinks = new ArrayList<String>();

    /** In-links: pages that link to this page. */
    private List<String> inLinks = new ArrayList<String>();

    /** PageRank value of the page. */
    private double pr;

    /** @return the path of the page */
    public String getPath() {
        return path;
    }

    /** @param path the path of the page */
    public void setPath(String path) {
        this.path = path;
    }

    /** @return the text content of the page */
    public String getContent() {
        return content;
    }

    /** @param content the text content of the page */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the PageRank value of the page */
    public double getPr() {
        return pr;
    }

    /** @param pr the PageRank value of the page */
    public void setPr(double pr) {
        this.pr = pr;
    }

    /** @return the live, mutable list of out-links; never {@code null} */
    public List<String> getOutLinks() {
        return outLinks;
    }

    /**
     * Replaces the out-link list.
     *
     * @param outLinks the new list, stored by reference; {@code null} is treated as an empty list
     *     so that {@link #getOutLinks()} never returns {@code null}
     */
    public void setOutLinks(List<String> outLinks) {
        this.outLinks = (outLinks != null) ? outLinks : new ArrayList<String>();
    }

    /** @return the live, mutable list of in-links; never {@code null} */
    public List<String> getInLinks() {
        return inLinks;
    }

    /**
     * Replaces the in-link list.
     *
     * @param inLinks the new list, stored by reference; {@code null} is treated as an empty list
     *     so that {@link #getInLinks()} never returns {@code null}
     */
    public void setInLinks(List<String> inLinks) {
        this.inLinks = (inLinks != null) ? inLinks : new ArrayList<String>();
    }
}
运行结果
E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test4.html: 1.0988562616424633
E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test2.html: 1.024767124729736
E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test5.html: 1.0225108328175456
E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test3.html: 1.0012654834548864
E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test1.html: 0.994362279917484
E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test7.html: 0.9049428130819769
E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test6.html: 0.9000632741726616