<script type="text/javascript"><!--
google_ad_client = "pub-0600375531528527";
google_ad_width = 120;
google_ad_height = 60;
google_ad_format = "120x60_as_rimg";
google_cpa_choice = "CAAQy8L8zwEaCDUSP1b6Y2DoKL-_93M";
//--></script>
<script type="text/javascript" src="http://pagead2.googlesyndication.com/pagead/show_ads.js">
</script>
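/**
 * A simple query tool that fetches the result pages of the major search engines.
 * This may be useful to researchers: for example, we can extract the information
 * we want from the results returned by Google or Baidu, and the same approach can
 * be used to scrape online databases, dictionaries and the like. One thing that
 * still has to be done is parsing the desired content out of the result pages;
 * either the HtmlParser package or NekoHTML can be used for that, although
 * analysing the page format is rather tedious. The analysis method for each
 * search engine's results will be written up later; if someone has already
 * written one, please let me know!
 */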
package irlab.opense;
import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.*;
/**
 * Builds search-result URLs for Baidu and Google and downloads the raw bytes
 * of the result page.
 *
 * @author yezheng
 */
public class opense {
    /** Search-engine id selecting Baidu; pass as {@code seid}. */
    public static final int BAIDU = 0;

    /** Search-engine id selecting Google; pass as {@code seid}. */
    public static final int GOOGLE = 1;

    /** Both URL templates page through results ten at a time. */
    private static final int RESULTS_PER_PAGE = 10;

    /**
     * Builds the search URL for the given engine.
     *
     * @param query the query terms; the engine's own advanced query syntax may
     *              be used directly
     * @param seid  the search-engine id, {@link #BAIDU} or {@link #GOOGLE}
     * @param start the zero-based result page; it is multiplied by ten to form
     *              the result offset placed in the URL
     * @return the search URL, or {@code null} if {@code query} is null,
     *         {@code seid} is not a known engine, or the required charset is
     *         not available
     */
    public static String getUrl(String query, int seid, int start) {
        if (query == null) {
            return null;
        }
        int offset = RESULTS_PER_PAGE * start;
        try {
            if (seid == BAIDU) {
                // The URL declares ie=gb2312, so the query must be encoded in
                // that charset rather than the platform default.
                return "http://www.baidu.com/s?lm=0&si=&rn=10&ie=gb2312&ct=0&wd="
                        + URLEncoder.encode(query, "gb2312") + "&pn=" + offset;
            }
            if (seid == GOOGLE) {
                return "http://www.google.cn/search?q=" + URLEncoder.encode(query, "UTF-8")
                        + "&complete=1&hl=zh-CN&lr=lang_zh-CN%7Clang_zh-TW&newwindow=1&start="
                        + offset + "&sa=N";
            }
        } catch (UnsupportedEncodingException e) {
            // Keep the "null on failure" contract, but report the cause
            // instead of swallowing it.
            System.err.println("Cannot encode query for search engine " + seid + ": " + e);
        }
        return null;
    }

    /**
     * Downloads the result page for a query and returns its raw bytes.
     * The requested URL is printed to standard output before the request is made.
     *
     * @param query the query terms
     * @param seid  the search-engine id, {@link #BAIDU} or {@link #GOOGLE}
     * @param start the zero-based result page
     * @return the undecoded response body, or {@code null} if no URL could be
     *         built or the request failed
     */
    public static byte[] getContents(String query, int seid, int start) {
        String surl = getUrl(query, seid, start);
        if (surl == null) {
            System.err.println("No search URL available for search engine " + seid);
            return null;
        }
        System.out.println(surl);

        HttpURLConnection conn = null;
        InputStream in = null;
        try {
            conn = (HttpURLConnection) new URL(surl).openConnection();
            // Send a browser-like User-Agent header with the request.
            conn.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
            in = new BufferedInputStream(conn.getInputStream());

            // Copy everything read from the remote stream into the in-memory buffer.
            ByteArrayOutputStream baos = new ByteArrayOutputStream(10240);
            byte[] buf = new byte[1024];
            int bytesRead;
            while ((bytesRead = in.read(buf)) != -1) {
                baos.write(buf, 0, bytesRead);
            }
            return baos.toByteArray();
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        } finally {
            // Always release the stream and the connection, also on failure.
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
            if (conn != null) {
                conn.disconnect();
            }
        }
    }

    /**
     * Placeholder; not implemented yet, so calling it currently does nothing.
     *
     * @param query the query terms (unused)
     * @param dir   a directory name (unused)
     */
    public static void save(String query, String dir) {
    }

    /**
     * Demo entry point: requests the first Google result page for a sample query.
     *
     * @param args ignored
     * @throws Exception declared for compatibility; failures inside
     *                   {@link #getContents} are reported there and not rethrown
     */
    public static void main(String[] args) throws Exception {
        getContents("mad cow disease", GOOGLE, 0);
    }
}