如果对基础爬虫还不太明白,请先看我之前的博客——《关于爬虫入门的一些心得(一)》。
虽然我们已经可以访问一些网站了,但有的网站会识别出爬虫并直接拒绝我们的请求,所以我们要设置请求头,把自己伪装成浏览器:
// Excerpt: configures an HTTP request so that it carries browser-like headers.
// `url`, `conn` and `in` are variables declared outside this excerpt.
// Open the connection and treat it as an HTTP connection
conn = (HttpURLConnection) url.openConnection();
conn.setConnectTimeout(3000);// timeout for establishing the connection to the server (ms)
conn.setReadTimeout(3000);// timeout for reading data from the server (ms)
// Disguise the request as if it came from a browser
conn.setRequestMethod("GET");
conn.setRequestProperty("Accept",
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
conn.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.9");
// User-Agent string copied from desktop Chrome 71 on Windows 10
conn.setRequestProperty("User-Agent",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36");
// Obtain the response body stream
in = conn.getInputStream();
最后贴一下完整源码。
/**
 * Minimal crawler demo: downloads one page while presenting browser-like
 * request headers, then prints the page text to standard output.
 */
public class CrawlTest02 {

    /** Page fetched by {@link #main(String[])}. */
    private static final String TARGET_URL = "http://www.dianping.com/";

    /** Connect and read timeout, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 3000;

    /** User-Agent string copied from desktop Chrome 71 on Windows 10. */
    private static final String BROWSER_USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36";

    /**
     * Fetches {@link #TARGET_URL} and prints its body. Any I/O failure is
     * reported by printing the stack trace; nothing is printed to standard
     * output in that case.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        HttpURLConnection conn = null;
        try {
            conn = openBrowserLikeConnection(new URL(TARGET_URL));
            System.out.println(readBody(conn));
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Release the underlying socket; the original never disconnected.
            if (conn != null) {
                conn.disconnect();
            }
        }
    }

    /**
     * Opens a GET connection to {@code url} configured with timeouts and
     * browser-like Accept, Accept-Language and User-Agent headers.
     *
     * @param url the address to connect to; must use the HTTP(S) protocol
     * @return the configured, not yet connected, connection
     * @throws IOException if the connection cannot be opened
     */
    private static HttpURLConnection openBrowserLikeConnection(URL url) throws IOException {
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.setConnectTimeout(TIMEOUT_MILLIS);
        conn.setReadTimeout(TIMEOUT_MILLIS);
        conn.setRequestMethod("GET");
        conn.setRequestProperty("Accept",
                "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
        conn.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.9");
        conn.setRequestProperty("User-Agent", BROWSER_USER_AGENT);
        return conn;
    }

    /**
     * Reads the whole response body as UTF-8 text, terminating every line
     * with {@code "\r\n"}.
     *
     * @param conn a configured connection; the request is sent when the
     *             input stream is obtained
     * @return the response body
     * @throws IOException if the request fails or the body cannot be read
     */
    private static String readBody(HttpURLConnection conn) throws IOException {
        StringBuilder body = new StringBuilder();
        // try-with-resources closes the reader and the stream even when reading fails.
        try (InputStream in = conn.getInputStream();
             BufferedReader reader = new BufferedReader(
                     new InputStreamReader(in, java.nio.charset.StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                body.append(line).append("\r\n");
            }
        }
        return body.toString();
    }
}