使用httpClient保存网页至本地

最新推荐文章于 2021-11-19 14:55:34 发布

原创最新推荐文章于 2021-11-19 14:55:34 发布 · 1.8k 阅读

1 ·

CC 4.0 BY-SA版权

javase 专栏收录该内容

34 篇文章

订阅专栏

本文提供了一个使用Java实现的简单网页爬虫示例代码，该爬虫利用Apache Commons HttpClient（3.x）库来抓取网页并保存到本地。通过GetMethod发起GET请求，判断响应状态码是否为200来确定是否成功获取网页，然后将内容输出到一个以URL中最后一个“/”之后的部分命名的.html文件中。

package com.gewb;

import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;

/**
 * Minimal page downloader built on Apache Commons HttpClient 3.x.
 *
 * <p>Fetches a URL with an HTTP GET and, when the server answers 200, writes
 * the response body to a local {@code .html} file.
 */
public class Spider {
	/** Client instance shared by every call to {@link #downloadPage(String)}. */
	private static final HttpClient httpClient = new HttpClient();

	/**
	 * Downloads the page at {@code path} and stores it in a local file.
	 *
	 * <p>The file name is the part of {@code path} after its last {@code '/'}
	 * with {@code ".html"} appended (the whole string when it contains no
	 * {@code '/'}), so {@code http://www.baidu.com} is saved as
	 * {@code www.baidu.com.html}. The name is a relative path. Note that a URL
	 * ending in {@code '/'} therefore produces a file named just {@code .html}.
	 *
	 * <p>Only status code 200 is treated as success; for any other status
	 * nothing is written and {@code false} is returned.
	 *
	 * @param path
	 *            URL of the page to download
	 * @return {@code true} if the server answered 200 and the body was written,
	 *         {@code false} for any other status code
	 * @throws Exception
	 *             if executing the request, reading the response stream or
	 *             writing the local file fails
	 */
	public static boolean downloadPage(String path) throws Exception {
		// Build the GET request for the target URL.
		GetMethod getMethod = new GetMethod(path);
		try {
			// Execute the request; the return value is the HTTP status code.
			int statusCode = httpClient.executeMethod(getMethod);

			// For simplicity only a 200 response is handled.
			if (statusCode != HttpStatus.SC_OK) {
				System.out.println("失败");
				return false;
			}

			InputStream input = getMethod.getResponseBodyAsStream();
			try {
				// Derive the local file name from the URL.
				String filename = path.substring(path.lastIndexOf('/') + 1)
						+ ".html";

				OutputStream output = new FileOutputStream(filename);
				try {
					// A response without a body yields a null stream; in that
					// case the (empty) file is still created.
					if (input != null) {
						byte[] buffer = new byte[1024];
						int len;
						while ((len = input.read(buffer)) != -1) {
							output.write(buffer, 0, len);
						}
					}
				} finally {
					// Close the file even when reading or writing failed.
					output.close();
				}
			} finally {
				if (input != null) {
					input.close();
				}
			}

			System.out.println("成功");
			return true;
		} finally {
			// Always hand the connection back to the connection manager,
			// whatever the outcome of the request.
			getMethod.releaseConnection();
		}
	}

	/**
	 * Demo entry point: downloads the Baidu home page and prints the stack
	 * trace of any failure.
	 *
	 * @param args
	 *            command-line arguments (unused)
	 */
	public static void main(String[] args) {
		try {
			// Fetch the Baidu home page and save it locally.
			Spider.downloadPage("http://www.baidu.com");
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

}