HttpClient模拟登陆并获取指定页面的内容

最新推荐文章于 2020-04-09 14:05:28 发布

原创最新推荐文章于 2020-04-09 14:05:28 发布 · 434 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#java

HttpClient 专栏收录该内容

0 篇文章

订阅专栏

本文详细记录了使用HttpClient进行网站内容采集的方法，包括登录、获取指定路径页面内容及保存为.htm文件的操作。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

前段时间工作上需要采集别的网站上的内容，然后在自己的系统中展示数据。查找资料后，最终选用了HttpClient。下面把自己摸索出的方法记录下来，以便以后使用：

1、模拟登录


/** Logger used by the methods of this class. */
private static final Logger logger = Logger.getLogger(IpManagerRobot.class);

/**
 * HTTP client instance shared by every request this class issues.
 * NOTE(review): presumably one client is reused so that the session established by
 * login() carries over to the later GET requests - confirm against the cookie handling.
 */
private DefaultHttpClient httpClient = new DefaultHttpClient();

/** Outcome of the most recent login() call; true when that call was treated as successful. */
private boolean loging = false;

/**
 * Logs in by POSTing the login form and records the outcome in {@code loging}.
 *
 * <p>A 301, 302, 303 or 307 answer is treated as a successful login; every other
 * status, and the absence of any response, is treated as a failure.
 *
 * @return {@code true} when the login was treated as successful, {@code false} otherwise
 * @throws Exception if the form parameters cannot be encoded
 */
public boolean login() throws Exception{

	// The target is the "action" attribute of the site's login form; on other sites
	// a plain ".../login.jsp" style address may work instead.
	String loginForm = "http://124.238.214.79/platform/j_spring_security_check";

	HttpPost httpPost = new HttpPost(loginForm);

	// Request headers (these can be observed with a tool such as HttpWatch).
	httpPost.setHeader("User-Agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; WOW64; Trident/4.0; znwb6600; SLCC1; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30618)");
	httpPost.setHeader("Referer", "http://124.238.214.79/platform/");
	httpPost.setHeader("Content-Type", "application/x-www-form-urlencoded");

	// Parameters submitted by the login form.
	// NOTE(review): the credentials are hard-coded here; they should come from configuration.
	List<NameValuePair> params = new ArrayList<NameValuePair>();
	params.add(new BasicNameValuePair("j_username", "stc2012"));
	params.add(new BasicNameValuePair("j_password", "q1w2e3r4"));
	params.add(new BasicNameValuePair("domain","124.238.214.79"));
	params.add(new BasicNameValuePair("url","/platform/"));
	params.add(new BasicNameValuePair("expiry",""));

	// An encoding failure is propagated instead of being swallowed: sending the
	// request without its form body could never log in.
	httpPost.setEntity(new UrlEncodedFormEntity(params, HTTP.UTF_8));

	// postMethod returns null when the request itself failed.
	HttpResponse res = postMethod(httpPost);
	if (res == null) {
		logger.info("登陆失败");
		loging = false;
		return loging;
	}

	int status = res.getStatusLine().getStatusCode();
	loging = status == HttpStatus.SC_MOVED_PERMANENTLY
			|| status == HttpStatus.SC_MOVED_TEMPORARILY
			|| status == HttpStatus.SC_SEE_OTHER
			|| status == HttpStatus.SC_TEMPORARY_REDIRECT;

	if (loging) {
		logger.info("登陆成功");
	} else {
		logger.info("登陆失败");
	}
	return loging;
}


/**
 * Executes the given POST request with the shared client and always aborts the
 * request afterwards.
 *
 * @param post the request to send
 * @return the response, or {@code null} when executing the request threw an I/O
 *         or protocol error (the stack trace is printed in that case)
 */
public HttpResponse postMethod(HttpPost post) {
	HttpResponse response = null;
	try {
		response = httpClient.execute(post);
	} catch (IOException e) {
		// ClientProtocolException is an IOException, so this single handler
		// covers both failure kinds.
		e.printStackTrace();
	} finally {
		// The request is aborted whether or not it succeeded.
		post.abort();
	}
	return response;
}

2、登录以后就可以获取指定路径的页面内容了


/**
 * Fetches the page at the given URL and returns its body decoded as UTF-8.
 *
 * @param url address of the page to fetch
 * @return the page content, or an empty string when the status is not 200 or the
 *         response carries no body
 * @throws Exception if the request or reading the body fails
 */
public String get(String url) throws Exception{

	HttpGet get = new HttpGet(url);
	try {
		HttpResponse response = httpClient.execute(get);
		if (response.getStatusLine().getStatusCode() != HttpStatus.SC_OK) {
			return "";
		}

		// A response may have no entity at all; treat that like an empty page
		// instead of failing with a NullPointerException.
		HttpEntity entity = response.getEntity();
		if (entity == null) {
			return "";
		}

		BufferedReader br = new BufferedReader(new InputStreamReader(entity.getContent(), "UTF-8"));
		try {
			return IOUtils.toString(br);
		} finally {
			// Close the reader even when reading fails.
			br.close();
		}
	} finally {
		get.abort();
	}

}

另一种方法获取指定路径页面的内容


/**
 * Fetches the page at the given URL through a {@code BasicResponseHandler} and
 * returns its text, repairing UTF-8 content that was decoded as ISO-8859-1.
 *
 * @param url address of the page to fetch
 * @return the page text, or an empty string when the handler yields no body
 * @throws Exception if the request fails or the handler rejects the response
 */
public String getText2(String url) throws Exception{

	HttpGet httpget = new HttpGet(url);
	try {
		// Use the response handler that ships with HttpClient.
		ResponseHandler<String> responseHandler = new BasicResponseHandler();
		String responseBody = httpClient.execute(httpget, responseHandler);
		if (responseBody == null) {
			return "";
		}
		// Fix garbled non-ASCII text, but only when the body really was
		// mis-decoded; see repairUtf8DecodedAsLatin1.
		return repairUtf8DecodedAsLatin1(responseBody);
	} finally {
		httpget.abort();
	}
}

/**
 * Re-decodes a string as UTF-8 when it looks like UTF-8 bytes that were read as
 * ISO-8859-1, and returns it unchanged otherwise.
 *
 * <p>Converting unconditionally destroys text that was already decoded correctly:
 * any character above U+00FF cannot be mapped back to ISO-8859-1 and is lost.
 */
private static String repairUtf8DecodedAsLatin1(String body) throws UnsupportedEncodingException {
	for (int i = 0; i < body.length(); i++) {
		if (body.charAt(i) > 0xFF) {
			// Cannot be the product of an ISO-8859-1 decode, so it is already correct.
			return body;
		}
	}
	byte[] raw = body.getBytes("ISO-8859-1");
	try {
		// Strict decoding: malformed input means the bytes were not UTF-8.
		return java.nio.charset.Charset.forName("UTF-8").newDecoder()
				.onMalformedInput(java.nio.charset.CodingErrorAction.REPORT)
				.onUnmappableCharacter(java.nio.charset.CodingErrorAction.REPORT)
				.decode(java.nio.ByteBuffer.wrap(raw))
				.toString();
	} catch (java.nio.charset.CharacterCodingException e) {
		// Genuine ISO-8859-1 text; keep it as it is.
		return body;
	}
}

保存指定路径页面为.htm文件


/**
 * Downloads a fixed history page and stores its raw bytes in {@code d:/tt.htm}.
 *
 * <p>Nothing is written when the status is not 200 or the response has no body.
 *
 * @throws Exception if the request, reading the body or writing the file fails
 */
public void saveCollectionPage() throws Exception{

	// The request is built from an absolute URL.
	HttpGet httpGet = new HttpGet("http://124.238.214.79/platform/pages/getWssHistory.action?startDate=2013-04-07&endDate=2013-05-07&pageContext.currentpage=1");
	try {
		HttpResponse response = httpClient.execute(httpGet);
		if (HttpStatus.SC_OK != response.getStatusLine().getStatusCode()) {
			return;
		}

		HttpEntity entity = response.getEntity();
		if (entity == null) {
			return;
		}

		InputStream input = entity.getContent();
		try {
			FileOutputStream output = new FileOutputStream(new File("d:/tt.htm"));
			try {
				byte[] buffer = new byte[1024];
				int read;
				while ((read = input.read(buffer)) != -1) {
					output.write(buffer, 0, read);
				}
				output.flush();
			} finally {
				// Close the file even when the copy fails part-way.
				output.close();
			}
		} finally {
			input.close();
		}
	} finally {
		// Release the request on every path, including non-200 answers and
		// failures, as the other fetch methods of this class do.
		httpGet.abort();
	}
}

读取htm文件


/**
 * Reads a text file and returns its lines joined together without separators.
 *
 * <p>Line terminators are dropped, as {@code readLine()} strips them and nothing
 * is inserted between lines. A read failure is logged and whatever was read up to
 * that point is returned.
 *
 * @param fileName path of the file to read
 * @param encode name of the charset used to decode the file
 * @return the concatenated lines; empty when the file could not be opened
 */
public String readTextFile(String fileName,String encode){
	StringBuilder str = new StringBuilder();
	FileInputStream raw = null;
	try {
		raw = new FileInputStream(new File(fileName));
		BufferedReader in = new BufferedReader(new InputStreamReader(raw, encode));
		String dataLine;
		while ((dataLine = in.readLine()) != null) {
			str.append(dataLine);
		}
	} catch (Exception e) {
		// Keep the cause so the failure can be diagnosed.
		logger.error("------------->文件读取失败！", e);
	} finally {
		// Closing the underlying stream releases the file on every path, including
		// an unsupported charset name that fails before the reader exists.
		if (raw != null) {
			try {
				raw.close();
			} catch (IOException ignored) {
				// Nothing useful can be done if closing fails.
			}
		}
	}
	return str.toString();
}

</div>