zz: http://www.iteye.com/topic/638206
目的:
使用HTTPClient4.0.1登录到人人网,并从特定的网页抓取数据。
总结&注意事项:
- HttpClient(DefaultHttpClient)代表了一个会话,在同一个会话中,HttpClient对cookie自动进行管理(当然, 也可以在程序中进行控制)。
- 在同一个会话中,当使用post或是get发起一个新的请求时,一般需要先调用前一个请求的abort()方法,否则会抛出异常。
- 有些网站登录成功后会重定向(302, 303),比如这里的人人网。如果发出的是post请求,需要从响应头中取出location,并再次向网站发送请求,以获取最终数据。
- 抓取程序不要运行得过于频繁,大部分站点都有防止刷网站的机制。人人网访问过于频繁会锁账号。
- 使用录制工具录制出登录时向网站发出的请求参数。在这里,我使用了badboy,导出成jmeter文件,在jmeter中就可以看到登录时向网站发送的参数列表和相应的值。
- 人人网属于登录流程比较简单的网站,后一篇会介绍一家比较难搞的网站。
代码:
- public class RenRen {
- // The configuration items
- private static String userName = "YourMailinRenren" ;
- private static String password = "YourPassword" ;
- private static String redirectURL = "http://blog.renren.com/blog/304317577/449470467 " ;
- // Don't change the following URL
- private static String renRenLoginURL = "http://www.renren.com/PLogin.do " ;
- // The HttpClient is used in one session
- private HttpResponse response;
- private DefaultHttpClient httpclient = new DefaultHttpClient();
- private boolean login() {
- HttpPost httpost = new HttpPost(renRenLoginURL);
- // All the parameters post to the web site
- List<NameValuePair> nvps = new ArrayList<NameValuePair>();
- nvps.add(new BasicNameValuePair( "origURL" , redirectURL));
- nvps.add(new BasicNameValuePair( "domain" , "renren.com" ));
- nvps.add(new BasicNameValuePair( "isplogin" , "true" ));
- nvps.add(new BasicNameValuePair( "formName" , "" ));
- nvps.add(new BasicNameValuePair( "method" , "" ));
- nvps.add(new BasicNameValuePair( "submit" , "登录" ));
- nvps.add(new BasicNameValuePair( "email" , userName));
- nvps.add(new BasicNameValuePair( "password" , password));
- try {
- httpost.setEntity(new UrlEncodedFormEntity(nvps, HTTP.UTF_8));
- response = httpclient.execute(httpost);
- } catch (Exception e) {
- e.printStackTrace();
- return false ;
- } finally {
- httpost.abort();
- }
- return true ;
- }
- private String getRedirectLocation() {
- Header locationHeader = response.getFirstHeader("Location" );
- if (locationHeader == null ) {
- return null ;
- }
- return locationHeader.getValue();
- }
- private String getText(String redirectLocation) {
- HttpGet httpget = new HttpGet(redirectLocation);
- // Create a response handler
- ResponseHandler<String> responseHandler = new BasicResponseHandler();
- String responseBody = "" ;
- try {
- responseBody = httpclient.execute(httpget, responseHandler);
- } catch (Exception e) {
- e.printStackTrace();
- responseBody = null ;
- } finally {
- httpget.abort();
- httpclient.getConnectionManager().shutdown();
- }
- return responseBody;
- }
- public void printText() {
- if (login()) {
- String redirectLocation = getRedirectLocation();
- if (redirectLocation != null ) {
- System.out.println(getText(redirectLocation));
- }
- }
- }
- public static void main(String[] args) {
- RenRen renRen = new RenRen();
- renRen.printText();
- }
- }