package com.xinhuanet.cloudDesk.controller;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpConnectionManager;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
import org.htmlparser.Parser;
import org.htmlparser.visitors.HtmlPage;
/**
 * Command-line demo that downloads one web page through an HTTP proxy, works
 * out the page's character encoding and prints the decoded page followed by
 * its title.
 */
public class R {

    /** Page fetched by {@link #main(String[])}. */
    private static final String TARGET_URL = "http://www.poetry4cn.com";

    /**
     * Socket and connection timeout handed to the connection manager.
     * NOTE(review): HttpClient documents these values as milliseconds, which
     * makes this roughly 5.5 hours - confirm such a long wait is intended.
     */
    private static final int TIMEOUT = 20000000;

    /**
     * Single-byte charset used to turn the raw body into text for sniffing and
     * as the last-resort decoding. Every byte maps to exactly one char, so the
     * ASCII {@code <meta>} markup is readable whatever the page's real encoding
     * or the platform default charset is.
     */
    private static final String SNIFF_CHARSET = "ISO-8859-1";

    /**
     * Matches a {@code charset=} declaration inside a single {@code <meta>} tag.
     * {@code [^>]+?} keeps the match inside one tag (and lets it span line
     * breaks within that tag); group 1 is the charset name. Compiled once
     * because the pattern never changes.
     */
    private static final Pattern META_CHARSET = Pattern.compile(
            "<meta[^>]+?charset=[^\\w]?([-\\w]+)", Pattern.CASE_INSENSITIVE);

    /**
     * Fetches {@link #TARGET_URL}, prints the charset used for decoding, the
     * decoded page and the page title to standard output.
     *
     * @param args ignored
     * @throws Exception if the request, the decoding or the HTML parsing fails
     */
    public static void main(String[] args) throws Exception {
        HttpClient httpClient = createHttpClient();
        GetMethod methodGet = new GetMethod(TARGET_URL);
        try {
            httpClient.executeMethod(methodGet);

            // Read the body once and reuse the bytes for sniffing and decoding.
            byte[] body = methodGet.getResponseBody();
            if (body == null) {
                System.err.println("No response body received from " + TARGET_URL);
                return;
            }

            String sniffed = getCharSet(new String(body, SNIFF_CHARSET));
            String charset = resolveCharset(sniffed, methodGet.getResponseCharSet());
            System.out.println("getCharSet:" + charset);

            String responseGet = new String(body, charset);
            System.out.println(responseGet);

            Parser myParser = Parser.createParser(responseGet, charset);
            HtmlPage visitor = new HtmlPage(myParser);
            myParser.visitAllNodesWith(visitor);
            System.out.println("title:" + visitor.getTitle());
        } finally {
            // Always hand the connection back, even when the request or parsing fails.
            methodGet.releaseConnection();
        }
    }

    /**
     * Extracts the charset name declared in the first matching {@code <meta>}
     * tag of an HTML document. Matching is case-insensitive.
     *
     * @param content HTML text to inspect; may be {@code null}
     * @return the declared charset name, or {@code null} when {@code content}
     *         is {@code null} or contains no such declaration
     */
    public static String getCharSet(String content) {
        if (content == null) {
            return null;
        }
        Matcher matcher = META_CHARSET.matcher(content);
        return matcher.find() ? matcher.group(1) : null;
    }

    /** Builds the proxy-configured client and applies the connection settings. */
    private static HttpClient createHttpClient() {
        HttpClient httpClient = new HttpClient();
        httpClient.getHostConfiguration().setProxy("202.84.17.41", 8080);
        HttpConnectionManager httpConnManager = httpClient.getHttpConnectionManager();
        if (httpConnManager != null) {
            HttpConnectionManagerParams mgrParams = new HttpConnectionManagerParams();
            mgrParams.setSoTimeout(TIMEOUT);
            mgrParams.setTcpNoDelay(true);
            mgrParams.setConnectionTimeout(TIMEOUT);
            mgrParams.setLinger(0);
            mgrParams.setStaleCheckingEnabled(false);
            httpConnManager.setParams(mgrParams);
        }
        return httpClient;
    }

    /**
     * Picks the charset to decode with: the sniffed {@code <meta>} value if the
     * JVM supports it, otherwise the charset reported for the HTTP response,
     * otherwise {@link #SNIFF_CHARSET}.
     */
    private static String resolveCharset(String sniffed, String headerCharset) {
        if (isUsableCharset(sniffed)) {
            return sniffed;
        }
        if (isUsableCharset(headerCharset)) {
            return headerCharset;
        }
        return SNIFF_CHARSET;
    }

    /** Returns whether {@code name} is a non-null charset name this JVM can decode. */
    private static boolean isUsableCharset(String name) {
        if (name == null) {
            return false;
        }
        try {
            return java.nio.charset.Charset.isSupported(name);
        } catch (IllegalArgumentException ignored) {
            // Syntactically illegal charset name: treat it the same as unsupported.
            return false;
        }
    }
}
使用 HttpClient 和 HTMLParser 获取指定网址的 title
最新推荐文章于 2020-08-27 10:43:12 发布