Java下载网页的方法主要有两种:JDK自带的HttpURLConnection类和Apache的HttpClient类库,两种方法各有优点。另外,对于中文乱码的处理,本文在代码中有详细的体现和比较,能够很好地消除中文乱码问题,供大家参考。下面就让我们通过代码来体会吧!
方法1:HttpURLConnection的两种不同解码方式
package com.learn.http.impl;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import com.learn.http.Http;
import com.learn.util.SingleMatch;
public class HttpURLConnectionImp1 implements Http {
    /**
     * Downloads a page with the JDK's built-in HttpURLConnection (no extra
     * libraries required) and decodes the body with the given charset.
     * <p>
     * The raw bytes are collected first and decoded in one step at the end, so a
     * multi-byte character that straddles two 1024-byte reads is never cut in half.
     *
     * @param pageUrl  address of the page to download
     * @param encoding charset name used to decode the response body
     * @return the decoded body; an empty string when the URL is malformed, the
     *         response status is not 200 or the charset name is not supported;
     *         if reading fails midway, whatever was received before the failure
     */
    public String getHtmlcodeWithoutHeader(String pageUrl, String encoding) {
        // Fully qualified because this file does not import ByteArrayOutputStream.
        java.io.ByteArrayOutputStream body = new java.io.ByteArrayOutputStream();
        InputStream in = null;
        try {
            URL url = new URL(pageUrl);
            HttpURLConnection conn = (HttpURLConnection) url.openConnection();
            if (conn.getResponseCode() == HttpURLConnection.HTTP_OK) {
                in = conn.getInputStream();
                byte[] buf = new byte[1024];
                int len;
                while ((len = in.read(buf)) != -1) {
                    // Keep the bytes undecoded: decoding chunk by chunk would
                    // corrupt characters split across a buffer boundary.
                    body.write(buf, 0, len);
                }
            } else {
                System.err.println("访问网络失败!" + conn.getResponseCode());
            }
        } catch (MalformedURLException e) {
            System.err.println("url格式不规范:" + e.getMessage());
        } catch (IOException e) {
            System.err.println("IO操作错误:" + e.getMessage());
        } finally {
            // Close on every path, not only after a successful read.
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
        }
        try {
            return body.toString(encoding);
        } catch (java.io.UnsupportedEncodingException e) {
            System.err.println("IO操作错误:" + e.getMessage());
            return "";
        }
    }

    /**
     * Downloads the page as UTF-8 first, looks for a charset declaration in the
     * markup, and downloads it again with that charset when it differs from UTF-8.
     *
     * @param url address of the page to download
     * @return the page content decoded with the declared charset when one was
     *         found, otherwise the UTF-8 decoded content
     */
    public String getHtml(String url) {
        String firstEncoding = "utf-8";
        String html = getHtmlcodeWithoutHeader(url, firstEncoding);
        // NOTE(review): SingleMatch.match presumably returns the captured charset
        // name, or an empty string when nothing matches - confirm against SingleMatch.
        String encoding = SingleMatch.match(html, "charset=\"?(\\S*?)\"|charset='?(\\S*?)'");
        // Compare by content; `==` on strings only tests object identity.
        boolean declared = encoding != null && encoding.length() > 0;
        if (declared && !encoding.equalsIgnoreCase(firstEncoding)) {
            html = getHtmlcodeWithoutHeader(url, encoding);
        }
        return html;
    }
}
package com.learn.http.impl;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import com.learn.http.Http;
import com.learn.util.SingleMatch;
public class HttpURLConnectionImp2 implements Http {
    /**
     * Downloads a page with the JDK's built-in HttpURLConnection and decodes it
     * through an InputStreamReader, reading line by line. Every line is
     * re-terminated with "\r\n" in the returned text.
     *
     * @param pageUrl  address of the page to download
     * @param encoding charset name used to decode the response body
     * @return the decoded body; an empty string when the URL is malformed or the
     *         charset name is not supported; if reading fails midway, the lines
     *         read before the failure
     */
    public String getHtmlcodeWithoutHeader(String pageUrl, String encoding) {
        // To go through a proxy server, build one and pass it to openConnection:
        // Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress("127.0.0.1", 8580));
        StringBuilder sb = new StringBuilder();
        InputStream in = null;
        BufferedReader br = null;
        try {
            URL url = new URL(pageUrl);
            // HttpURLConnection conn = (HttpURLConnection) url.openConnection(proxy);
            HttpURLConnection conn = (HttpURLConnection) url.openConnection();
            in = conn.getInputStream();
            br = new BufferedReader(new InputStreamReader(in, encoding));
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line);
                sb.append("\r\n");
            }
        } catch (MalformedURLException e) {
            System.err.println("url格式不规范:" + e.getMessage());
        } catch (IOException e) {
            System.err.println("IO操作错误:" + e.getMessage());
        } finally {
            // Release the reader and the stream on every path. The stream is
            // closed separately because the reader is never created when the
            // charset name is rejected.
            closeQuietly(br);
            closeQuietly(in);
        }
        return sb.toString();
    }

    /** Closes the resource if it is non-null, ignoring any failure to close. */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Nothing useful can be done if closing fails.
        }
    }

    /**
     * Downloads the page as UTF-8 first, looks for a charset declaration in the
     * markup, and downloads it again with that charset when it differs from UTF-8.
     *
     * @param url address of the page to download
     * @return the page content decoded with the declared charset when one was
     *         found, otherwise the UTF-8 decoded content
     */
    public String getHtml(String url) {
        String firstEncoding = "utf-8";
        String html = getHtmlcodeWithoutHeader(url, firstEncoding);
        // NOTE(review): SingleMatch.match presumably returns the captured charset
        // name, or an empty string when nothing matches - confirm against SingleMatch.
        String encoding = SingleMatch.match(html, "charset=\"?(\\S*?)\"|charset='?(\\S*?)'");
        // Compare by content; `==` on strings only tests object identity.
        boolean declared = encoding != null && encoding.length() > 0;
        if (declared && !encoding.equalsIgnoreCase(firstEncoding)) {
            html = getHtmlcodeWithoutHeader(url, encoding);
        }
        return html;
    }
}
方法2:HttpClient的两种访问网页方式
package com.learn.http.impl;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.StatusLine;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import com.learn.http.Http;
import com.learn.util.SingleMatch;
@SuppressWarnings("deprecation")
public class HttpClientImp1 implements Http {
    /**
     * Downloads a page with Apache HttpClient 4.x via DefaultHttpClient, which is
     * deprecated in newer HttpClient releases. The status line and all response
     * headers are printed to standard output.
     *
     * @param pageUrl  address of the page to download
     * @param encoding charset name used to decode the response body
     * @return the decoded body, or {@code null} when the request fails or the
     *         response carries no entity
     */
    public String getHtmlcodeWithoutHeader(String pageUrl, String encoding) {
        HttpClient client = new DefaultHttpClient();
        HttpGet get = new HttpGet(pageUrl);
        String s = null;
        InputStream in = null;
        // A proxy could be described with: new HttpHost("127.0.0.1", 443)
        try {
            HttpResponse response = client.execute(get);
            StatusLine status = response.getStatusLine();
            System.out.println("状态行:" + status);
            Header[] heads = response.getAllHeaders();
            System.out.println("首部行:");
            for (Header h : heads) {
                System.out.println("名称:" + h.getName() + " 值:" + h.getValue());
            }
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                in = entity.getContent();
                s = inputStream2String(in, encoding);
            }
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            // A new client is created per call, so its connection manager must be
            // shut down here or its pooled connections are never released.
            client.getConnectionManager().shutdown();
        }
        return s;
    }

    /**
     * Reads the whole stream as text in the given charset, terminating every line
     * with "\n". Errors are printed and whatever was read so far is returned.
     */
    private String inputStream2String(InputStream in, String charset) {
        StringBuilder sb = new StringBuilder();
        try {
            BufferedReader br = new BufferedReader(new InputStreamReader(in, charset));
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line).append('\n');
            }
            br.close();
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return sb.toString();
    }

    /**
     * Downloads the page as UTF-8 first, looks for a charset declaration in the
     * markup, and downloads it again with that charset when it differs from UTF-8.
     *
     * @param url address of the page to download
     * @return the page content, or {@code null} when the download failed
     */
    public String getHtml(String url) {
        String firstEncoding = "utf-8";
        String html = getHtmlcodeWithoutHeader(url, firstEncoding);
        if (html == null) {
            // The download failed; there is no markup to inspect.
            return html;
        }
        // NOTE(review): SingleMatch.match presumably returns the captured charset
        // name, or an empty string when nothing matches - confirm against SingleMatch.
        String encoding = SingleMatch.match(html, "charset=\"?(\\S*?)\"|charset='?(\\S*?)'");
        // Compare by content; `==` on strings only tests object identity.
        boolean declared = encoding != null && encoding.length() > 0;
        if (declared && !encoding.equalsIgnoreCase(firstEncoding)) {
            html = getHtmlcodeWithoutHeader(url, encoding);
        }
        return html;
    }
}
package com.learn.http.impl;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import javax.net.ssl.HttpsURLConnection;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.methods.GetMethod;
import com.learn.http.Http;
import com.learn.util.SingleMatch;
public class HttpClientImp2 implements Http {
    // Requires the commons-httpclient library, which offers more features,
    // for example straightforward proxy-server configuration (sample below).
    //static String PROXY_HOST = "127.0.0.1";
    //static int PROXY_PORT = 8580;

    /** Client instance shared by every call made through this class. */
    static HttpClient client = null;
    static {
        client = new HttpClient();
        /*client.getHostConfiguration().setProxy(PROXY_HOST,PROXY_PORT);
        Credentials credentials = new Credentials() {}; // anonymous proxy authentication
        AuthScope authscope = new AuthScope(PROXY_HOST,PROXY_PORT);
        client.getState().setProxyCredentials(authscope, credentials); */
        //client.getParams().setAuthenticationPreemptive(true);
    }

    /**
     * Downloads a page with the shared commons-httpclient client and decodes the
     * body with the given charset.
     *
     * @param pageUrl  address of the page to download
     * @param encoding charset name used to decode the response body
     * @return the decoded body, or {@code null} when the request fails, the
     *         status is not 200 or the response has no body
     */
    public String getHtmlcodeWithoutHeader(String pageUrl, String encoding) {
        String response = null;
        GetMethod getMethod = new GetMethod(pageUrl);
        try {
            client.executeMethod(getMethod);
            if (getMethod.getStatusCode() == HttpsURLConnection.HTTP_OK) {
                InputStream in = getMethod.getResponseBodyAsStream();
                // The body stream is null when the response has no body.
                if (in != null) {
                    try {
                        response = inputStream2String(in, encoding);
                    } finally {
                        in.close();
                    }
                }
            } else {
                System.err.println("访问网络失败!");
            }
        } catch (HttpException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Always hand the connection back to the shared client's connection
            // manager so that later requests can reuse it.
            getMethod.releaseConnection();
        }
        return response;
    }

    /**
     * Reads the whole stream as text in the given charset, terminating every line
     * with "\n". Errors are printed and whatever was read so far is returned.
     */
    private String inputStream2String(InputStream in, String charset) {
        StringBuilder sb = new StringBuilder();
        try {
            BufferedReader br = new BufferedReader(new InputStreamReader(in, charset));
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line).append('\n');
            }
            br.close();
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return sb.toString();
    }

    /**
     * Downloads the page as UTF-8 first, looks for a charset declaration in the
     * markup, and downloads it again with that charset when it differs from UTF-8.
     *
     * @param url address of the page to download
     * @return the page content, or {@code null} when the download failed
     */
    public String getHtml(String url) {
        String firstEncoding = "utf-8";
        String html = getHtmlcodeWithoutHeader(url, firstEncoding);
        if (html == null) {
            // The download failed; there is no markup to inspect.
            return html;
        }
        // NOTE(review): SingleMatch.match presumably returns the captured charset
        // name, or an empty string when nothing matches - confirm against SingleMatch.
        String encoding = SingleMatch.match(html, "charset=\"?(\\S*?)\"|charset='?(\\S*?)'");
        // Compare by content; `==` on strings only tests object identity.
        boolean declared = encoding != null && encoding.length() > 0;
        if (declared && !encoding.equalsIgnoreCase(firstEncoding)) {
            html = getHtmlcodeWithoutHeader(url, encoding);
        }
        return html;
    }
}
每个类的方法注释中都有详细的优缺点说明,希望读者能从中受益。

下载源代码:java网页下载的四种不同实现方法