java下载网页的方法

本文介绍了Java下载网页的两种方法——HttpURLConnection和HttpClient,并重点讨论了如何处理中文乱码问题,提供了详细的代码示例。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

    Java下载网页的方法主要有两种:Java自带的HttpURLConnection类和Apache的HttpClient类库,这两种方法各有优点。另外,对于中文乱码的处理,本文在代码中有详细的体现和比较,能够很好地消除中文乱码问题,供大家参考。下面就让我们在代码中领悟吧!

方法1:HttpURLConnection的两种不同解码方式

package com.learn.http.impl;

import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;

import com.learn.http.Http;
import com.learn.util.SingleMatch;
public class HttpURLConnectionImp1 implements Http {
	/** Charset assumed for the first download, before the page's own declaration has been read. */
	private static final String DEFAULT_ENCODING = "utf-8";

	/**
	 * Downloads a page with the JDK's built-in {@link HttpURLConnection}; no extra library is needed.
	 * <p>
	 * The raw bytes of the whole response are collected first and decoded in a single step.
	 * Decoding every 1024-byte chunk on its own would corrupt any multi-byte character (for
	 * example a Chinese character) that happens to straddle a chunk boundary.
	 *
	 * @param pageUrl  address of the page to download
	 * @param encoding name of the charset used to decode the response body
	 * @return the decoded body; the part received so far if reading fails midway; an empty string
	 *         when nothing was read, the status is not 200, or the charset name is unsupported
	 */
	public String getHtmlcodeWithoutHeader(String pageUrl, String encoding) {
		// Fully qualified because this file does not import ByteArrayOutputStream.
		java.io.ByteArrayOutputStream body = new java.io.ByteArrayOutputStream();
		InputStream in = null;
		try {
			HttpURLConnection conn = (HttpURLConnection) new URL(pageUrl).openConnection();
			if (conn.getResponseCode() == HttpURLConnection.HTTP_OK) {
				in = conn.getInputStream();
				byte[] buf = new byte[1024];
				int len;
				while ((len = in.read(buf)) != -1) {
					body.write(buf, 0, len);
				}
			} else {
				System.err.println("访问网络失败!"+conn.getResponseCode());
			}
		} catch (MalformedURLException e) {
			System.err.println("url格式不规范:"+e.getMessage());
		} catch (IOException e) {
			System.err.println("IO操作错误:"+e.getMessage());
		} finally {
			closeQuietly(in);
		}
		try {
			// Decode once, over the complete byte sequence.
			return body.toString(encoding);
		} catch (java.io.UnsupportedEncodingException e) {
			System.err.println("IO操作错误:"+e.getMessage());
			return "";
		}
	}

	/**
	 * Downloads a page and decodes it with the charset the page itself declares.
	 * <p>
	 * The page is fetched as UTF-8 first. If a different charset is found in the markup, the page
	 * is fetched a second time with that charset.
	 *
	 * @param url address of the page to download
	 * @return the page source, or an empty string if the download failed
	 */
	public String getHtml(String url) {
		String html = getHtmlcodeWithoutHeader(url, DEFAULT_ENCODING);
		String encoding = SingleMatch.match(html, "charset=\"?(\\S*?)\"|charset='?(\\S*?)'");
		// Compare string contents, not references; a missing or identical charset needs no re-download.
		if (encoding == null || encoding.length() == 0 || encoding.equalsIgnoreCase(DEFAULT_ENCODING)) {
			return html;
		}
		return getHtmlcodeWithoutHeader(url, encoding);
	}

	/** Closes the stream if it was opened; a failure while closing is deliberately ignored. */
	private static void closeQuietly(InputStream in) {
		if (in == null) {
			return;
		}
		try {
			in.close();
		} catch (IOException ignored) {
			// Nothing useful can be done if close fails.
		}
	}
}

package com.learn.http.impl;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;

import com.learn.http.Http;
import com.learn.util.SingleMatch;

public class HttpURLConnectionImp2 implements Http {
	/** Charset assumed for the first download, before the page's own declaration has been read. */
	private static final String DEFAULT_ENCODING = "utf-8";

	/**
	 * Downloads a page with the JDK's built-in {@link HttpURLConnection}, decoding it through an
	 * {@link InputStreamReader}.
	 * <p>
	 * The reader decodes the byte stream continuously, so multi-byte characters (for example
	 * Chinese text) are never split. Every line read is re-terminated with {@code "\r\n"}.
	 * <p>
	 * To go through an HTTP proxy, pass a {@code java.net.Proxy} to {@code url.openConnection(proxy)}.
	 *
	 * @param pageUrl  address of the page to download
	 * @param encoding name of the charset used to decode the response body
	 * @return the decoded body; the lines read so far if an error occurs midway, possibly empty
	 */
	public String getHtmlcodeWithoutHeader(String pageUrl, String encoding) {
		StringBuilder sb = new StringBuilder();
		InputStream in = null;
		BufferedReader br = null;
		try {
			URL url = new URL(pageUrl);
			HttpURLConnection conn = (HttpURLConnection) url.openConnection();
			in = conn.getInputStream();
			br = new BufferedReader(new InputStreamReader(in, encoding));
			String line;
			while ((line = br.readLine()) != null) {
				sb.append(line).append("\r\n");
			}
		} catch (MalformedURLException e) {
			System.err.println("url格式不规范:"+e.getMessage());
		} catch (IOException e) {
			System.err.println("IO操作错误:"+e.getMessage());
		} finally {
			// Release the reader and the stream even when reading failed.
			closeQuietly(br);
			closeQuietly(in);
		}
		return sb.toString();
	}

	/**
	 * Downloads a page and decodes it with the charset the page itself declares.
	 * <p>
	 * The page is fetched as UTF-8 first. If a different charset is found in the markup, the page
	 * is fetched a second time with that charset.
	 *
	 * @param url address of the page to download
	 * @return the page source, or an empty string if the download failed
	 */
	public String getHtml(String url) {
		String html = getHtmlcodeWithoutHeader(url, DEFAULT_ENCODING);
		String encoding = SingleMatch.match(html, "charset=\"?(\\S*?)\"|charset='?(\\S*?)'");
		// Compare string contents, not references; a missing or identical charset needs no re-download.
		if (encoding == null || encoding.length() == 0 || encoding.equalsIgnoreCase(DEFAULT_ENCODING)) {
			return html;
		}
		return getHtmlcodeWithoutHeader(url, encoding);
	}

	/** Closes the resource if it was opened; a failure while closing is deliberately ignored. */
	private static void closeQuietly(java.io.Closeable resource) {
		if (resource == null) {
			return;
		}
		try {
			resource.close();
		} catch (IOException ignored) {
			// Nothing useful can be done if close fails.
		}
	}
}

方法2:HttpClient的两种访问网页方式

package com.learn.http.impl;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;

import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.StatusLine;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;

import com.learn.http.Http;
import com.learn.util.SingleMatch;

@SuppressWarnings("deprecation")
public class HttpClientImp1 implements Http {
	/** Charset assumed for the first download, before the page's own declaration has been read. */
	private static final String DEFAULT_ENCODING = "utf-8";

	/**
	 * Downloads a page through Apache HttpClient's {@code DefaultHttpClient}, which is deprecated
	 * (hence the class-level warning suppression).
	 * <p>
	 * The status line and all response headers are printed to standard output. A new client is
	 * created for each call and its connection manager is shut down afterwards.
	 *
	 * @param pageUrl  address of the page to download
	 * @param encoding name of the charset used to decode the response body
	 * @return the decoded body, or {@code null} when the request failed or the response has no entity
	 */
	public String getHtmlcodeWithoutHeader(String pageUrl, String encoding) {
		HttpClient client = new DefaultHttpClient();
		HttpGet get = new HttpGet(pageUrl);
		String s = null;
		InputStream in = null;
		try {
			HttpResponse response = client.execute(get);
			StatusLine status = response.getStatusLine();
			System.out.println("状态行:" + status);
			System.out.println("首部行:");
			for (Header h : response.getAllHeaders()) {
				System.out.println("名称:" + h.getName() + " 值:" + h.getValue());
			}
			HttpEntity entity = response.getEntity();
			if (entity != null) {
				in = entity.getContent();
				s = inputStream2String(in, encoding);
			}
		} catch (ClientProtocolException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if (in != null) {
				try {
					in.close();
				} catch (IOException ignored) {
					// Nothing useful can be done if close fails.
				}
			}
			// The client is private to this call, so its connections must be released here.
			client.getConnectionManager().shutdown();
		}
		return s;
	}

	/**
	 * Reads the whole stream as text, terminating every line with {@code "\n"}.
	 *
	 * @param in      stream to read; closed by the caller
	 * @param charset name of the charset used to decode the stream
	 * @return the text read so far; empty if the charset is unsupported or the first read fails
	 */
	private String inputStream2String(InputStream in, String charset) {
		StringBuilder sb = new StringBuilder();
		try {
			BufferedReader br = new BufferedReader(new InputStreamReader(in, charset));
			String line;
			while ((line = br.readLine()) != null) {
				sb.append(line).append('\n');
			}
		} catch (UnsupportedEncodingException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		}
		return sb.toString();
	}

	/**
	 * Downloads a page and decodes it with the charset the page itself declares.
	 * <p>
	 * The page is fetched as UTF-8 first. If a different charset is found in the markup, the page
	 * is fetched a second time with that charset.
	 *
	 * @param url address of the page to download
	 * @return the page source, or {@code null} if the download failed
	 */
	public String getHtml(String url) {
		String html = getHtmlcodeWithoutHeader(url, DEFAULT_ENCODING);
		if (html == null) {
			// The first download failed, so there is no markup to inspect.
			return null;
		}
		String encoding = SingleMatch.match(html, "charset=\"?(\\S*?)\"|charset='?(\\S*?)'");
		// Compare string contents, not references; a missing or identical charset needs no re-download.
		if (encoding == null || encoding.length() == 0 || encoding.equalsIgnoreCase(DEFAULT_ENCODING)) {
			return html;
		}
		return getHtmlcodeWithoutHeader(url, encoding);
	}
}
package com.learn.http.impl;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;

import javax.net.ssl.HttpsURLConnection;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.methods.GetMethod;

import com.learn.http.Http;
import com.learn.util.SingleMatch;

public class HttpClientImp2 implements Http {
	/** Charset assumed for the first download, before the page's own declaration has been read. */
	private static final String DEFAULT_ENCODING = "utf-8";

	/**
	 * Single client instance reused by every call. Requires the commons-httpclient library.
	 * <p>
	 * To route requests through a proxy, configure it once on this instance, e.g.
	 * {@code client.getHostConfiguration().setProxy(host, port)}, and set proxy credentials via
	 * {@code client.getState().setProxyCredentials(...)} if the proxy needs them.
	 */
	static HttpClient client = new HttpClient();

	/**
	 * Downloads a page with commons-httpclient's {@code GetMethod}.
	 *
	 * @param pageUrl  address of the page to download
	 * @param encoding name of the charset used to decode the response body
	 * @return the decoded body, or {@code null} when the request failed, the status is not 200,
	 *         or the response has no body
	 */
	public String getHtmlcodeWithoutHeader(String pageUrl, String encoding) {
		String response = null;
		GetMethod getMethod = new GetMethod(pageUrl);
		InputStream in = null;
		try {
			client.executeMethod(getMethod);
			// HTTP_OK is inherited from HttpURLConnection.
			if (getMethod.getStatusCode() == HttpsURLConnection.HTTP_OK) {
				in = getMethod.getResponseBodyAsStream();
				if (in != null) {
					response = inputStream2String(in, encoding);
				}
			} else {
				System.err.println("访问网络失败!");
			}
		} catch (HttpException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if (in != null) {
				try {
					in.close();
				} catch (IOException ignored) {
					// Nothing useful can be done if close fails.
				}
			}
			// Hand the connection back to the shared client's connection manager.
			getMethod.releaseConnection();
		}
		return response;
	}

	/**
	 * Reads the whole stream as text, terminating every line with {@code "\n"}.
	 *
	 * @param in      stream to read; closed by the caller
	 * @param charset name of the charset used to decode the stream
	 * @return the text read so far; empty if the charset is unsupported or the first read fails
	 */
	private String inputStream2String(InputStream in, String charset) {
		StringBuilder sb = new StringBuilder();
		try {
			BufferedReader br = new BufferedReader(new InputStreamReader(in, charset));
			String line;
			while ((line = br.readLine()) != null) {
				sb.append(line).append('\n');
			}
		} catch (UnsupportedEncodingException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		}
		return sb.toString();
	}

	/**
	 * Downloads a page and decodes it with the charset the page itself declares.
	 * <p>
	 * The page is fetched as UTF-8 first. If a different charset is found in the markup, the page
	 * is fetched a second time with that charset.
	 *
	 * @param url address of the page to download
	 * @return the page source, or {@code null} if the download failed
	 */
	public String getHtml(String url) {
		String html = getHtmlcodeWithoutHeader(url, DEFAULT_ENCODING);
		if (html == null) {
			// The first download failed, so there is no markup to inspect.
			return null;
		}
		String encoding = SingleMatch.match(html, "charset=\"?(\\S*?)\"|charset='?(\\S*?)'");
		// Compare string contents, not references; a missing or identical charset needs no re-download.
		if (encoding == null || encoding.length() == 0 || encoding.equalsIgnoreCase(DEFAULT_ENCODING)) {
			return html;
		}
		return getHtmlcodeWithoutHeader(url, encoding);
	}
}
  每个类的方法注释中都有详细的优缺点说明,希望读者能从中受益。 微笑
  下载源代码:java网页下载的四种不同实现方法



评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值