抓取网页内容

package com.it.Day07;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.it.Day05.*;
/**
 * Fetches a web page, strips most of its HTML markup, writes the cleaned text
 * to a file and prints it to standard output.
 */
public class Hpu {
	/**
	 * Matches an opening or closing HTML tag, except tags whose name starts with
	 * {@code img} (any case) and the {@code p} tag (any case).
	 */
	private static final Pattern TAG_PATTERN =
			Pattern.compile("</?(?![iI][mM][gG]|[pP]\\b)([a-zA-Z]+)[^>]*?>");

	/** Matches a line-break tag such as {@code <br>}, {@code <BR>} or {@code <br/>}. */
	private static final Pattern BR_PATTERN = Pattern.compile("(?i)<br\\s*/?>");

	/**
	 * Downloads the document at {@code url}, cleans every line and stores the
	 * cleaned lines in the file supplied by {@code new Txt().get()}.
	 *
	 * <p>Each cleaned line is written to the file followed by {@code "\r\n"} and is
	 * also appended (without a separator) to the returned string.
	 *
	 * @param url address of the page to download; the response is decoded as UTF-8
	 * @return the concatenated cleaned lines, or the literal {@code "hello world"}
	 *         when an {@link IOException} occurs (the stack trace is printed)
	 */
	public static String GetResult(String url)
	{
		StringBuilder result = new StringBuilder();
		try
		{
			URL realURL = new URL(url);
			URLConnection connection = realURL.openConnection();
			connection.connect();
			// try-with-resources closes the reader on every path, including the
			// successful return and failures that happen before any line is read.
			try (BufferedReader in = new BufferedReader(
					new InputStreamReader(connection.getInputStream(), "UTF-8")))
			{
				// A simple helper that supplies the output file.
				// NOTE(review): Txt is presumably provided by com.it.Day05 - confirm.
				File file = new Txt().get();
				// NOTE(review): FileWriter(File) encodes with the platform default
				// charset, not necessarily UTF-8 - confirm this is intended.
				try (FileWriter writer = new FileWriter(file))
				{
					String line;
					while ((line = in.readLine()) != null)
					{
						line = cleanLine(line);
						writer.write(line + "\r\n");
						result.append(line);
					}
				}
			}
			return result.toString();
		}
		catch (IOException e)
		{
			e.printStackTrace();
			// Sentinel kept for backward compatibility with existing callers.
			return "hello world";
		}
	}

	/**
	 * Cleans a single line of HTML: converts line-break tags to {@code "\r\n"},
	 * removes the remaining tags matched by {@link #TAG_PATTERN} and deletes
	 * every space character.
	 *
	 * @param line raw line read from the page
	 * @return the cleaned line
	 */
	private static String cleanLine(String line)
	{
		// Line breaks must be converted BEFORE the generic tag removal; otherwise
		// the tag pattern deletes <br> first and the conversion never happens.
		String cleaned = BR_PATTERN.matcher(line).replaceAll("\r\n");
		cleaned = TAG_PATTERN.matcher(cleaned).replaceAll("");
		// NOTE(review): this removes every space character; if the intent was to
		// drop "&nbsp;" entities instead, adjust the literal - confirm.
		return cleaned.replace(" ", "");
	}

	/**
	 * Returns the first capturing group of the first match of {@code patternStr}
	 * in {@code target}.
	 *
	 * @param target     text to search
	 * @param patternStr regular expression; it must declare at least one capturing group
	 * @return the text captured by group 1 of the first match, or the literal
	 *         {@code "Hello World"} when the pattern does not match
	 * @throws IndexOutOfBoundsException if the pattern matches but declares no capturing group
	 */
	public static String RegexString(String target,String patternStr)
	{
		Pattern pattern = Pattern.compile(patternStr);
		Matcher matcher = pattern.matcher(target);
		if(matcher.find())
		{
			return matcher.group(1);
		}
		return "Hello World";
	}

	/**
	 * Downloads a fixed problem page, strips tags and spaces from the result once
	 * more and prints it.
	 *
	 * @param args command-line arguments (unused)
	 */
	public static void main(String args[])
	{
		String url = "http://acm.hpu.edu.cn/problem.php?cid=1000&pid=0";
		String result = GetResult(url);
		// Lines are cleaned one at a time in GetResult; this second pass runs on the
		// concatenated text, so it also removes tags that spanned several lines.
		result = TAG_PATTERN.matcher(result).replaceAll("");
		result = result.replace(" ","");
		System.out.println(result);
	}

}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值