import java.io.*;
import java.net.*;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Minimal link scraper: downloads one page and writes every URL-looking
 * substring found in it to a text file, one match per line.
 *
 * <p>Usage: {@code java Demo [pageUrl [outputFile]]}. Any argument that is
 * omitted falls back to the built-in default below.
 */
public class Demo {
    /** Page fetched when no URL is supplied on the command line. */
    private static final String DEFAULT_PAGE_URL =
            "https://blog.youkuaiyun.com/Xiaolu_Yiren/article/details/74202634";

    /** File written when no output path is supplied on the command line. */
    private static final String DEFAULT_OUTPUT_FILE = "E:/学习文件/SiteURL.txt";

    /**
     * Matches a link: a prefix of {@code http://}, {@code ftp://},
     * {@code https://} or a literal {@code www.}, then one or more
     * dot-terminated runs of characters that are neither whitespace nor in the
     * range U+4E00..U+9FA5, then a final run that additionally excludes
     * {@code <}, {@code >}, {@code }} and {@code ]}.
     *
     * <p>The dot in {@code www.} is escaped so that only a real dot is
     * accepted after {@code www}.
     */
    private static final Pattern LINK_PATTERN = Pattern.compile(
            "(http://|ftp://|https://|www\\.)([^\u4e00-\u9fa5\\s]+\\.)+[^\u4e00-\u9fa5\\s<>\\}\\]]+");

    /**
     * Fetches the page and writes every link match to the output file.
     *
     * <p>I/O failures (bad URL, unreachable host, unwritable file) are reported
     * with a stack trace on standard error; they are not rethrown.
     *
     * @param args optional: {@code args[0]} is the URL to fetch and
     *             {@code args[1]} is the output file path; may be empty or
     *             {@code null}, in which case the defaults are used
     */
    public static void main(String[] args) {
        int argCount = (args == null) ? 0 : args.length;
        String pageUrl = argCount > 0 ? args[0] : DEFAULT_PAGE_URL;
        String outputFile = argCount > 1 ? args[1] : DEFAULT_OUTPUT_FILE;

        try {
            // openConnection() only builds the connection object; the actual
            // request is made by getInputStream() below.
            URLConnection connection = new URL(pageUrl).openConnection();

            // try-with-resources closes both streams in reverse order and copes
            // with either of them failing to open, so no null checks are needed.
            // NOTE(review): the page is decoded as UTF-8 regardless of the
            // charset the server declares - confirm this suits the target site.
            try (PrintWriter out = new PrintWriter(
                         new OutputStreamWriter(
                                 new FileOutputStream(outputFile), StandardCharsets.UTF_8),
                         true);
                 BufferedReader in = new BufferedReader(
                         new InputStreamReader(
                                 connection.getInputStream(), StandardCharsets.UTF_8))) {
                String line;
                while ((line = in.readLine()) != null) {
                    Matcher matcher = LINK_PATTERN.matcher(line);
                    while (matcher.find()) {
                        out.println(matcher.group());
                    }
                }
                // PrintWriter never throws on write errors; surface them here so
                // the success message is not printed for a truncated file.
                if (out.checkError()) {
                    throw new IOException("Failed writing links to " + outputFile);
                }
            }
            System.out.println("爬取完成^_^");
        } catch (IOException e) {
            // Also covers MalformedURLException, which is an IOException.
            e.printStackTrace();
        }
    }
}