<%@ page contentType="text/html; charset=gb2312" language="java"
import="java.util.regex.*" errorPage=""%>
<%
// Downloads the page at http://www.baidu.com, joins its lines into one string
// (no separator between lines) and prints every href="..." match followed by
// every href='...' match to the response, one per line, with a running counter.
//
// Everything lives in its own block: all scriptlets of a JSP end up in the same
// generated method, so un-scoped locals would clash with any other scriptlet on
// this page that declares the same names.
{
    java.net.URL l_url = new java.net.URL("http://www.baidu.com");
    java.net.HttpURLConnection l_connection =
            (java.net.HttpURLConnection) l_url.openConnection();

    // StringBuilder instead of repeated String += inside the read loop.
    StringBuilder sTotalString = new StringBuilder();
    try {
        l_connection.connect();
        // NOTE: as before, the bytes are decoded with the platform default charset.
        java.io.BufferedReader l_reader = new java.io.BufferedReader(
                new java.io.InputStreamReader(l_connection.getInputStream()));
        try {
            String sCurrentLine;
            while ((sCurrentLine = l_reader.readLine()) != null) {
                sTotalString.append(sCurrentLine);
            }
        } finally {
            // Closing the reader also closes the underlying connection stream.
            l_reader.close();
        }
    } finally {
        l_connection.disconnect();
    }

    // First pattern: links written as href="...". Second: links written as href='...'.
    // The inner quotes of the first pattern must be backslash-escaped; the previous
    // literal used "/" in their place, which terminated the string early.
    String[] regExes = { "href=\"([^\"]*)\"", "href='([^']*)'" };

    int j = 0; // running number shared by both passes
    for (int i = 0; i < regExes.length; i++) {
        // CASE_INSENSITIVE so HREF= / Href= are found as well.
        Pattern p = Pattern.compile(regExes[i], Pattern.CASE_INSENSITIVE);
        Matcher m = p.matcher(sTotalString);
        while (m.find()) {
            j++;
            // The matched text comes from a remote page; escape it so it is shown
            // as text instead of being interpreted as markup by the browser.
            String l_match = m.group(0)
                    .replace("&", "&amp;")
                    .replace("<", "&lt;")
                    .replace(">", "&gt;");
            out.println("m.group(" + j + "): " + l_match + "<br>");
        }
    }
}
%>
import="java.util.regex.*" errorPage=""%>
<%
// Fetches http://www.baidu.com, concatenates the response lines (without any
// separator) and writes each double-quoted href match, then each single-quoted
// href match, to the response with a running counter.
//
// The locals are block-scoped and deliberately use names that no other scriptlet
// on this page declares: scriptlets are compiled into one method, and Java does
// not allow a nested block to redeclare a local of the enclosing method.
{
    java.net.HttpURLConnection conn = (java.net.HttpURLConnection)
            new java.net.URL("http://www.baidu.com").openConnection();

    StringBuilder html = new StringBuilder();
    try {
        conn.connect();
        // NOTE: decoding still relies on the platform default charset.
        java.io.BufferedReader in = new java.io.BufferedReader(
                new java.io.InputStreamReader(conn.getInputStream()));
        try {
            for (String line = in.readLine(); line != null; line = in.readLine()) {
                html.append(line);
            }
        } finally {
            in.close(); // also closes the connection's input stream
        }
    } finally {
        conn.disconnect();
    }

    // href="..." first, href='...' second. The double quotes inside the first
    // literal are escaped with backslashes; the earlier text had "/" there, which
    // cut the string literal short.
    String[] hrefPatterns = { "href=\"([^\"]*)\"", "href='([^']*)'" };

    int matchCount = 0; // numbering continues across both patterns
    for (int k = 0; k < hrefPatterns.length; k++) {
        // Case-insensitive so that upper- or mixed-case HREF is matched too.
        Matcher hrefMatcher = Pattern
                .compile(hrefPatterns[k], Pattern.CASE_INSENSITIVE)
                .matcher(html);
        while (hrefMatcher.find()) {
            matchCount++;
            // Remote content: neutralise markup characters before echoing it.
            String shown = hrefMatcher.group(0)
                    .replace("&", "&amp;")
                    .replace("<", "&lt;")
                    .replace(">", "&gt;");
            out.println("m.group(" + matchCount + "): " + shown + "<br>");
        }
    }
}
%>
本文通过Java实现了一个简单的网页抓取程序,该程序从指定URL(如百度首页)抓取页面内容,并利用正则表达式匹配出页面中所有的超链接。本文重点介绍了如何使用HttpURLConnection连接网页并读取内容,以及如何使用Pattern和Matcher类来解析和提取超链接。
3万+

被折叠的 条评论
为什么被折叠?



