String strUrl="http://www.cctv.com/tvguide/11/01/20061010/1.shtml";

// Download the raw bytes of the page (presumably a TV-guide listing, judging by the URL).
byte[] pageHtml = HttpUtil.getPage(strUrl);
// Decode the page bytes into a string using the GB2312 charset.
String strHtml = new String(pageHtml, "GB2312");
// Group 1: three colon-separated two-digit fields (presumably the broadcast time).
// Group 2: the text inside the following <font > element.
// Regex backslashes must be doubled inside a Java string literal; the previous
// "//d" and "//r//n" forms matched literal slashes and could never match the page.
String[][] ls = StringUtil.splitByReg(strHtml,"(\\d{2}:\\d{2}:\\d{2})</font>.*<font >(.+)</font>.*</tr>\\r\\n<tr>");
// splitByReg may return null when nothing matches, so guard before iterating.
if (ls != null)
{
    for (int i = 0; i < ls.length; i++)
    {
        // One output line per match: "<group 1>##<group 2>".
        System.out.println(ls[i][0]+"##"+ls[i][1]);
    }
}

/**
 * General-purpose regular-expression extraction helper.
 *
 * <p>Scans {@code str} for every match of {@code regExp} and, for each match,
 * collects the text of capturing groups 1 through {@code groupCount()}.
 * Group 0 (the whole match) is not included. A group that did not participate
 * in a match yields a {@code null} element.
 *
 * @param str the string to parse
 * @param regExp the regular expression to match
 * @return one row per match and one column per capturing group; an empty
 *         array (never {@code null}) when nothing matches
 */
public static String[][] splitByReg(String str,String regExp)
{
    Pattern pattern = Pattern.compile(regExp);
    Matcher matcher = pattern.matcher(str);
    // The number of capturing groups is a property of the pattern, so read it once.
    int groupCount = matcher.groupCount();
    Vector<String[]> rows = new Vector<String[]>();

    while (matcher.find())
    {
        String[] row = new String[groupCount];
        for (int g = 1; g <= groupCount; g++)
        {
            row[g - 1] = matcher.group(g);
        }
        rows.add(row);
    }
    // Yields a zero-length array when there were no matches, so callers can
    // iterate the result without a null check.
    return rows.toArray(new String[0][]);
}

// Download the raw bytes of the page addressed by strUrl.
byte[] pageHtml = HttpUtil.getPage(strUrl);
// Decode the page bytes into a string using the GB2312 charset.
String strHtml = new String(pageHtml, "GB2312");
// Group 1: three colon-separated two-digit fields (presumably the broadcast time).
// Group 2: the text inside the following <font > element.
// Regex backslashes must be doubled inside a Java string literal; the previous
// "//d" and "//r//n" forms matched literal slashes and could never match the page.
String[][] ls = StringUtil.splitByReg(strHtml,"(\\d{2}:\\d{2}:\\d{2})</font>.*<font >(.+)</font>.*</tr>\\r\\n<tr>");
// splitByReg may return null when nothing matches, so guard before iterating.
if (ls != null)
{
    for (int i = 0; i < ls.length; i++)
    {
        // One output line per match: "<group 1>##<group 2>".
        System.out.println(ls[i][0]+"##"+ls[i][1]);
    }
}

/**
 * General-purpose regular-expression extraction helper.
 *
 * <p>Scans {@code str} for every match of {@code regExp} and, for each match,
 * collects the text of capturing groups 1 through {@code groupCount()}.
 * Group 0 (the whole match) is not included. A group that did not participate
 * in a match yields a {@code null} element.
 *
 * @param str the string to parse
 * @param regExp the regular expression to match
 * @return one row per match and one column per capturing group; an empty
 *         array (never {@code null}) when nothing matches
 */
public static String[][] splitByReg(String str,String regExp)
{
    Pattern pattern = Pattern.compile(regExp);
    Matcher matcher = pattern.matcher(str);
    // The number of capturing groups is a property of the pattern, so read it once.
    int groupCount = matcher.groupCount();
    Vector<String[]> rows = new Vector<String[]>();

    while (matcher.find())
    {
        String[] row = new String[groupCount];
        for (int g = 1; g <= groupCount; g++)
        {
            row[g - 1] = matcher.group(g);
        }
        rows.add(row);
    }
    // Yields a zero-length array when there were no matches, so callers can
    // iterate the result without a null check.
    return rows.toArray(new String[0][]);
}
本文介绍了一种利用正则表达式从特定网址抓取并解析网页内容的方法。通过HTTP请求获取指定URL的内容后,采用GB2312编码方式将其转换为字符串形式,再运用自定义的splitByReg函数进行解析。该函数能够根据提供的正则表达式从字符串中抽取所需的数据。
362

被折叠的 条评论
为什么被折叠?



