import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.Reader;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class LinkParseManager extends AbstractPageManageStategy {
private final Pattern TEXT_PATTERN;
private final Pattern URL_PATTERN;
//private List<URLFormat> urlQueue;
List<String> urlHistory =null;
public LinkParseManager(Reader reader,
List<String> urlHistory) {
this.urlHistory=urlHistory;
this.links = new LinkedList<String>();
//this.urlQueue = urlQueue;
this.reader = reader;
this.TEXT_PATTERN = Pattern.compile(
"<a//b([^>]*)>(?:<[^>]+>)*(.*?)(?:<[^>]+>)*</a>",
Pattern.CASE_INSENSITIVE);
this.URL_PATTERN = Pattern.compile(
"//bHREF//s*=//s*(/"([^/"]*)/"|'([^']*)'|([^'/">//s]+))",
Pattern.CASE_INSENSITIVE);
}
public List<String> parseLinks() {
//urlHistory.add(downloadingURL.getUrlStr());
if(reader==null){return null;}
try {
String lineWords;
BufferedReader br = new BufferedReader(reader);
while ((lineWords = br.readLine()) != null) {
this.parseLink(lineWords);
}
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
return links;
}
public void parseLink(String lineWords) {
String sss="163";
CharSequence cs =sss.subSequence(0, sss.length()-1);
String linkURL;
Matcher m = TEXT_PATTERN.matcher(lineWords);
if (m.find() && m.groupCount() > 1) {
String url = m.group(1);
Matcher m2 = URL_PATTERN.matcher(url);
if (m2.find()) {//&& m2.group(1).contains("http")&& m2.group(1).contains("163.com")
linkURL = m2.group(1).replaceAll("/"", "").replaceAll("'", "");
linkURL=linkURL.trim();
if(linkURL.startsWith("http://")&& linkURL.contains(cs)){
int i=0;
for(i=0;i<urlHistory.size();i++){
String st=urlHistory.get(i);
if(st.equals(linkURL)){
break;
}
}
if(i>=urlHistory.size()){
urlHistory.add(linkURL);
links.add(linkURL);
System.out.println(linkURL);
}
}
}
}
}
}
本文介绍了一个链接解析器的设计与实现,该解析器能够从文本中提取符合特定条件的URL,并将其加入到历史记录中以备后续处理。解析器使用正则表达式匹配HTML中的链接,并检查这些链接是否指向特定网站。
1675

被折叠的评论
为什么被折叠?



