/** *//** * * @author 晓峰2007.1.18 抓取雅虎知识堂的文章标题及内容(测试) 手动输入网址抓取,可进一步自动抓取整个知识堂的全部内容 * */ public class WebContent ...{ /** *//** * 读取一个网页全部内容 */ public String getOneHtml(String htmlurl) throws IOException...{ URL url; String temp; StringBuffer sb = new StringBuffer(); try ...{ url = new URL(htmlurl); BufferedReader in = new BufferedReader(new InputStreamReader(url .openStream(), "utf-8"));// 读取网页全部内容 while ((temp = in.readLine()) != null) ...{ sb.append(temp); } in.close(); }catch(MalformedURLException me)...{ System.out.println("你输入的URL格式有问题!请仔细输入"); me.getMessage(); throw me; }catch (IOException e) ...{ e.printStackTrace(); throw e; } return sb.toString(); }
/** *//** * * @param s * @return 获得网页标题 */ public String getTitle(String s) ...{ String regex; String title = ""; List<String> list = new ArrayList<String>(); regex = "<title>.*?</title>"; Pattern pa = Pattern.compile(regex, Pattern.CANON_EQ); Matcher ma = pa.matcher(s); while (ma.find()) ...{ list.add(ma.group()); } for (int i = 0; i < list.size(); i++) ...{ title = title + list.get(i); } return outTag(title); }
/** *//** * * @param s * @return 获得链接 */ public List<String> getLink(String s) ...{ String regex; List<String> list = new ArrayList<String>(); regex = "<a[^>]*href=("([^"]*)"|'([^']*)'|([^\s>]*))[^>]*>(.*?)</a>"; Pattern pa = Pattern.compile(regex, Pattern.DOTALL); Matcher ma = pa.matcher(s); while (ma.find()) ...{ list.add(ma.group()); } return list; }
/** *//** * * @param s * @return 获得脚本代码 */ public List<String> getScript(String s) ...{ String regex; List<String> list = new ArrayList<String>(); regex = "<script.*?</script>"; Pattern pa = Pattern.compile(regex, Pattern.DOTALL); Matcher ma = pa.matcher(s); while (ma.find()) ...{ list.add(ma.group()); } return list; }
/** *//** * * @param s * @return 获得CSS */ public List<String> getCSS(String s) ...{ String regex; List<String> list = new ArrayList<String>(); regex = "<style.*?</style>"; Pattern pa = Pattern.compile(regex, Pattern.DOTALL); Matcher ma = pa.matcher(s); while (ma.find()) ...{ list.add(ma.group()); } return list; }
/** *//** * * @param s * @return 去掉标记 */ public String outTag(String s) ...{ return s.replaceAll("<.*?>", ""); }
/** *//** * * @param s * @return 获取雅虎知识堂文章标题及内容 */ public HashMap<String, String> getFromYahoo(String s) ...{ HashMap<String, String> hm = new HashMap<String, String>(); StringBuffer sb = new StringBuffer(); String html=""; System.out.println(" ------------------开始读取网页(" + s + ")--------------------"); try...{ html= getOneHtml(s); }catch(Exception e)...{ e.getMessage(); } // System.out.println(html); System.out.println("------------------读取网页(" + s + ")结束-------------------- "); System.out.println("------------------分析(" + s + ")结果如下-------------------- "); String title = outTag(getTitle(html)); title = title.replaceAll("_雅虎知识堂", ""); // Pattern pa=Pattern.compile("<div // class="original">(.*?)(( )*)(.*?)(( )*)(.*?)</div>",Pattern.DOTALL); Pattern pa = Pattern.compile("<div class="original">(.*?)</p></div>", Pattern.DOTALL); Matcher ma = pa.matcher(html); while (ma.find()) ...{ sb.append(ma.group()); } String temp = sb.toString(); temp = temp.replaceAll("(<br>)+?", " ");// 转化换行 temp = temp.replaceAll("<p><em>.*?</em></p>", "");// 去图片注释 hm.put("title", title); hm.put("original", outTag(temp)); return hm;