[quote][size=small] 最近一直在做网页抓取工作,使用了自己写的HTML解析器。但是有个问题,就是在得到网页的html代码的时候老报内存溢出,结果程序总是停下来。以下是报的异常代码:[/size][/quote]
[quote]以下是得到网页html代码的类:WebPage[/quote]
[quote]以下是使用这个WebPage类的方法,就是用这个方法,老报异常。[/quote]
Exception in thread "main" java.lang.OutOfMemoryError: Java heap space
at java.util.Arrays.copyOf(Arrays.java:2882)
at java.lang.AbstractStringBuilder.expandCapacity(AbstractStringBuilder.java:100)
at java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:390)
at java.lang.StringBuffer.append(StringBuffer.java:224)
at slcx.com.WebPage.<init>(WebPage.java:67)
at slcx.stock.gegu.baidu.ExtBdGgHisRecInfo.执行百度个股历史抓取(ExtBdGgHisRecInfo.java:199)
at slcx.stock.gegu.baidu.ExtBdGgHisRecInfo.main(ExtBdGgHisRecInfo.java:272)
[quote]以下是得到网页html代码的类:WebPage[/quote]
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
public class WebPage {
    // Hard cap on the accumulated page size, in chars. The original code grew a
    // StringBuffer without any bound, which is exactly where the reported
    // java.lang.OutOfMemoryError was thrown (StringBuffer.append in the stack
    // trace); pages larger than this are truncated instead of crashing the JVM.
    private static final int MAX_PAGE_LENGTH = 2 * 1024 * 1024;

    private int connectOutTime = 5000;   // connect timeout in milliseconds
    private int readOutTime = 10000;     // read timeout in milliseconds
    private String encoding = "gb2312";  // charset used to decode the response body
    private String webPageCode = "";     // downloaded page source
    private int searchEnd = 0;           // end-of-search index (length of the downloaded page)
    private int beginIndex = 0;          // begin-of-search index

    /**
     * Downloads the page at {@code link} and stores its source so it can be
     * read back via {@link #getWebPageCode()}.
     *
     * @param link absolute URL of the page to fetch
     * @throws Exception if the URL is malformed or connecting/reading fails
     */
    public WebPage(String link) throws Exception {
        URL url = new URL(link);
        HttpURLConnection httpconn = (HttpURLConnection) url.openConnection();
        httpconn.setConnectTimeout(connectOutTime);
        httpconn.setReadTimeout(readOutTime);
        BufferedReader br = null;
        try {
            br = new BufferedReader(new InputStreamReader(httpconn.getInputStream(), encoding));
            // Local, single-threaded buffer: StringBuilder, not the synchronized StringBuffer.
            StringBuilder sb = new StringBuilder(8192);
            String line;
            while ((line = br.readLine()) != null) {
                // Stop before the buffer can grow past the cap instead of OOM-ing.
                if (sb.length() + line.length() + 2 > MAX_PAGE_LENGTH) {
                    break;
                }
                // Two appends avoid the temporary String created by (line + "\r\n").
                sb.append(line).append("\r\n");
            }
            webPageCode = sb.toString();
            this.searchEnd = sb.length();
        } finally {
            // The original never closed the reader nor disconnected the
            // connection — a per-call leak in a crawler that fetches thousands
            // of pages, compounding the memory pressure.
            if (br != null) {
                try {
                    br.close();
                } catch (Exception ignored) {
                    // best effort — nothing useful to do if close fails
                }
            }
            httpconn.disconnect();
        }
    }

    /** @return the downloaded page source, or "" if nothing was read. */
    public String getWebPageCode() {
        return webPageCode;
    }
}
[quote]以下是使用这个WebPage类的方法,就是用这个方法,老报异常。[/quote]
/**
 * Crawls Baidu search results for every stock recommender: for each
 * recommender, fetches their Baidu result links, downloads each hit page,
 * extracts the recommendation time and stock code, and stores valid
 * recommendations in the database. Invalid/duplicate links are logged and
 * skipped; each processed Baidu URL is marked as done afterwards.
 */
public void 执行百度个股历史抓取() {
    DBConnection dbc = new DBConnection();
    人员基本信息DAO ryjDao = new 人员基本信息DAO(dbc);
    HashMap<String, Integer> splNameIdMap = ryjDao.得到荐股者名称和编号();
    StockDAO stcDao = new StockDAO(dbc);
    HashMap<String, String> stcNameCodeMap = stcDao.得到股票名称代码();
    生成百度链接DAO scbDao = new 生成百度链接DAO(dbc);
    百度股票信息采集DAO bdgDao = new 百度股票信息采集DAO(dbc);
    ExtBdGgLnk bdg = new ExtBdGgLnk();
    String datetimer = Date.get("yyyy-MM-dd HH:mm:ss");
    // Around 2100 recommenders in the map.
    for (String name : splNameIdMap.keySet()) {
        int id = splNameIdMap.get(name);
        ArrayList<String> baiduUrlList = scbDao.得到指定推荐者百度URL(id, name, 0);
        if (baiduUrlList == null || baiduUrlList.isEmpty()) {
            log.info("得到 " + name + " 的百度搜索结果链接为空");
            continue;
        }
        for (String baiduUrl : baiduUrlList) {
            log.info("现在抓取 " + name + " 的百度链接:" + baiduUrl);
            ArrayList<String> hitList = bdg.getAllSearchLink(baiduUrl);
            if (hitList == null || hitList.isEmpty()) {
                log.info("该百度链接没有搜索结果:" + baiduUrl);
                continue;
            }
            for (String pageLink : hitList) {
                log.info("现在开始抓:" + pageLink + "的荐股信息信息");
                int insertFlag = bdgDao.插入链接(name, pageLink, datetimer);
                // Flag 2 means the link already exists — skip it.
                if (insertFlag == 2) {
                    continue;
                }
                if (!isValidateLink(pageLink)) {
                    log.info("无效链接:" + pageLink);
                    continue;
                }
                // Best-effort fetch: a failed download logs the error and
                // falls through to the null check below.
                String pageHtml = null;
                try {
                    pageHtml = new WebPage(pageLink).getWebPageCode();
                } catch (Exception e) {
                    e.printStackTrace();
                }
                if (pageHtml == null || pageHtml.equals("")) {
                    log.info("该链接没有得到网页代码:" + pageLink);
                    continue;
                }
                // Ignore anything recommended before 2008.
                String datetime = 得到推荐股票时间(pageHtml);
                if (datetime == null || datetime.equals("")
                        || datetime.compareTo("2008-01-01 00:00:00") < 0) {
                    log.info("该链接的时间无效:" + pageLink);
                    continue;
                }
                String stockcode = 得到推荐股票代码(pageHtml, name, splNameIdMap, stcNameCodeMap);
                if (stockcode == null || stockcode.equals("")) {
                    log.info("该链接得到股票代码无效:" + pageLink);
                    continue;
                }
                if (insertFlag > 0) {
                    int row = bdgDao.更新一条荐股信息(stockcode, name, datetime, pageLink, 0);
                    if (row > 0) {
                        log.info("成功抓取一条荐股:" + name + "\t" + datetime + "\t" + stockcode + "\t" + pageLink);
                    } else {
                        log.info("荐股信息插入失败:" + pageLink);
                    }
                }
            }
            // Mark this Baidu URL as processed.
            scbDao.修改运行过的百度URL(baiduUrl, 1);
        }
    }
    dbc.close();
}