用JAVA实现新闻搜索

最新推荐文章于 2025-06-16 11:29:34 发布

hailang99

最新推荐文章于 2025-06-16 11:29:34 发布

阅读量1.4k

点赞数

CC 4.0 BY-SA版权

文章标签： java 数据库 string exception null sqlserver

本文链接：https://blog.youkuaiyun.com/hailang99/article/details/1792322

我想做一个自动搜索新闻的程序：
每天定时到网上搜索，并将搜索到的新闻保存到数据库中。
想请大家提供一下思路，
怎样做比较好？

这是我写的一个获取体坛周报上的新闻的类，楼主可以参考一下，至于让它定时查询，你可以把它加到.bat文件里，用windows的计划任务执行，或者如 dabo1980(Java小菜鸟) 所说，用timer，不过我没试过。

* @author Wanghl

* Date: 2004-11-18

About: Internet News

Function: Fetches the news listed at http://www.titansports.cn/world_football.php

and processes it to extract each item's subject, link URL, and news details.

数据库news表脚本:

-- Setup script for the [dbo].[news] table that the Java class below reads from
-- and writes to. Running it discards any existing table together with its rows.

-- Drop the table first when a user table with this name already exists.
if exists (select * from dbo.sysobjects where id = object_id(N'[dbo].[news]') and OBJECTPROPERTY(id, N'IsUserTable') = 1)

drop table [dbo].[news]

-- One row per news item; every column is nullable and no key is declared.
CREATE TABLE [dbo].[news] (

-- headline text
[subject] [text] COLLATE Chinese_PRC_CI_AS NULL ,

-- article URL, at most 100 characters; the Java code compares this column to detect already-stored items
[link] [varchar] (100) COLLATE Chinese_PRC_CI_AS NULL ,

-- article body, at most 3960 characters
[content] [varchar] (3960) COLLATE Chinese_PRC_CI_AS NULL ,

-- the Java query orders by this column to pick the most recent rows
[newsdate] [datetime] NULL ,

-- NOTE(review): purpose of this column is not shown in the visible code - confirm
[bak] [text] COLLATE Chinese_PRC_CI_AS NULL

) ON [PRIMARY] TEXTIMAGE_ON [PRIMARY]

import java.net.*;

import java.io.*;

import java.sql.*;

import javax.sql.*;

import java.util.*;

public class news{

/*获取新闻并保存相应内容到c:/news/newsYYYYMMDDa(p)m.htm中*/

/**
 * Downloads the news index page, saves it under c:/news/ and passes the saved
 * file's name to readNews for parsing and storage.
 *
 * The file name is "news" + year + month + day + "am"/"pm" + ".htm", built from
 * the current local time. Line breaks of the downloaded page are not kept,
 * because the page is read line by line and the lines are joined directly.
 *
 * If the URL cannot be opened, a message is printed and the method returns
 * without writing a file or calling readNews. If reading fails part-way, the
 * text read so far is still saved and parsed.
 *
 * @param args not used
 * @throws IOException if the snapshot file cannot be written or a stream
 *                     cannot be closed
 */
public static void main(String[] args) throws IOException
{
    String addr="http://www.titansports.cn/world_football.php";

    // Build the snapshot file name from today's date.
    // NOTE(review): month and day are not zero-padded, so e.g. Jan 11 and Nov 1
    // produce the same name - confirm whether that matters to anyone reading c:/news/.
    Calendar c=Calendar.getInstance();
    String year=c.get(Calendar.YEAR)+"";
    String month=(c.get(Calendar.MONTH)+1)+""; // Calendar.MONTH is zero-based
    String day=c.get(Calendar.DAY_OF_MONTH)+"";
    // HOUR_OF_DAY runs 0-23, so the afternoon starts at 12 (the old ">12" test
    // labelled 12:00-12:59 as "am").
    String apm=(c.get(Calendar.HOUR_OF_DAY)>=12) ? "pm" : "am";
    String filename="news"+year+month+day+apm+".htm";

    // Open the page. Without a stream there is nothing to save or parse, so stop
    // here instead of failing later with a NullPointerException.
    InputStream in;
    try{
        in=new URL(addr).openStream();
    }
    catch(Exception e){
        System.out.println("An exception occured when try to connect the specified URL "+e.getMessage());
        return;
    }

    // Read the whole page into memory; a buffer avoids re-copying the text on every line.
    StringBuffer page=new StringBuffer();
    BufferedReader buffer=new BufferedReader(new InputStreamReader(in));
    try{
        String line;
        while ((line=buffer.readLine())!=null)
        {
            page.append(line);
        }
    }catch(Exception e){
        // Best effort: keep whatever was read before the failure.
        System.out.println("IO Exception occured when read the stream"+e.getMessage());
    }finally{
        buffer.close();
    }

    // Overwrite (not append to) any snapshot that already has this name.
    PrintWriter writer= new PrintWriter(new BufferedWriter(new FileWriter("c:/news/"+filename,false)));
    try{
        writer.write(page.toString());
    }finally{
        writer.close();
    }
    System.out.println("****获取首页到c:/news/"+filename+"完毕****");

    readNews(filename);
    System.out.println("****首页--主题,链接,详细--入库等所有操作完毕!****");
}

/*分析所获取的新闻,读出其中的意甲部分,在此函数中分别调用newsDetails来获取详细和storenews存储所获数据*/

public static void readNews(String filen){

String newspieces[]=new String [200]; //将首页上所列出新闻以主题及其链接为单位保存在数组

String newslinks[]=new String[200]; //首页上所列出各新闻主题的相关链接

String newssubject[]=new String[200]; //首页上所列出各新闻主题

String newscontent[]=new String[200]; //首页上所列出各新闻主题的详细内容

String italynews=new String(); //只读取意甲新闻

String file=new String(); //获取输入流的整串字符

String temp=new String(); //临时字符串变量

URL tempurl=null;

boolean flag=false; //标志,数据库是否已包含该记录

int starts,ends,i; //记录意甲新闻开始和结束位置

Connection conn=null; //定义数据库连接

Statement stmt=null; //定义数据库操作表达式

ResultSet rtst=null; //定义结果记录集

String Qsql="select top 25 * from news order by newsdate desc";

try{

Class.forName("com.microsoft.jdbc.sqlserver.SQLServerDriver");//实例化数据库,获取数据库连接

conn=DriverManager.getConnection("jdbc:microsoft:sqlserver://218.206.73.9:1433;DatabaseName=pubs;useUnicode=true&characterEncoding=gb2312","sa","888600");

stmt=conn.createStatement(ResultSet.TYPE_SCROLL_INSENSITIVE,ResultSet.CONCUR_UPDATABLE);

rtst=stmt.executeQuery(Qsql);

}catch(Exception e){

System.out.println("Exception: "+e.getMessage());

}

try

{

BufferedReader in=new BufferedReader(new FileReader("c:/news/"+filen));

while((temp=in.readLine())!=null)

{

file+=temp;

}

in.close();

}

catch(Exception e)

{

System.out.println(e);

e.printStackTrace();

}

if (file!=null && !"".equals(file))

{

starts=file.indexOf("  ");

ends=file.indexOf("");

System.out.println("****begin details news****");

italynews=file.substring(starts+12,ends); //以上述两行为标志,摘取意甲新闻部分

newspieces=italynews.split("<a href=/data/news/new_news");

//newspieces数组元素对应的内容除第一个为<talbe cellspaceing>....外,其余均为<a href .......>***</a>

for (i=0;i<200;i++ )

{

newslinks[i]="http://www.titansports.cn/data/news/new_news";

//每个新闻主题所对应的详细的前半部分路径

}

for (i=1;i<newspieces.length;i++)

//for (i=1;i<3;i++)

{

try{

newspieces[i].trim(); //以此为基础,在其中摘出新闻主题和相关主题链接

if(newspieces[i].indexOf(".html")>0){

newslinks[i]=newslinks[i]+newspieces[i].substring(0,newspieces[i].indexOf(".html")+5);

//获取主题新闻所对应的链接页地址并保存在newslinks数组中

rtst.beforeFirst();

while (rtst.next())

{

if (rtst.getString("link").equals(newslinks[i]))

{

newslinks[i]="exists!"; //判断此链接是否在数据库中已存在

break;

}

} //返回到记录集开头位置

if ((newslinks[i]=="exists!") || ("exists!".equals(newslinks[i])))

{

newssubject[i]="exists!"; //如果此链接已存在的话,将不作任何操作

newscontent[i]="exists!"; //获取单项主题的相关新闻详细内容

}

else

{

starts=newspieces[i].indexOf("> ");

ends=newspieces[i].indexOf("</a>");

newssubject[i]=newspieces[i].substring(starts+7,ends);

newscontent[i]=newsDetails(newslinks[i]); //获取单项主题的相关新闻详细内容

//存储新闻主题,URL,新闻Details到数据库

}

//System.out.println(newslinks[i]);

}

}catch (Exception e){

System.out.println("Exception"+e.getMessage());

}

try{

System.out.println("****获取新闻主题,URL,详细新闻完毕!***");

storenews(newssubject,newscontent,newslinks,i);

}catch (Exception e){

System.out.println("Exception"+e.getMessage());

}

/*获得主题新闻对应的详细新闻*/

/**
 * Downloads one article page and reduces it to its text body.
 *
 * The body is the text between the first four-space marker and the
 * "<br><br><b>网络编辑:</b>" marker. Inside it, the first
 * "<table ... </table>" span (plus the two characters in front of it) is
 * removed when present, each four-space run is replaced by one space, each
 * "<br><br>" followed by two spaces is replaced by one space, the first two
 * characters are dropped and the result is trimmed.
 *
 * @param linkaddr absolute URL of the article page
 * @return the extracted article text, trimmed; may be empty
 * @throws IllegalStateException if the page cannot be opened or the body
 *         markers are not found (previously these cases surfaced as
 *         NullPointerException / StringIndexOutOfBoundsException)
 */
public static String newsDetails(String linkaddr){
    System.out.println("获取"+linkaddr+"新闻!");

    // Open the page; without a stream there is nothing to extract.
    InputStream in2;
    try{
        in2=new URL(linkaddr).openStream();
    }catch(Exception e){
        System.out.println("Exception :"+e.getMessage());
        IllegalStateException failure=new IllegalStateException("Cannot open "+linkaddr+": "+e.getMessage());
        failure.initCause(e);
        throw failure;
    }

    // Read the whole page as one string; readLine() drops the line breaks.
    StringBuffer page=new StringBuffer();
    BufferedReader buffer3=new BufferedReader(new InputStreamReader(in2));
    try{
        String line;
        while ((line=buffer3.readLine())!=null)
        {
            page.append(line);
        }
    }catch (Exception e){
        // Best effort: continue with whatever was read before the failure.
        System.out.println(e.getMessage());
    }finally{
        try{
            buffer3.close();
        }catch(IOException ignored){
            // the text is already in memory; a failed close changes nothing
        }
    }
    String news=page.toString();

    // Cut out the article body.
    // NOTE(review): the space-only literals below were presumably HTML entities
    // in the original program and lost when the code was published - confirm
    // against a real article page.
    int bodyStart=news.indexOf("    ");
    int bodyEnd=news.indexOf("<br><br><b>网络编辑:</b>");
    if (bodyStart<0 || bodyEnd<bodyStart)
    {
        throw new IllegalStateException("Article body markers not found in "+linkaddr);
    }
    String tempstr=news.substring(bodyStart,bodyEnd);

    // Remove the embedded table, if there is a complete one; pages without a
    // table are kept as they are instead of failing.
    int pos1=tempstr.indexOf("<table");
    int pos2=tempstr.indexOf("</table>");
    String tempnews;
    if (pos1>=0 && pos2>pos1)
    {
        tempnews=tempstr.substring(0,Math.max(0,pos1-2))+tempstr.substring(pos2+8);
    }
    else
    {
        tempnews=tempstr;
    }

    tempnews=tempnews.replaceAll("    "," ");
    news=tempnews.replaceAll("<br><br>  "," ");
    // drop the two leading characters, when there are that many
    news=(news.length()>2) ? news.substring(2) : "";
    return news.trim();
}

/*存储各新闻主题,URL,主题新闻对应的Details到news数据表*/

public static void storenews(String sub[],String con[],String link[],int len) throws SQLException,IOException{

Connection conn=null; //定义数据库连接

Statement stmt=null; //定义数据库操作表达式

String Isql="";

int i;

try{

Class.forName("com.microsoft.jdbc.sqlserver.SQLServerDriver");//实例化数据库,获取数据库连接

conn=DriverManager.getConnection("jdbc:microsoft:sqlserver://127.0.0.1:1433;DatabaseName=pubs;useUnicode=true&characterEncoding=gb2312","sa","888600");

stmt=conn.createStatement();

}catch(Exception e){

System.out.println("Exception: "+e.getMessage());

}

for (i=1;i<len;i++){