java写爬虫
数据库:MongoDB + Robo 3T;
工具:eclipse;
架构:无;
核心思想:启动懒人模式,可复用的方法全部做成接口;
界面(可选):swing;
模块一:get方法获取网页源码
备注:建立连接的参数,可以用fiddler抓包工具,来获取
/**
 * Fetches a page over HTTP and returns its body as text.
 *
 * <p>The request method is never set explicitly, so the connection uses
 * {@link HttpURLConnection}'s default method; the {@code Get} argument is
 * accepted for interface compatibility but is not used. Response lines are
 * concatenated without line separators.
 *
 * <p>Any failure (bad URL, I/O error, unsupported charset) is printed to
 * stderr and whatever was read so far is returned, possibly empty.
 *
 * @param Url    address of the page to download
 * @param Get    unused; kept so existing callers keep compiling
 * @param Accept value sent in the {@code Accept} request header
 * @param Cookie value sent in the {@code Cookie} request header, or
 *               {@code null} to send no cookie
 * @param BM     charset name used to decode the response body
 * @return the response text, never {@code null}
 */
public StringBuilder getInputStream(String Url,String Get,String Accept,String Cookie,String BM){
    StringBuilder resultStr = new StringBuilder();
    BufferedReader in = null;
    try {
        URL realUrl = new URL(Url);
        HttpURLConnection connection = (HttpURLConnection) realUrl.openConnection();
        // Common request headers (values can be captured with a proxy such as Fiddler).
        connection.setRequestProperty("Connection", "keep-alive");
        connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36");
        connection.setRequestProperty("Accept", Accept);
        if (Cookie != null) {
            connection.setRequestProperty("Cookie", Cookie);
        }
        // Open the actual connection.
        connection.connect();
        in = new BufferedReader(new InputStreamReader(connection.getInputStream(), BM));
        String line;
        while ((line = in.readLine()) != null) {
            // Lines are joined directly, without a separator.
            resultStr.append(line);
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Always release the response stream; the original never closed it.
        if (in != null) {
            try {
                in.close();
            } catch (Exception ignored) {
                // Nothing useful can be done if closing fails.
            }
        }
    }
    return resultStr;
}
模块二:post方法获取网页源代码(json格式数据)
/**
 * Sends {@code param} as the body of an HTTP POST and parses the response
 * as a JSON object.
 *
 * <p>The whole response body is read (lines joined with {@code '\n'}) before
 * parsing, so a JSON document spread over several lines is not truncated.
 * Both the request writer and the response reader are closed before
 * returning.
 *
 * <p>Any failure is printed and results in a {@code null} return value.
 *
 * @param strURL address to post to
 * @param param  request body (the callers pass a JSON string)
 * @return the parsed response, or {@code null} when the request or the
 *         parsing failed
 */
public static JSONObject post(String strURL,String param){
    String str = null;
    JSONObject str1 = null;
    PrintWriter out = null;
    BufferedReader in = null;
    try {
        URL url = new URL(strURL);
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        // Both flags are required for a POST that sends a body and reads a reply.
        connection.setDoOutput(true);
        connection.setDoInput(true);
        connection.setUseCaches(true);
        connection.setInstanceFollowRedirects(true);
        connection.setRequestMethod("POST");
        // Do not set a "Host" header by hand: per the original author's note it caused HTTP 400.
        connection.setRequestProperty("Connection", "keep-alive");
        connection.setRequestProperty("Accept", " */*");
        connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36");

        // Send the request body.
        // NOTE(review): the body is encoded with the platform default charset while the
        // response is decoded as UTF-8 — confirm the server expects that.
        out = new PrintWriter(connection.getOutputStream());
        out.print(param);
        out.flush();

        // Read the complete response; str stays null when the body is empty.
        in = new BufferedReader(new InputStreamReader(connection.getInputStream(), "UTF-8"));
        String line = in.readLine();
        if (line != null) {
            StringBuilder body = new StringBuilder(line);
            while ((line = in.readLine()) != null) {
                body.append('\n').append(line);
            }
            str = body.toString();
        }

        str1 = JSONObject.parseObject(str);
        System.out.println("post请求返回的String类型json"+str);
        System.out.println("post请求返回的json格式的json"+str1);
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println("post 获取分页或内容页出错");
    } finally {
        // Release both streams regardless of the outcome.
        if (out != null) {
            out.close();
        }
        if (in != null) {
            try {
                in.close();
            } catch (Exception ignored) {
                // Nothing useful can be done if closing fails.
            }
        }
    }
    return str1;
}
模块三:数据处理(这个需要特定情况特殊处理),来个实例如下
/**
 * Extracts the fields of one tender notice from its page source and wraps
 * them into documents via {@code ConnetToDB.getZTBDocuments}.
 *
 * <p>Sets the class-level fields {@code title}, {@code content},
 * {@code content1} (purchaser / tenderee) and {@code content4} (agency);
 * the remaining {@code contentN} fields are passed through as they are.
 *
 * <p>For both {@code content1} and {@code content4} every pattern is tried
 * in order and the LAST pattern that yields a non-empty match wins. Each
 * pattern is evaluated once per call.
 *
 * <p>NOTE(review): {@code content1} and {@code content4} are not reset here,
 * so when no pattern matches they keep the value they had before the call
 * (and must be non-null). Confirm that this carry-over is intended.
 *
 * @param originalcontent page source of the notice
 * @return the documents built by {@code ConnetToDB.getZTBDocuments}
 */
private static List<Document> dataProcessing(String originalcontent) {
    // Project title.
    title = CommonDataProcessing.getTitle(originalcontent, title);
    title = title.replace("招标公告", "项目");
    title = title.replace("-招标信息-中国招标信息网", "");

    // Project content.
    content = CommonDataProcessing.getContent(originalcontent, content);

    // Regular expressions for the purchaser / tenderee.
    String[] content1s=new String[]{"采购人:\\s*[\u4E00-\u9FA5]*\\(*\\(*[\u4E00-\u9FA5]*\\)*\\)*[\u4E00-\u9FA5]*","采 购 人:\\s*[\u4E00-\u9FA5]*\\(*\\(*[\u4E00-\u9FA5]*\\)*\\)*[\u4E00-\u9FA5]*","招标人:\\s*[\u4E00-\u9FA5]*\\(*\\(*[\u4E00-\u9FA5]*\\)*\\)*[\u4E00-\u9FA5]*","招 标 人:\\s*[\u4E00-\u9FA5]*\\(*\\(*[\u4E00-\u9FA5]*\\)*\\)*[\u4E00-\u9FA5]*","建设单位:\\s*[\u4E00-\u9FA5]*\\(*\\(*[\u4E00-\u9FA5]*\\)*\\)*[\u4E00-\u9FA5]*","招标单位:\\s*[\u4E00-\u9FA5]*\\(*\\(*[\u4E00-\u9FA5]*\\)*\\)*[\u4E00-\u9FA5]*","招标人[0-9].[0-9].*[0-9]*名称:\\s*[\u4E00-\u9FA5]*\\(*\\(*[\u4E00-\u9FA5]*\\)*\\)*[\u4E00-\u9FA5]*","采购单位:\\s*[\u4E00-\u9FA5]*\\(*\\(*[\u4E00-\u9FA5]*\\)*\\)*[\u4E00-\u9FA5]*","招标人信息 名称:\\s*[\u4E00-\u9FA5]*\\(*\\(*[\u4E00-\u9FA5]*\\)*\\)*[\u4E00-\u9FA5]*","项目单位:\\s*[\u4E00-\u9FA5]*\\(*\\(*[\u4E00-\u9FA5]*\\)*\\)*[\u4E00-\u9FA5]*","受[\u4E00-\u9FA5]*\\(*\\(*[\u4E00-\u9FA5]*\\)*\\)*[\u4E00-\u9FA5]+委托","由[\u4E00-\u9FA5]*\\(*\\(*[\u4E00-\u9FA5]*\\)*\\)*[\u4E00-\u9FA5]+委托","招标人[\u4E00-\u9FA5]*公司","招标人[\u4E00-\u9FA5]*法[\u4E00-\u9FA5]+代表","采购[\u4E00-\u9FA5]*名称:\\s*[\u4E00-\u9FA5]*\\(*\\(*[\u4E00-\u9FA5]*\\)*\\)*[\u4E00-\u9FA5]*","招标[\u4E00-\u9FA5]*名称:\\s*[\u4E00-\u9FA5]*\\(*\\(*[\u4E00-\u9FA5]*\\)*\\)*[\u4E00-\u9FA5]*","采购[\u4E00-\u9FA5]*名称[\u4E00-\u9FA5]*采购[\u4E00-\u9FA5]*地址"};
    // Regular expressions for the tendering agency.
    String[] content4s=new String[]{"代理[\u4E00-\u9FA5]*\\s*:\\s*[\u4E00-\u9FA5]*\\(*\\(*[\u4E00-\u9FA5]*\\)*\\)*[\u4E00-\u9FA5]*公司","代理[\u4E00-\u9FA5]*\\s*:\\s*[\u4E00-\u9FA5]*\\(*\\(*[\u4E00-\u9FA5]*\\)*\\)*[\u4E00-\u9FA5]*事务所","代理[\u4E00-\u9FA5]*\\s*:\\s*[\u4E00-\u9FA5]*\\(*\\(*[\u4E00-\u9FA5]*\\)*\\)*[\u4E00-\u9FA5]*中心","单位名称[\u4E00-\u9FA5]*\\s*:\\s*[\u4E00-\u9FA5]*\\(*\\(*[\u4E00-\u9FA5]*\\)*\\)*[\u4E00-\u9FA5]*公司","招标机构:\\s*[\u4E00-\u9FA5]*\\(*\\(*[\u4E00-\u9FA5]*\\)*\\)*[\u4E00-\u9FA5]*","全权委托[\u4E00-\u9FA5]*\\(*\\(*[\u4E00-\u9FA5]*\\)*\\)*[\u4E00-\u9FA5]*公司"};

    // Purchaser: the last pattern with a non-empty match wins (no early exit).
    for (String pattern : content1s) {
        String hit = ZhengZeBiaoDaShi.Stringzhengzebiaodashi(originalcontent, pattern);
        if (!hit.equals("")) {
            content1 = hit;
        }
    }
    // Drop the leading "label:" part, then the remaining label words, in this exact order.
    content1 = content1.replaceAll("[\u4E00-\u9FA5]*:", "");
    String[] purchaserNoise = {"经办人", "执行机构", "地址", "招标人", "联系人", "采购人", "全权委托", "委托", "招 标 ", "受", "由", "法定代表"};
    for (String noise : purchaserNoise) {
        content1 = content1.replace(noise, "");
    }

    // Agency: same "last match wins" rule.
    for (String pattern : content4s) {
        String hit = ZhengZeBiaoDaShi.Stringzhengzebiaodashi(originalcontent, pattern);
        if (!hit.equals("")) {
            content4 = hit;
        }
    }
    content4 = content4.replaceAll("[\u4E00-\u9FA5]*:", "");
    // The repeated trailing "代理机构" passes are kept from the original: removing
    // another word can splice a new occurrence together.
    String[] agencyNoise = {"代理机构", "地址", "名称", "联系方式", "账户信息", "项目联系人", "全权委托", "委托", "代理机构", "代理机构", "代理机构"};
    for (String noise : agencyNoise) {
        content4 = content4.replace(noise, "");
    }

    List<Document> documents=ConnetToDB.getZTBDocuments(title,content,content0,content1,content2,content3,content4,content5,content6);
    return documents;
}
/**
 * Extracts the text of the first {@code <title>} element from a page and
 * normalises it: "招标公告" becomes "项目" and the site suffix
 * "-招标信息-中国招标信息网" is removed.
 *
 * <p>Returns an empty string instead of throwing when the page is
 * {@code null} or has no well-formed {@code <title>...</title>} pair.
 *
 * @param originalcontent page source
 * @param title           ignored; kept so existing callers keep compiling
 * @return the normalised title, or {@code ""} when none can be found
 */
public static String getTitle(String originalcontent,String title){
    if (originalcontent == null) {
        return "";
    }
    int open = originalcontent.indexOf("<title>");
    if (open < 0) {
        return "";
    }
    // Look for the closing tag only after the opening one, so a stray
    // "</title>" earlier in the page cannot produce an invalid range.
    int close = originalcontent.indexOf("</title>", open);
    if (close < 0) {
        return "";
    }
    String text = originalcontent.substring(open, close);
    text = text.replace("招标公告", "项目");
    text = text.replace("-招标信息-中国招标信息网", "");
    text = text.replace("<title>", "");
    return text;
}
/**
 * Reduces a page to plain text: removes every HTML tag, then removes
 * space characters.
 *
 * <p>Besides the ASCII space, the {@code &nbsp;} entity, the no-break space
 * (U+00A0) and the ideographic space (U+3000) are removed as well; the
 * original repeated the same ASCII-space replacement three times, which
 * left those variants in the text.
 *
 * @param originalcontent page source
 * @param content         ignored; kept so existing callers keep compiling
 * @return the text without tags and spaces, or {@code ""} for {@code null}
 *         input
 */
public static String getContent(String originalcontent,String content){
    if (originalcontent == null) {
        return "";
    }
    String text = originalcontent.replaceAll("<[^>]+>", "");
    text = text.replace("&nbsp;", "");
    // ASCII space, no-break space and ideographic (full-width) space.
    return text.replaceAll("[ \\u00A0\\u3000]", "");
}
模块四:连接db数据库,获取实例对象,插入操作等可以通过对象来实现
collection.insertMany(documents);//即可插入到数据库
注意引入包:
import org.bson.Document;
import com.mongodb.client.MongoCollection;
驱动:将 mongo-java-driver 的 jar 包添加到项目的构建路径(Build Path)
//操作方法1--连接服务--数据库--返回集合对象
//new MongoClient("localhost",27017);
/**
 * Opens a client for the MongoDB server at {@code host:port} and returns
 * the named collection of the named database.
 *
 * <p>A new {@code MongoClient} is created on every call and is not closed
 * by this method. Documents can then be stored through the returned
 * collection, e.g. with {@code insertMany}.
 *
 * @param host       server host name, e.g. "localhost"
 * @param dataBase   database name
 * @param port       server port, e.g. 27017
 * @param Collection collection name
 * @return the requested collection
 */
public static MongoCollection<Document> ConnetToCollection(String host,String dataBase,int port,String Collection){
    MongoClient client = new MongoClient(host, port);
    MongoDatabase database = client.getDatabase(dataBase);
    return database.getCollection(Collection);
}
模块五:主体方法,循环获取url,暂不赘述,正则表达式可以实现;还是给个实例
//六、招标信息网
for(int s=11;s<=40;s++){
String aString="";
//初始页需要看情况进行修改
//province=:云南40、
//陕西30、宁夏31、甘肃32、新疆33、青海34、西藏35、天津36、重庆37、内蒙古38、黑龙江39
//福建20、江苏21、浙江22、安徽23、贵州24、四川25、江西26、海南27、吉林28、辽宁29、
//北京10(1108页)、上海11、广东12、广西13、湖南14、湖北15、山西16、山东17、河南18、河北19、
switch (s) {
case 10:aString="北京";break;
case 11:aString="上海";break;
case 12:aString="广东";break;
case 13:aString="广西";break;
case 14:aString="湖南";break;
case 15:aString="湖北";break;
case 16:aString="山西";break;
case 17:aString="山东";break;
case 18:aString="河南";break;
case 19:aString="河北";break;
case 20:aString="福建";break;
case 21:aString="江苏";break;
case 22:aString="浙江";break;
case 23:aString="安徽";break;
case 24:aString="贵州";break;
case 25:aString="四川";break;
case 26:aString="江西";break;
case 27:aString="海南";break;
case 28:aString="吉林";break;
case 29:aString="辽宁";break;
case 30:aString="陕西";break;
case 31:aString="宁夏";break;
case 32:aString="甘肃";break;
case 33:aString="新疆";break;
case 34:aString="青海";break;
case 35:aString="西藏";break;
case 36:aString="天津";break;
case 37:aString="重庆";break;
case 38:aString="内蒙古";break;
case 39:aString="黑龙江";break;
case 40:aString="云南";break;
}
int noYeMax=10;//初始化总页数
for(int noYe=1;noYe<=noYeMax;noYe++){//最多1108页
System.out.println(noYeMax);
//http://www.cnbidding.com/notice/search.php?page=2&industry=¬icetype=&bidtype=&usetype=&province=10&keyword=&down=
String urlZhaoBiaoXinXiFirst="http://www.cnbidding.com/notice/search.php?page="+noYe+"&industry=¬icetype=&bidtype=&usetype=&province="+s+"&keyword=&down= ";
//get请求获取返回源码已写好,可以直接调用
StringBuilder StringBuilderZhaoBiaoXinXiFirst=new GetInputStream().getInputStream(urlZhaoBiaoXinXiFirst, Get, Accept, Cookie, BM);//模拟进入分页,获取分页返回的json
// System.out.println("***首页输入流***"+firstPageStringBuilder);
String contentZhaoBiaoXinXiFirst=StringBuilderZhaoBiaoXinXiFirst.toString().replaceAll("\"", "");
// System.out.println("去掉双引号:"+firstPageString);
//<td><h2><a href="http://www.cnbidding.com/notice/disp_bnid_n5af53a5b26cfc.html" target="_blank">
String ZhaoBiaoXinXiFirstGuiZe="http://www.cnbidding.com/notice/disp_bnid_+[0-9a-z]*.html";
ArrayList ZhaoBiaoXinXiUrls=ZhengZeBiaoDaShi.Listzhengzebiaodashi(contentZhaoBiaoXinXiFirst, ZhaoBiaoXinXiFirstGuiZe);
//获取分页的总页数
String noYeMaxGuize="共[0-9]*页";
String noYeMaxs=ZhengZeBiaoDaShi.Stringzhengzebiaodashi(contentZhaoBiaoXinXiFirst, noYeMaxGuize);
noYeMaxs=noYeMaxs.replaceAll("共", "");
noYeMaxs=noYeMaxs.replaceAll("页", "");
noYeMax=Integer.valueOf(noYeMaxs);//总页数
for(int urlNo=0;urlNo<ZhaoBiaoXinXiUrls.size();urlNo++){
if(ConnetToDB.isExists(collection,"网址",ZhaoBiaoXinXiUrls.get(urlNo).toString())){
System.out.println("已在库中:"+ZhaoBiaoXinXiUrls.get(urlNo).toString());
}else{
content0=ZhaoBiaoXinXiUrls.get(urlNo).toString();//url赋值
StringBuilder stringBuilderZhaoBiaoXinXiJuTi=new GetInputStream().getInputStream(ZhaoBiaoXinXiUrls.get(urlNo).toString(), Get, Accept, Cookie, BM);
String contentZhaoBiaoXinXi=stringBuilderZhaoBiaoXinXiJuTi.toString().replaceAll("\"", "");
//数据处理,各字段存入文档,将文档存入数据库
List<Document> documents=dataProcessing(contentZhaoBiaoXinXi);
collection.insertMany(documents);
}
}
//存入数据库
}//页循环完毕
}//省循环完毕
}//结束
swing部分可选如下
// Content panels of the main window. The JFormFirst constructor gives each of
// them a null layout and an empty border, and initially shows only
// contentPaneFirstPage (which it also installs as the frame's content pane).
static JPanel contentPaneGGZBFWPT = new JPanel();
static JPanel contentPaneQLM = new JPanel();
static JPanel contentPaneQGZBTB=new JPanel();
static JPanel contentPaneSZ=new JPanel();
static JPanel contentPaneNJ=new JPanel();
static JPanel contentPaneFirstPage=new JPanel();
/**
 * Application entry point: applies the system look and feel, then creates
 * and shows the main window on the event dispatch thread.
 *
 * @param args command-line arguments (unused)
 * @throws RuntimeException if the look and feel cannot be applied
 */
public static void main(String[] args) {
    // Look and feel first; a failure here is treated as fatal.
    try {
        String systemLookAndFeel = UIManager.getSystemLookAndFeelClassName();
        UIManager.setLookAndFeel(systemLookAndFeel);
    } catch (Exception lookAndFeelFailure) {
        throw new RuntimeException(lookAndFeelFailure);
    }

    // Build and show the first window; failures are only printed.
    Runnable showFirstWindow = new Runnable() {
        public void run() {
            try {
                JFormFirst window = new JFormFirst();
                window.setVisible(true);
            } catch (Exception startupFailure) {
                startupFailure.printStackTrace();
            }
        }
    };
    EventQueue.invokeLater(showFirstWindow);
}
/**
 * Builds the main window: configures the content panels, shows the first
 * page, and adds the URL field, the "(*)" placeholder button and the start
 * button to it.
 */
public JFormFirst() {
    // Window properties.
    setTitle("Welcom to the World of Spiderman");
    setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
    setBounds(100, 100, 1700, 500);

    // Every panel uses absolute positioning and a 5px empty border;
    // only the first page is visible initially.
    JPanel[] panels = {
        contentPaneGGZBFWPT, contentPaneQLM, contentPaneQGZBTB,
        contentPaneSZ, contentPaneNJ, contentPaneFirstPage
    };
    for (JPanel panel : panels) {
        panel.setLayout(null);
        panel.setBorder(new EmptyBorder(5, 5, 5, 5));
        panel.setVisible(panel == contentPaneFirstPage);
    }
    // A frame has a single content pane; earlier setContentPane calls were
    // overwritten by this one, so only it is kept.
    setContentPane(contentPaneFirstPage);

    TextFieldFirstUrl.setBounds(300, 24, 300, 24);
    contentPaneFirstPage.add(TextFieldFirstUrl);
    TextFieldFirstUrl.setColumns(10);

    // Button that inserts the "(*)" placeholder into the start URL.
    JButton ButtonCanshu = new JButton("(*)");
    ButtonCanshu.addMouseListener(new MouseAdapter() {
        @Override
        public void mouseClicked(MouseEvent arg0) {
            // Insert at the caret into the field's current text. The field's
            // own text is used so the caret index is always valid, and a
            // caret at position 0 prepends instead of replacing the text.
            String current = TextFieldFirstUrl.getText();
            if (current == null) {
                current = "";
            }
            int pos = Math.max(0, Math.min(TextFieldFirstUrl.getCaretPosition(), current.length()));
            TextFieldFirstUrl.setText(current.substring(0, pos) + "(*)" + current.substring(pos));
        }
    });
    ButtonCanshu.setBounds(600, 24, 60, 24);
    contentPaneFirstPage.add(ButtonCanshu);

    // Start button: dispatches on the selected download mode.
    JButton ButtonFirstPageStart = new JButton("启动");
    ButtonFirstPageStart.addMouseListener(new MouseAdapter() {
        @Override
        public void mouseClicked(MouseEvent arg0) {
            System.out.println("url:"+firstUrl+"\r\n"+"关键字:"+rule+"\r\n"+"请求方式:"+Get+"\r\n"+"起始页:"+startNoPage+"\r\n"+"结束页:"+endNoPage);
            // Literal-first equals keeps each check safe when getType is null.
            if ("全文下载保存txt".equals(getType)) {
                // Validate the parameters before starting the task.
                boolean resultChencFirstSituation = FirstSituation.checkFirstSituation(firstUrl, Get, startNoPage, endNoPage, rule);
                if (resultChencFirstSituation) {
                    FirstSituation.firstSituation(firstUrl, Get, startNoPage, endNoPage, rule);
                } else {
                    System.out.println("任务启动失败!");
                }
            }
            if ("关键字段下载保存excel".equals(getType)) {
                JsonZhaoBiao2.dateconsole();
                System.out.println("lalalal");
            }
            // The remaining two modes are not implemented yet.
            if ("第三种情景".equals(getType)) {
                JOptionPane.showMessageDialog(null, "开发未完成,待续");
                System.out.println("开发未完成,待续");
            }
            if ("第四种情景".equals(getType)) {
                JOptionPane.showMessageDialog(null, "开发未完成,待续");
                System.out.println("开发未完成,待续");
            }
        }
    });
    ButtonFirstPageStart.setBounds(1250, 48, 100, 24);
    contentPaneFirstPage.add(ButtonFirstPageStart);
}