java爬虫

本文介绍用Java写爬虫的相关内容。使用MongoDB和Robo 3T数据库,借助Eclipse工具。核心思想是将可复用方法做成接口,可选Swing界面。包含获取网页源码的get和post方法、数据处理、连接数据库插入数据等模块,还提及驱动添加路径等注意事项。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

java写爬虫

数据库:MongoDB + Robo 3T(图形化客户端);

工具:eclipse;

架构:无;

核心思想:启动懒人模式,可复用的方法全部做成接口;

界面(可选):swing;

模块一:get方法获取网页源码

备注:建立连接的参数,可以用fiddler抓包工具,来获取

/**
 * Fetches a URL over HTTP and returns the response body.
 *
 * <p>Response lines are appended back to back, so line terminators are not
 * kept in the result. Any failure is printed to stderr and whatever was read
 * up to that point (possibly nothing) is returned; this method does not throw.
 *
 * @param Url    address to request
 * @param Get    request method; currently unused, so the connection's default method applies
 * @param Accept value sent in the {@code Accept} request header
 * @param Cookie value sent in the {@code Cookie} request header, or {@code null} to send none
 * @param BM     charset name used to decode the response; {@code null} or empty falls back to UTF-8
 * @return the accumulated response text, never {@code null}
 */
public StringBuilder getInputStream(String Url,String Get,String Accept,String Cookie,String BM){
	StringBuilder resultStr=new StringBuilder();
	// A missing charset used to make InputStreamReader throw, which silently produced an empty page.
	String charsetName=(BM==null||BM.isEmpty())?"UTF-8":BM;
	try {
		URL realUrl = new URL(Url);
		HttpURLConnection connection=(HttpURLConnection) realUrl.openConnection();
		// Browser-like request headers
		connection.setRequestProperty("Connection", "keep-alive");
		connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36");
		connection.setRequestProperty("Accept", Accept);
		if(Cookie!=null){
			connection.setRequestProperty("Cookie", Cookie);
		}
		// Open the actual connection
		connection.connect();
		// try-with-resources closes the reader (and the underlying stream) even when reading fails.
		try (BufferedReader in=new BufferedReader(new InputStreamReader(connection.getInputStream(),charsetName))) {
			String line;
			while((line=in.readLine())!=null){
				resultStr.append(line);// appended directly, without a line separator
			}
		}
	}catch(Exception e){
		// Best effort: report the problem and return what has been read so far.
		e.printStackTrace();
	}
	return resultStr;
}

模块二:post方法获取网页源代码(json格式数据)

/**
 * Sends {@code param} as the body of an HTTP POST and parses the reply as a JSON object.
 *
 * <p>The whole response is read (lines joined without separators) before it is
 * parsed, and both the request writer and the response reader are closed when
 * done. Any failure is printed and results in a {@code null} return value.
 *
 * @param strURL address to post to
 * @param param  request body, expected to be a JSON string
 * @return the parsed response, or {@code null} when the request or the parsing failed
 */
public static JSONObject post(String strURL,String param){
	String str = null;
	JSONObject str1 = null;
	try {
		URL url=new URL(strURL);
		HttpURLConnection connection=(HttpURLConnection) url.openConnection();
		connection.setDoOutput(true);// both flags are required to send a body and read the reply
		connection.setDoInput(true);
		connection.setUseCaches(true);// optional settings
		connection.setInstanceFollowRedirects(true);
		connection.setRequestMethod("POST");
		// Do not set a "Host" header by hand: the original author observed HTTP 400 responses when doing so.
		connection.setRequestProperty("Connection", "keep-alive");
		connection.setRequestProperty("Accept", " */*");
		connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36");

		// Send the request body; closing the writer releases the output stream.
		// NOTE(review): PrintWriter encodes with the platform default charset while the
		// reply is decoded as UTF-8 - confirm which encoding the server expects.
		try (PrintWriter out = new PrintWriter(connection.getOutputStream())) {
			out.print(param);
			out.flush();
		}

		// Read every line of the reply; a JSON document may span several lines.
		try (BufferedReader in=new BufferedReader(new InputStreamReader(connection.getInputStream(),"UTF-8"))) {
			String line=in.readLine();
			if(line!=null){
				StringBuilder body=new StringBuilder(line);
				while((line=in.readLine())!=null){
					body.append(line);
				}
				str=body.toString();
			}
		}
		// Convert the raw text into a JSON object
		str1 = JSONObject.parseObject(str);
		System.out.println("post请求返回的String类型json"+str);
		System.out.println("post请求返回的json格式的json"+str1);
	} catch (Exception e) {
		e.printStackTrace();
		System.out.println("post 获取分页或内容页出错");
	}
	return str1;
}

模块三:数据处理(这个需要特定情况特殊处理),来个实例如下

/**
 * Extracts the fields of one tender notice from its page source and packs them
 * into the documents that are stored in the database.
 *
 * <p>The method works on shared fields of the enclosing class: it overwrites
 * {@code title} and {@code content}, updates {@code content1} (purchaser) and
 * {@code content4} (agency) when a pattern matches, and passes
 * {@code content0} ... {@code content6} on unchanged otherwise.
 *
 * <p>NOTE(review): {@code content1} and {@code content4} keep the value they had
 * before the call when no pattern matches - confirm that callers reset them
 * between pages.
 *
 * @param originalcontent page source of the notice
 * @return the documents built by {@code ConnetToDB.getZTBDocuments}
 */
private static List<Document> dataProcessing(String originalcontent) {
	// Project name
	title=CommonDataProcessing.getTitle(originalcontent,title);
	title=title.replaceAll("招标公告", "项目");
	title=title.replaceAll("-招标信息-中国招标信息网", "");
	// Project text
	content=CommonDataProcessing.getContent(originalcontent, content);

	// Regular expressions that locate the purchaser
	String[] content1s=new String[]{
		"采购人:\\s*[\u4E00-\u9FA5]*\\(*\\(*[\u4E00-\u9FA5]*\\)*\\)*[\u4E00-\u9FA5]*",
		"采 购 人:\\s*[\u4E00-\u9FA5]*\\(*\\(*[\u4E00-\u9FA5]*\\)*\\)*[\u4E00-\u9FA5]*",
		"招标人:\\s*[\u4E00-\u9FA5]*\\(*\\(*[\u4E00-\u9FA5]*\\)*\\)*[\u4E00-\u9FA5]*",
		"招 标 人:\\s*[\u4E00-\u9FA5]*\\(*\\(*[\u4E00-\u9FA5]*\\)*\\)*[\u4E00-\u9FA5]*",
		"建设单位:\\s*[\u4E00-\u9FA5]*\\(*\\(*[\u4E00-\u9FA5]*\\)*\\)*[\u4E00-\u9FA5]*",
		"招标单位:\\s*[\u4E00-\u9FA5]*\\(*\\(*[\u4E00-\u9FA5]*\\)*\\)*[\u4E00-\u9FA5]*",
		"招标人[0-9].[0-9].*[0-9]*名称:\\s*[\u4E00-\u9FA5]*\\(*\\(*[\u4E00-\u9FA5]*\\)*\\)*[\u4E00-\u9FA5]*",
		"采购单位:\\s*[\u4E00-\u9FA5]*\\(*\\(*[\u4E00-\u9FA5]*\\)*\\)*[\u4E00-\u9FA5]*",
		"招标人信息 名称:\\s*[\u4E00-\u9FA5]*\\(*\\(*[\u4E00-\u9FA5]*\\)*\\)*[\u4E00-\u9FA5]*",
		"项目单位:\\s*[\u4E00-\u9FA5]*\\(*\\(*[\u4E00-\u9FA5]*\\)*\\)*[\u4E00-\u9FA5]*",
		"受[\u4E00-\u9FA5]*\\(*\\(*[\u4E00-\u9FA5]*\\)*\\)*[\u4E00-\u9FA5]+委托",
		"由[\u4E00-\u9FA5]*\\(*\\(*[\u4E00-\u9FA5]*\\)*\\)*[\u4E00-\u9FA5]+委托",
		"招标人[\u4E00-\u9FA5]*公司",
		"招标人[\u4E00-\u9FA5]*法[\u4E00-\u9FA5]+代表",
		"采购[\u4E00-\u9FA5]*名称:\\s*[\u4E00-\u9FA5]*\\(*\\(*[\u4E00-\u9FA5]*\\)*\\)*[\u4E00-\u9FA5]*",
		"招标[\u4E00-\u9FA5]*名称:\\s*[\u4E00-\u9FA5]*\\(*\\(*[\u4E00-\u9FA5]*\\)*\\)*[\u4E00-\u9FA5]*",
		"采购[\u4E00-\u9FA5]*名称[\u4E00-\u9FA5]*采购[\u4E00-\u9FA5]*地址"
	};
	// Regular expressions that locate the tendering agency
	String[] content4s=new String[]{
		"代理[\u4E00-\u9FA5]*\\s*:\\s*[\u4E00-\u9FA5]*\\(*\\(*[\u4E00-\u9FA5]*\\)*\\)*[\u4E00-\u9FA5]*公司",
		"代理[\u4E00-\u9FA5]*\\s*:\\s*[\u4E00-\u9FA5]*\\(*\\(*[\u4E00-\u9FA5]*\\)*\\)*[\u4E00-\u9FA5]*事务所",
		"代理[\u4E00-\u9FA5]*\\s*:\\s*[\u4E00-\u9FA5]*\\(*\\(*[\u4E00-\u9FA5]*\\)*\\)*[\u4E00-\u9FA5]*中心",
		"单位名称[\u4E00-\u9FA5]*\\s*:\\s*[\u4E00-\u9FA5]*\\(*\\(*[\u4E00-\u9FA5]*\\)*\\)*[\u4E00-\u9FA5]*公司",
		"招标机构:\\s*[\u4E00-\u9FA5]*\\(*\\(*[\u4E00-\u9FA5]*\\)*\\)*[\u4E00-\u9FA5]*",
		"全权委托[\u4E00-\u9FA5]*\\(*\\(*[\u4E00-\u9FA5]*\\)*\\)*[\u4E00-\u9FA5]*公司"
	};

	// Purchaser: every pattern is tried and the last one that matches wins
	// (there is deliberately no early exit). The helper is evaluated once per pattern.
	for(String pattern:content1s){
		String hit=ZhengZeBiaoDaShi.Stringzhengzebiaodashi(originalcontent, pattern);
		if(!hit.equals("")){
			content1=hit;
		}
	}
	// Strip labels and filler words from the match; the order of the passes is significant.
	String[] purchaserNoise={
		"[\u4E00-\u9FA5]*:", "经办人", "执行机构", "地址", "招标人", "联系人", "采购人",
		"全权委托", "委托", "招 标 ", "受", "由", "法定代表"
	};
	for(String noise:purchaserNoise){
		content1=content1.replaceAll(noise, "");
	}

	// Agency: same strategy, last matching pattern wins.
	for(String pattern:content4s){
		String hit=ZhengZeBiaoDaShi.Stringzhengzebiaodashi(originalcontent, pattern);
		if(!hit.equals("")){
			content4=hit;
		}
	}
	// "代理机构" is listed several times on purpose: removing another word can splice
	// a new occurrence together, so the later passes are not strictly no-ops.
	String[] agencyNoise={
		"[\u4E00-\u9FA5]*:", "代理机构", "地址", "名称", "联系方式", "账户信息", "项目联系人",
		"全权委托", "委托", "代理机构", "代理机构", "代理机构"
	};
	for(String noise:agencyNoise){
		content4=content4.replaceAll(noise, "");
	}

	return ConnetToDB.getZTBDocuments(title,content,content0,content1,content2,content3,content4,content5,content6);
}
/**
 * Extracts the text of the first {@code <title>} element and normalises it
 * (the site suffix is dropped and "招标公告" is replaced by "项目").
 *
 * @param originalcontent page source; may be {@code null}
 * @param title           ignored; kept so existing callers keep compiling
 * @return the cleaned title, or an empty string when the page has no complete
 *         {@code <title>...</title>} pair
 */
public static String getTitle(String originalcontent,String title){
	if(originalcontent==null){
		return "";
	}
	int open=originalcontent.indexOf("<title>");
	// Look for the closing tag only after the opening one, so a stray "</title>" earlier in the page cannot break the slice.
	int close=open<0?-1:originalcontent.indexOf("</title>", open);
	if(close<0){
		// Previously this case ended in a StringIndexOutOfBoundsException.
		return "";
	}
	String raw=originalcontent.substring(open, close);
	// Literal replacements, applied in the same order as before.
	return raw.replace("招标公告", "项目")
			.replace("-招标信息-中国招标信息网", "")
			.replace("<title>", "");
}
	
	/**
	 * Reduces page source to bare text: markup tags are removed first, then the
	 * {@code &nbsp;} entity, then the two space literals below, in that order.
	 *
	 * @param originalcontent page source to clean
	 * @param content         ignored; the result is derived from {@code originalcontent} only
	 * @return the cleaned text
	 */
	public static String getContent(String originalcontent,String content){
		String withoutTags=originalcontent.replaceAll("<[^>]+>","");
		return withoutTags
				.replaceAll("&nbsp;","")
				.replaceAll("  ","")
				.replaceAll(" ","");
	}

 

模块四:连接 MongoDB 数据库,获取集合(collection)对象,插入等操作都可以通过该对象来实现

collection.insertMany(documents);//即可插入到数据库

注意引入包:

import org.bson.Document;

import com.mongodb.client.MongoCollection;

驱动:将 mongo-java-driver 的 jar 包添加到项目的构建路径(Build Path)

//操作方法1--连接服务--数据库--返回集合对象
	//new MongoClient("localhost",27017);
	/** Clients opened so far by {@code ConnetToCollection}, keyed by "host:port". */
	private static final java.util.Map<String, MongoClient> OPEN_CLIENTS = new java.util.HashMap<String, MongoClient>();

	/**
	 * Connects to a MongoDB server and returns a handle on one collection.
	 * Documents can then be stored through the handle, e.g. with
	 * {@code collection.insertMany(documents)}.
	 *
	 * <p>One client is kept per host/port pair and reused by later calls. The
	 * caller only receives the collection and therefore has no way to close the
	 * client, so creating a fresh client on every call would leave each of them
	 * open for the lifetime of the process.
	 *
	 * @param host       server host name, e.g. {@code "localhost"}
	 * @param dataBase   database name
	 * @param port       server port, e.g. {@code 27017}
	 * @param Collection collection name
	 * @return the requested collection
	 */
	public static MongoCollection<Document> ConnetToCollection(String host,String dataBase,int port,String Collection){
		MongoClient mongoClient;
		synchronized (OPEN_CLIENTS) {
			String key=host+":"+port;
			mongoClient=OPEN_CLIENTS.get(key);
			if(mongoClient==null){
				// First request for this server: open the connection (host, port)
				mongoClient=new MongoClient(host,port);
				OPEN_CLIENTS.put(key, mongoClient);
			}
		}
		// Select the database by name, then the collection by name
		MongoDatabase mongoDatabase=mongoClient.getDatabase(dataBase);
		return mongoDatabase.getCollection(Collection);
	}


模块五:主体方法,循环获取url,暂不赘述,正则表达式可以实现;还是给个实例

	//六、招标信息网
		for(int s=11;s<=40;s++){
			String aString="";
			//初始页需要看情况进行修改
			//province=:云南40、
			//陕西30、宁夏31、甘肃32、新疆33、青海34、西藏35、天津36、重庆37、内蒙古38、黑龙江39
			//福建20、江苏21、浙江22、安徽23、贵州24、四川25、江西26、海南27、吉林28、辽宁29、
			//北京10(1108页)、上海11、广东12、广西13、湖南14、湖北15、山西16、山东17、河南18、河北19、
			switch (s) {
			case 10:aString="北京";break;
			case 11:aString="上海";break;
			case 12:aString="广东";break;
			case 13:aString="广西";break;
			case 14:aString="湖南";break;
			case 15:aString="湖北";break;
			case 16:aString="山西";break;
			case 17:aString="山东";break;
			case 18:aString="河南";break;
			case 19:aString="河北";break;
			case 20:aString="福建";break;
			case 21:aString="江苏";break;
			case 22:aString="浙江";break;
			case 23:aString="安徽";break;
			case 24:aString="贵州";break;
			case 25:aString="四川";break;
			case 26:aString="江西";break;
			case 27:aString="海南";break;
			case 28:aString="吉林";break;
			case 29:aString="辽宁";break;
			case 30:aString="陕西";break;
			case 31:aString="宁夏";break;
			case 32:aString="甘肃";break;
			case 33:aString="新疆";break;
			case 34:aString="青海";break;
			case 35:aString="西藏";break;
			case 36:aString="天津";break;
			case 37:aString="重庆";break;
			case 38:aString="内蒙古";break;
			case 39:aString="黑龙江";break;
			case 40:aString="云南";break;	
			}
			int noYeMax=10;//初始化总页数
		for(int noYe=1;noYe<=noYeMax;noYe++){//最多1108页
			System.out.println(noYeMax);
			//http://www.cnbidding.com/notice/search.php?page=2&industry=&noticetype=&bidtype=&usetype=&province=10&keyword=&down=
			String urlZhaoBiaoXinXiFirst="http://www.cnbidding.com/notice/search.php?page="+noYe+"&industry=&noticetype=&bidtype=&usetype=&province="+s+"&keyword=&down= ";
			//get请求获取返回源码已写好,可以直接调用
			StringBuilder StringBuilderZhaoBiaoXinXiFirst=new GetInputStream().getInputStream(urlZhaoBiaoXinXiFirst, Get, Accept, Cookie, BM);//模拟进入分页,获取分页返回的json
//			System.out.println("***首页输入流***"+firstPageStringBuilder);
			String contentZhaoBiaoXinXiFirst=StringBuilderZhaoBiaoXinXiFirst.toString().replaceAll("\"", "");
//			System.out.println("去掉双引号:"+firstPageString);
			//<td><h2><a href="http://www.cnbidding.com/notice/disp_bnid_n5af53a5b26cfc.html" target="_blank">
			String ZhaoBiaoXinXiFirstGuiZe="http://www.cnbidding.com/notice/disp_bnid_+[0-9a-z]*.html";
			ArrayList ZhaoBiaoXinXiUrls=ZhengZeBiaoDaShi.Listzhengzebiaodashi(contentZhaoBiaoXinXiFirst, ZhaoBiaoXinXiFirstGuiZe);
			//获取分页的总页数
			String noYeMaxGuize="共[0-9]*页";
			String noYeMaxs=ZhengZeBiaoDaShi.Stringzhengzebiaodashi(contentZhaoBiaoXinXiFirst, noYeMaxGuize);
			noYeMaxs=noYeMaxs.replaceAll("共", "");
			noYeMaxs=noYeMaxs.replaceAll("页", "");
			noYeMax=Integer.valueOf(noYeMaxs);//总页数
			
			for(int urlNo=0;urlNo<ZhaoBiaoXinXiUrls.size();urlNo++){
				if(ConnetToDB.isExists(collection,"网址",ZhaoBiaoXinXiUrls.get(urlNo).toString())){
					System.out.println("已在库中:"+ZhaoBiaoXinXiUrls.get(urlNo).toString());
				}else{
				content0=ZhaoBiaoXinXiUrls.get(urlNo).toString();//url赋值
				StringBuilder stringBuilderZhaoBiaoXinXiJuTi=new GetInputStream().getInputStream(ZhaoBiaoXinXiUrls.get(urlNo).toString(), Get, Accept, Cookie, BM);
				String contentZhaoBiaoXinXi=stringBuilderZhaoBiaoXinXiJuTi.toString().replaceAll("\"", "");
				//数据处理,各字段存入文档,将文档存入数据库
				List<Document> documents=dataProcessing(contentZhaoBiaoXinXi);
				collection.insertMany(documents);
				}
			}
			//存入数据库
			
		}//页循环完毕
	}//省循环完毕
}//结束

swing部分可选如下

// Panels of the window. They are shared as static fields; the JFormFirst()
// constructor gives each one a null layout and an empty border, and shows only
// contentPaneFirstPage at start-up.
	static JPanel contentPaneGGZBFWPT = new JPanel();
	static JPanel contentPaneQLM = new JPanel();
	static JPanel contentPaneQGZBTB=new JPanel();
	static JPanel contentPaneSZ=new JPanel();
	static JPanel contentPaneNJ=new JPanel();
	static JPanel contentPaneFirstPage=new JPanel();
/**
 * Program entry point: applies the system look and feel, then opens the start
 * window on the event dispatch thread.
 *
 * @param args command-line arguments (unused)
 * @throws RuntimeException wrapping the cause when the look and feel cannot be applied
 */
public static void main(String[] args) {
	// Look and feel
	try{
		String systemLookAndFeel=UIManager.getSystemLookAndFeelClassName();
		UIManager.setLookAndFeel(systemLookAndFeel);
	}catch(Exception cause){
		throw new RuntimeException(cause);
	}

	// Start page: only the tab page and the public tendering service are shown.
	Runnable openStartWindow=new Runnable() {
		@Override
		public void run() {
			try {
				new JFormFirst().setVisible(true);
			} catch (Exception failure) {
				failure.printStackTrace();
			}
		}
	};
	EventQueue.invokeLater(openStartWindow);
}


/**
 * Builds the start window: frame properties, the shared panels, the URL text
 * box with its "(*)" placeholder button, and the start button.
 *
 * <p>All panels use absolute positioning (null layout). Only
 * {@code contentPaneFirstPage} is visible and installed as the content pane.
 */
public JFormFirst() {
	// Frame properties
	setTitle("Welcom to the World of Spiderman");
	setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
	setBounds(100, 100, 1700, 500);// x, y, width, height

	JPanel[] panels={contentPaneGGZBFWPT,contentPaneQLM,contentPaneQGZBTB,contentPaneSZ,contentPaneNJ,contentPaneFirstPage};
	for(JPanel panel:panels){
		panel.setLayout(null);
		panel.setBorder(new EmptyBorder(5, 5, 5, 5));
		panel.setVisible(panel==contentPaneFirstPage);// only the first page is shown at start-up
	}
	// A frame has exactly one content pane and each call replaces the previous one,
	// so only the panel that has to end up installed is set.
	setContentPane(contentPaneFirstPage);

	TextFieldFirstUrl.setBounds(300, 24, 300, 24);
	contentPaneFirstPage.add(TextFieldFirstUrl);
	TextFieldFirstUrl.setColumns(10);

	// Button that inserts the "(*)" placeholder into the start URL
	JButton ButtonCanshu = new JButton("(*)");
	ButtonCanshu.addMouseListener(new MouseAdapter() {
		@Override
		public void mouseClicked(MouseEvent arg0) {
			// Insert at the caret of what is currently typed in the box. The old code
			// rebuilt the text from the firstUrl field and, with the caret at position 0,
			// replaced the whole text by "(*)".
			String current=TextFieldFirstUrl.getText();
			int pos=Math.min(TextFieldFirstUrl.getCaretPosition(), current.length());
			TextFieldFirstUrl.setText(current.substring(0, pos)+"(*)"+current.substring(pos));
		}
	});
	ButtonCanshu.setBounds(600, 24, 60, 24);
	contentPaneFirstPage.add(ButtonCanshu);

	// Start button
	JButton ButtonFirstPageStart = new JButton("启动");
	ButtonFirstPageStart.addMouseListener(new MouseAdapter() {
		@Override
		public void mouseClicked(MouseEvent arg0){
			// NOTE(review): the calls below run inside the mouse callback, i.e. on the event
			// dispatch thread; if they take long, the window cannot repaint until they return.
			System.out.println("url:"+firstUrl+"\r\n"+"关键字:"+rule+"\r\n"+"请求方式:"+Get+"\r\n"+"起始页:"+startNoPage+"\r\n"+"结束页:"+endNoPage);

			// Literal-first comparisons: an unset getType selects no mode instead of throwing.
			if("全文下载保存txt".equals(getType)){
				boolean parametersOk=FirstSituation.checkFirstSituation(firstUrl,Get,startNoPage,endNoPage,rule);
				if(parametersOk){
					FirstSituation.firstSituation(firstUrl,Get,startNoPage,endNoPage,rule);
				}else{
					System.out.println("任务启动失败!");
				}
			}
			if("关键字段下载保存excel".equals(getType)){
				JsonZhaoBiao2.dateconsole();
				System.out.println("lalalal");
			}
			if("第三种情景".equals(getType)){
				JOptionPane.showMessageDialog(null, "开发未完成,待续");// no parent window
				System.out.println("开发未完成,待续");
			}
			if("第四种情景".equals(getType)){
				JOptionPane.showMessageDialog(null, "开发未完成,待续");// no parent window
				System.out.println("开发未完成,待续");
			}
		}
	});
	ButtonFirstPageStart.setBounds(1250, 48, 100, 24);
	contentPaneFirstPage.add(ButtonFirstPageStart);
}

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值