用 Java 开发爬虫程序,在便捷程度上很难赶得上主流的 Python 爬虫程序,但是 Java 的用户量还是挺大的,下面介绍一下 Java 爬虫的主要过程:
上菜
package com.shaoyayu.html;
import com.mysql.jdbc.Connection;
import com.mysql.jdbc.PreparedStatement;
import com.shaoyayu.Databaseutil.DbUtil;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.FileWriter;
import java.io.IOException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
/**
* Created by shaoyayu on 2019/4/19.
*/
public class Collegelist {

    /** Result of {@link #jiexi(String)}: the page has no pagination info or reports zero records. */
    private static final int NO_DATA = 0;

    /** Result of {@link #jiexi(String)}: the page just parsed is the last one. */
    private static final int LAST_PAGE = 1;

    /** Result of {@link #jiexi(String)}: more pages follow the one just parsed. */
    private static final int MORE_PAGES = 2;

    /** Number of {@code ?} placeholders in {@link #INSERT_SQL}. */
    private static final int COLUMN_COUNT = 10;

    /** Parameterized insert used for every scraped record. */
    private static final String INSERT_SQL = "INSERT INTO tb_schools(" +
            "presentationUrl,schoolName,officialMicroblog,location,keyDiscipline,category," +
            "master,subjection,doctor,rests) " +
            "values(?,?,?,?,?,?,?,?,?,?)";

    /**
     * Entry point: crawls the college list starting at page 228 and stores each
     * record in the {@code tb_schools} table.
     *
     * <p>Crawl URLs (the page number is appended to the end):
     * <ul>
     *   <li>http://kaoshi.edu.sina.com.cn/college/collegelist/view?provid=&amp;typeid=&amp;pro=&amp;tab=&amp;page=1</li>
     *   <li>http://kaoshi.edu.sina.com.cn/college/collegelist/view?provid=&amp;typeid=&amp;pro=&amp;tab=contact&amp;page=1</li>
     * </ul>
     *
     * @param ages command-line arguments (unused)
     */
    public static void main(String[] ages) {
        aaaa("http://kaoshi.edu.sina.com.cn/college/collegelist/view?provid=&typeid=&pro=&tab=&page=", 228);
    }

    /**
     * Fetches and parses consecutive pages, starting at page {@code i}, until a
     * page reports that it is the last one or yields no data.
     *
     * @param url URL prefix to which the page number is appended
     * @param i   first page number to request
     */
    public static void aaaa(String url, int i) {
        int page = i;
        // Loop instead of recursing: one stack frame per page is unnecessary, and
        // any result other than "more pages" (last page, no data, failed fetch)
        // must end the crawl rather than keep requesting.
        while (jiexi(getHtml(url + page)) == MORE_PAGES) {
            page++;
        }
    }

    /**
     * Downloads the body of {@code url} as a UTF-8 string.
     *
     * @param url address to request with HTTP GET
     * @return the response body when the status code is 200; an empty string
     *         for any other status or when an {@link IOException} occurs
     */
    public static String getHtml(String url) {
        HttpGet get = new HttpGet(url);
        // try-with-resources releases both the client and the response (and with
        // them the underlying connection) on every path.
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse httpResponse = client.execute(get)) {
            if (httpResponse.getStatusLine().getStatusCode() == 200) {
                System.err.println(url);
                System.err.println("请求成功");
                return EntityUtils.toString(httpResponse.getEntity(), "UTF-8");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return "";
    }

    /**
     * Parses one list page, stores every well-formed college record through
     * {@link #inputData(List)} and reports the pagination state.
     *
     * @param html page markup as returned by {@link #getHtml(String)}
     * @return {@code 0} when the page has no {@code div.pageNumWrap} element or
     *         its {@code totalNum} attribute is missing or {@code "0"};
     *         {@code 1} when its {@code totalPage} attribute equals its
     *         {@code page} attribute; {@code 2} otherwise
     */
    public static int jiexi(String html) {
        Document document = Jsoup.parse(html);
        Element pageNumWrap = document.select("div.pageNumWrap").first();
        if (pageNumWrap == null) {
            // Happens for the empty string returned by a failed download.
            System.err.println("No div.pageNumWrap element found; treating page as empty");
            return NO_DATA;
        }
        // attr() never returns null; a missing attribute yields "".
        String totalNum = pageNumWrap.attr("totalNum");
        String totalPage = pageNumWrap.attr("totalPage");
        String page = pageNumWrap.attr("page");
        if (totalNum.isEmpty() || "0".equals(totalNum)) {
            return NO_DATA;
        }

        int stored = 0;
        for (Element collegeInfo : document.select("div.college_info")) {
            try {
                List<String> fields = extractFields(collegeInfo);
                if (fields == null) {
                    System.err.println("Skipping malformed college record");
                    continue;
                }
                for (int index = 0; index < fields.size(); index++) {
                    System.err.println(fields.get(index) + ":" + index);
                }
                inputData(fields);
                System.err.println(stored++);
            } catch (RuntimeException e) {
                // One unparsable record must not abort the rest of the page.
                e.printStackTrace();
            }
        }
        return totalPage.equals(page) ? LAST_PAGE : MORE_PAGES;
    }

    /**
     * Collects the column values of one {@code div.college_info} element in
     * insertion order: link, name, microblog links, the values found in each
     * {@code div.gray}, and finally the {@code span.orange} text.
     *
     * @param collegeInfo element describing a single college
     * @return the extracted values, or {@code null} when the record has no
     *         link, or a {@code div.gray} has fewer than two children or no
     *         value after the colon in its first child
     */
    private static List<String> extractFields(Element collegeInfo) {
        Elements clearfix = collegeInfo.select("div.clearfix");
        Element link = clearfix.select("[target='_blank']").first();
        if (link == null) {
            return null;
        }

        List<String> fields = new ArrayList<>();
        fields.add(link.attr("href").trim());
        fields.add(clearfix.select("h4.college_name").text().trim());

        StringBuilder microblogs = new StringBuilder();
        for (Element weibo : clearfix.select("a[target='_blank'].weibo")) {
            microblogs.append(weibo.attr("href")).append(':').append(weibo.text()).append(':');
        }
        fields.add(microblogs.toString());

        for (Element gray : clearfix.select("div.gray")) {
            Elements rows = gray.children();
            if (rows.size() < 2) {
                return null;
            }
            String first = valueAfterColon(rows.get(0));
            if (first == null) {
                return null;
            }
            fields.add(first);
            // The second value is optional and simply omitted when absent.
            // NOTE(review): omitting it shifts every later value one column to
            // the left in the INSERT - confirm whether a placeholder is wanted.
            String second = valueAfterColon(rows.get(1));
            if (second != null) {
                fields.add(second);
            }
        }

        fields.add(clearfix.select("span.orange").text());
        return fields;
    }

    /**
     * Returns the trimmed text between the first and second {@code ":"} of the
     * element's text, or {@code null} when there is nothing after the first colon.
     */
    private static String valueAfterColon(Element element) {
        String[] parts = element.text().trim().split(":");
        return parts.length > 1 ? parts[1].trim() : null;
    }

    /**
     * Inserts one record into {@code tb_schools}.
     *
     * <p>Values are bound positionally to the columns of {@link #INSERT_SQL}.
     * A record with fewer than {@value #COLUMN_COUNT} values has the remaining
     * columns bound to the empty string; a record with more values is rejected.
     * Failures are printed and do not propagate to the caller.
     *
     * @param Strs column values in insertion order; not modified
     */
    private static void inputData(List<String> Strs) {
        if (Strs.size() > COLUMN_COUNT) {
            System.err.println("Skipping record with " + Strs.size()
                    + " values; at most " + COLUMN_COUNT + " are supported");
            return;
        }
        DbUtil dbUtil = new DbUtil();
        // try-with-resources closes the statement first, then the connection.
        try (Connection connection = dbUtil.getConnection();
             PreparedStatement ps = (PreparedStatement) connection.prepareStatement(INSERT_SQL)) {
            for (int column = 0; column < COLUMN_COUNT; column++) {
                ps.setString(column + 1, column < Strs.size() ? Strs.get(column) : "");
            }
            ps.executeUpdate();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
这是我写的一个测试类。
最后我再写一个线程池的方式,配上递归方式请求分页……
依赖 HttpClient + jsoup,配置一个 Maven 工程就可以跑了。