写写自己在比赛项目开发中学到的爬虫,从12306开始。
要做一个爬虫,一定要会看网页的源代码,并学会用浏览器抓包。一般情况下先抓包,看返回的内容里有没有自己想要的数据:如果接口直接返回json数据就方便很多,只需根据url规则拼接链接,再用json库解析返回的数据即可,不需要使用jsoup解析HTML。12306返回的就是json数据。
可以复制链接地址出来:
https://kyfw.12306.cn/otn/leftTicket/query?leftTicketDTO.train_date=2017-12-06&leftTicketDTO.from_station=SHH&leftTicketDTO.to_station=CSQ&purpose_codes=ADULT
注:12306的这个url会发生改变,所以爬虫代码里的这个url需要更新。
看到这个url,就很明了,我们需要传入的参数leftTicketDTO.train_date、leftTicketDTO.from_station、leftTicketDTO.to_station。多看几条就知道purpose_codes这个参数的值总是ADULT。
现在就是需要获得各个站点的编码:https://kyfw.12306.cn/otn/resources/js/framework/station_name.js?station_version=1.9030
我使用的是sql server数据库,数据库名称:TPS,用户:sa,密码:123456
数据库配置文件(文件名需为datebaseconfig.xml,放在classpath根目录下,与DBUtil中读取的文件名保持一致):
<?xml version="1.0" encoding="UTF-8"?>
<connections>
<connection>
<classname>com.microsoft.sqlserver.jdbc.SQLServerDriver</classname>
<url>jdbc:sqlserver://localhost:1433;databaseName=TPS</url>
<user>sa</user>
<password>123456</password>
</connection>
</connections>
数据库连接类:
package com.util;
import java.io.File;
import java.sql.Connection;
import java.sql.DriverManager;
import java.util.Iterator;
import org.dom4j.Document;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;
/**
 * Opens and closes the JDBC connection described by the classpath resource
 * {@code datebaseconfig.xml}.
 *
 * <p>The configuration file is expected to have a root element whose children each
 * carry {@code classname}, {@code url}, {@code user} and {@code password} child
 * elements. When several children are present, the values of the last one are used.
 */
public class DBUtil {
    /** Connection most recently opened by {@link #getConn()}; {@code null} until the first successful call. */
    public static Connection conn;

    /**
     * Reads the connection settings from {@code datebaseconfig.xml}, loads the JDBC
     * driver class and opens a new connection, which is also stored in {@link #conn}.
     *
     * @return the newly opened connection, or {@code null} if the configuration
     *         could not be found or read, or the connection could not be established
     */
    public static Connection getConn() {
        String className = null;
        String url = null;
        String user = null;
        String password = null;
        try {
            // Hand the resource URL straight to dom4j instead of going through
            // new File(url.getFile()): getFile() keeps percent-encoding, so paths that
            // contain spaces or non-ASCII characters would not resolve, and resources
            // packed inside a jar are not files at all.
            java.net.URL configUrl = DBUtil.class.getClassLoader()
                    .getResource("datebaseconfig.xml");
            if (configUrl == null) {
                System.err.println("datebaseconfig.xml not found on classpath");
                return null;
            }
            SAXReader reader = new SAXReader();
            Document doc = reader.read(configUrl);
            Element root = doc.getRootElement();
            Iterator<Element> it = root.elementIterator();
            while (it.hasNext()) {
                // One child element of the root holds one set of connection settings.
                Element connection = it.next();
                className = connection.elementText("classname");
                url = connection.elementText("url");
                user = connection.elementText("user");
                password = connection.elementText("password");
            }
            if (className == null || url == null) {
                System.err.println("datebaseconfig.xml does not define classname/url");
                return null;
            }
            Class.forName(className);
            conn = DriverManager.getConnection(url, user, password);
            return conn;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Closes the connection stored in {@link #conn}, if there is one.
     * Failures while closing are printed and otherwise ignored.
     */
    public static void CloseConn() {
        if (conn == null) {
            // Nothing was ever opened (or getConn() failed); avoid a NullPointerException.
            return;
        }
        try {
            conn.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
忽略SSL证书校验的网络连接类:
package com.util;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLSession;
//网络连接类
/**
 * Minimal HTTP(S) GET helper that disables certificate and host name verification.
 *
 * <p>NOTE(review): {@link #get(String, String)} installs a trust-all socket factory and
 * an accept-all host name verifier as the JVM-wide {@link HttpsURLConnection} defaults.
 * That affects every HTTPS connection made afterwards through {@code HttpsURLConnection}
 * and removes protection against man-in-the-middle attacks. This is kept because it is
 * the stated purpose of the class, but it should not be used for sensitive traffic.
 */
public class GetNetUtil {
    /**
     * Installs an {@code SSLContext} whose only trust manager is {@link miTM} as the
     * default socket factory for {@link HttpsURLConnection}.
     */
    private static void trustAllHttpsCertificates() throws Exception {
        javax.net.ssl.TrustManager[] trustAllCerts = new javax.net.ssl.TrustManager[1];
        javax.net.ssl.TrustManager tm = new miTM();
        trustAllCerts[0] = tm;
        javax.net.ssl.SSLContext sc = javax.net.ssl.SSLContext.getInstance("SSL");
        sc.init(null, trustAllCerts, null);
        javax.net.ssl.HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory());
    }

    /** Trust manager that accepts every client and server certificate chain without checking it. */
    static class miTM implements javax.net.ssl.TrustManager, javax.net.ssl.X509TrustManager {
        /**
         * @return an empty array; the interface contract requires a non-null result
         */
        @Override
        public java.security.cert.X509Certificate[] getAcceptedIssuers() {
            return new java.security.cert.X509Certificate[0];
        }

        /** Always reports the server chain as trusted. */
        public boolean isServerTrusted(
                java.security.cert.X509Certificate[] certs) {
            return true;
        }

        /** Always reports the client chain as trusted. */
        public boolean isClientTrusted(
                java.security.cert.X509Certificate[] certs) {
            return true;
        }

        /** Accepts any server chain: never throws. */
        @Override
        public void checkServerTrusted(
                java.security.cert.X509Certificate[] certs, String authType)
                throws java.security.cert.CertificateException {
            // Intentionally empty: no validation is performed.
        }

        /** Accepts any client chain: never throws. */
        @Override
        public void checkClientTrusted(
                java.security.cert.X509Certificate[] certs, String authType)
                throws java.security.cert.CertificateException {
            // Intentionally empty: no validation is performed.
        }
    }

    /**
     * Performs a GET request and returns the response body as text.
     *
     * @param urlAll
     *            the full request URL
     * @param charset
     *            name of the character encoding used to decode the response
     * @return the response body with every line terminated by {@code "\r\n"}, or
     *         {@code null} if anything fails (the exception is printed)
     */
    public static String get(String urlAll, String charset) {
        String result = null;
        StringBuilder sbf = new StringBuilder();
        // Pretend to be a desktop browser.
        String userAgent = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36";
        HttpURLConnection connection = null;
        try {
            trustAllHttpsCertificates();
            HostnameVerifier hv = new HostnameVerifier() {
                @Override
                public boolean verify(String urlHostName, SSLSession session) {
                    System.out.println("Warning: URL Host: " + urlHostName
                            + " vs. " + session.getPeerHost());
                    return true;
                }
            };
            HttpsURLConnection.setDefaultHostnameVerifier(hv);
            URL url = new URL(urlAll);
            connection = (HttpURLConnection) url.openConnection();
            connection.setRequestMethod("GET");
            connection.setReadTimeout(30000);
            connection.setConnectTimeout(30000);
            connection.setRequestProperty("User-agent", userAgent);
            connection.connect();
            // try-with-resources closes the stream and reader even when reading fails.
            try (InputStream is = connection.getInputStream();
                    BufferedReader reader = new BufferedReader(new InputStreamReader(is, charset))) {
                String strRead;
                while ((strRead = reader.readLine()) != null) {
                    sbf.append(strRead);
                    sbf.append("\r\n");
                }
            }
            result = sbf.toString();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
        return result;
    }
}
解析12306返回数据:
package com.util;
import java.util.ArrayList;
import java.util.List;
import net.sf.json.JSONArray;
import net.sf.json.JSONObject;
import com.dao.StationDAO;
/**
 * Queries the 12306 left-ticket endpoint and splits each returned record into fields.
 */
public class Json12306Util {
    /**
     * Looks up the station codes for the two station names, requests the left-ticket
     * query URL for the given date and parses the JSON response.
     *
     * <p>The response is only processed when it contains {@code httpstatus} equal to
     * {@code 200}. Each entry of {@code data.result} is a string whose fields are
     * separated by {@code |}; every entry and each of its fields is printed to
     * standard output.
     *
     * @param startCity departure station name as stored in the station table
     * @param arrCity   arrival station name as stored in the station table
     * @param date      travel date inserted verbatim into the request URL
     * @return a list with one {@code String[]} of fields per result entry; empty when a
     *         station is unknown, the request fails or the response is not usable
     */
    public List json12306(String startCity, String arrCity,
            String date) {
        List<String[]> trains = new ArrayList<String[]>();
        try {
            StationDAO stationDAO = new StationDAO();
            String startScode = stationDAO.findScodeBySname(startCity);
            String arrScode = stationDAO.findScodeBySname(arrCity);
            if (startScode == null || arrScode == null) {
                // Without both codes the URL would contain the literal text "null".
                System.err.println("Station code not found for: "
                        + (startScode == null ? startCity : arrCity));
                return trains;
            }
            String charset = "UTF-8";
            String urlname = "https://kyfw.12306.cn/otn/leftTicket/query?leftTicketDTO.train_date="
                    + date
                    + "&leftTicketDTO.from_station="
                    + startScode
                    + "&leftTicketDTO.to_station="
                    + arrScode
                    + "&purpose_codes=ADULT";
            System.out.println(urlname);
            String jsonResult = GetNetUtil.get(urlname, charset);
            System.out.println(jsonResult);
            if (jsonResult == null) {
                // GetNetUtil.get signals any failure by returning null.
                return trains;
            }
            JSONObject obj = JSONObject.fromObject(jsonResult);
            if (!obj.containsKey("httpstatus")) {
                return trains;
            }
            String status = obj.getString("httpstatus");
            System.out.println("连接状况码:" + status);
            if (!"200".equals(status)) {
                return trains;
            }
            // Walk the JSON array directly rather than slicing its string form on
            // commas, which would break as soon as a record itself contained a comma.
            JSONObject data = obj.getJSONObject("data");
            JSONArray result = data.getJSONArray("result");
            for (int i = 0; i < result.size(); i++) {
                String record = result.getString(i);
                System.out.println(record);
                String[] fields = record.split("\\|");
                for (String field : fields) {
                    System.out.println(field);
                }
                trains.add(fields);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return trains;
    }
}
站点数据库操作类,需要在数据库中建立站点表(station),将12306站点(sname)与编码(scode)数据保存:
package com.dao;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import com.util.DBUtil;
/**
 * Data access for the {@code station} table, which maps station names
 * ({@code sname}) to station codes ({@code scode}).
 */
public class StationDAO {
    /**
     * Looks up the station code for a station name.
     *
     * @param sName station name to match against the {@code sname} column
     * @return the {@code scode} of the matching row (the last one if several match),
     *         or {@code null} if there is no match, no connection could be obtained,
     *         or the query fails (the exception is printed)
     */
    public String findScodeBySname(String sName) {
        String s = null;
        String sql = "select a.scode from station a where a.sname=?";
        // try-with-resources releases the connection, statement and result set even
        // when the query throws; a null connection is tolerated by the construct.
        try (Connection conn = DBUtil.getConn()) {
            if (conn == null) {
                // DBUtil.getConn() returns null when it cannot connect.
                return null;
            }
            try (PreparedStatement ps = conn.prepareStatement(sql)) {
                ps.setString(1, sName);
                try (ResultSet rs = ps.executeQuery()) {
                    while (rs.next()) {
                        s = rs.getString("scode");
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return s;
    }
}
测试类:
import java.util.Scanner;
import com.util.Json12306Util;
/**
 * Console entry point: asks for departure station, arrival station and date,
 * then runs the 12306 query with those three values.
 */
public class Test {
    /** Prints {@code prompt} on its own line and returns the next token read from {@code in}. */
    private static String ask(Scanner in, String prompt) {
        System.out.println(prompt);
        return in.next();
    }

    public static void main(String[] args) {
        Json12306Util crawler = new Json12306Util();
        Scanner in = new Scanner(System.in);
        String from = ask(in, "请输入出发城市,例如:北京");
        String to = ask(in, "请输入到达城市,例如:天津");
        String day = ask(in, "请输入出发日期:例如:2017-12-10");
        crawler.json12306(from, to, day);
    }
}
运行结果:
程序可能有考虑不全面的地方,或者存在bug,欢迎大家指正。
代码已经上传,大家可以下载:
http://download.youkuaiyun.com/download/qq_34075012/10117428