With the explosion of information on the internet, web crawling has become a technology that many data companies rely on. There are plenty of mainstream options, for example Java, Groovy, Python, and Nutch, and many other stacks can do the job. I have personally built crawlers in Java and Groovy. Since some readers may never have touched Groovy, a quick note: Groovy is a dynamic scripting language that runs on the JVM (Java Virtual Machine); you can look it up on Baidu Baike for a detailed description.
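As a minimal, hypothetical sketch (the URLs below are placeholders, not part of this project), a few lines of Groovy show the traits that make it convenient for crawler scripts: dynamic typing, closures, and direct use of any Java library on the JVM:

// Groovy adds a .text getter to java.net.URL, so fetching a page is one line
def fetch = { String url -> new URL(url).text }
["http://example.com/a", "http://example.com/b"].each { page ->
    // GString interpolation; no class or main() boilerplate is needed in a script
    println "fetched ${page}: ${fetch(page).length()} chars"
}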
Below is a crawler I built at a former employer to scrape average community (residential compound) prices from a certain website, written in Groovy. Only the core logic is shown; some of the wrapper code is omitted, and all sensitive information has been scrubbed!
package com.crawl
import com.crawl.httpclient.UserAgents
import com.crawl.script.BaseBatch
import com.crawl.script.BatchOwner
import com.crawl.script.Watcher
import com.crawl.script.thirdPriceCrawl.CharsUtils
import com.crawl.script.thirdPriceCrawl.Utils
import com.crawl.script.util.ABuYunUtil
import com.crawl.script.util.HttpClient
import com.crawl.script.util.MongoUtils
import com.mongodb.BasicDBObject
import com.mongodb.DBCollection
import com.mongodb.DBObject
import net.sf.json.JSONObject
import org.apache.commons.lang.StringUtils
import java.util.concurrent.ExecutorService
import java.util.concurrent.Executors
import java.util.concurrent.TimeUnit
import java.util.concurrent.atomic.AtomicInteger
/**
* Web crawler:
* average community-price scraping from a certain website<br>
* http://www.xxx.com/
*
* @author 小辉哥/小辉GE
* <p>
* 2019-08-04 18:22:12
*/
@BatchOwner(Watcher.ryh)
class FJCrawl extends BaseBatch {
// Global retry limit and thread pool
def static final GLOBAL_TRY_TIMES = 100
def static ExecutorService fixedThreadPool
// Constants
def static final GLOBAL_REQUEST_GET = "GET"
def static final GLOBAL_REQUEST_POST = "POST"
def static final GLOBAL_WEBSITE_SOURCE = 'some website'
def static final LOGIN_URL = "http://www.xxx.com/login"
def static final COMMUNITY_SEARCH_URL = "http://xxx.com/addSearch/dataGet"
def static final COMMUNITY_COMID_GUID_URL = 'http://xxx.com/addSearch/getComId'
def static final COMMUNITY_JICHU_DETAILS_URL = 'http://xxx.com/jrgz/getXQJCXX'
def static final S_DATE = Utils.getLastMonth()
// Global counter of total crawl requests
def static final AtomicInteger count = new AtomicInteger(0)
// DBCollection
def static final DBCollection thirdPartyColl = MongoUtils.getMongoDBByAuth().getCollection("third_party_price")
// Registered accounts; the site restricts each user to querying a single city
def static final accountUsers = [
["username": "jenis", "password": "xxxxxxxx", "rememberme": "0", "city": "上海", "mobile": 11111111111],
["username": "ryhjenis", "password": "xxxxxxxx", "rememberme": "0", "city": "北京", "mobile": 22222222222],
// Many more accounts follow here; trimmed for readability
["username": "ryh", "password": "xxxxxxxx", "rememberme": "0", "city": "广州", "mobile": 33333333333]
]
static main(args) {
runBatch(new FJCrawl(), args)
}
@Override
void process(Object args) {
crawlDataSaveThirdPrice()
}
/**
* Initialize the thread pool on every run, so that worker threads are not created
* (and holding memory) as soon as the class is loaded.
* Above 30 threads, many login requests return code 407 or 503, presumably
* throttling on the site's side.
* @return
*/
def static initThreadPool() {
fixedThreadPool = Executors.newFixedThreadPool(30)
}
// Crawl community details + generate third-party average prices
// Testing showed that requests leaving through the same proxy IP all carry identical headers,
// yet different users still receive distinct sessions after login,
// so logins can run in multiple threads without one user's session evicting another's
// Note: the thread pool must not live inside communitySearchAndSave; testing showed the main thread's
// each{} runs concurrently with the async threads, so later cities would find their session already expired
def static crawlDataSaveThirdPrice() {
if (!accountUsers.isEmpty()) {
initThreadPool()
println("crawlDataSaveThirdPrice: start crawling community details + saving third-party average prices")
long start = System.currentTimeMillis()
accountUsers.each { data ->
fixedThreadPool.execute(
new Runnable() {
void run() {
try {
def responseMap = goLogin(data.username, data.password, data.rememberme)
def cookieAuth = goApiLogin(responseMap)
communitySearchAndSave(cookieAuth, data.city)
} catch (Exception e) {
println "用户" + data + "抓取数据时发生异常, 异常信息为:" + e.getMessage()
} finally {
}
}
}
)
}
// Stop accepting new tasks
fixedThreadPool.shutdown()
// Wait up to 5 days for completion
fixedThreadPool.awaitTermination(5, TimeUnit.DAYS)
long end = System.currentTimeMillis()
println "crawlDataSaveThirdPrice 抓取小区详情 + 保存三方均价完成,耗时:" + (end - start)
}
}
/**
* Login
* @param username
* @param password
* @param rememberme
* @return
*/
def static goLogin(username, password, rememberme) {
def responseMap = [:]
try {
// Request headers
Map<String, String> headers = new LinkedHashMap<String, String>()
headers.put("Accept", "application/json, text/javascript, */*; q=0.01")
headers.put("Accept-Encoding", "gzip, deflate")
headers.put("Accept-Language", "zh-CN,zh;q=0.9")
headers.put("User-Agent", UserAgents.get())
headers.put("X-Requested-With", "XMLHttpRequest")
headers.put("Referer", "http://www.xxx.com/")
headers.put("Origin", "http://www.xxx.com")
headers.put("Proxy-Connection", "keep-alive")
// Request params
Map<String, String> params = new LinkedHashMap<String, String>()
params.put("pwd_login_username", username)
params.put("pwd_login_password", password)
params.put("remembermeVal", rememberme)
// POST request
responseMap = isProxyAndTryTimesToRequest(LOGIN_URL, HttpClient.CHARSET, GLOBAL_TRY_TIMES, GLOBAL_REQUEST_POST, params, headers, false)
} catch (Exception e) {
println "goLogin(${username}, ${password}, ${rememberme})发生异常, 异常信息为:" + e.getMessage()
}
return responseMap
}
/**
* Second login step: extract the SESSION from the post-login Set-Cookie header
* @param responseMap
* @return
*/
def static goApiLogin(responseMap) {
def cookieAuth
try {
def response = responseMap.get("response");
if (StringUtils.isNotEmpty(response)) {
JSONObject reJson = JSONObject.fromObject(response)
// On successful login, grab the cookie
if (reJson.success) {
responseMap.get("headerFields").each { k, v ->
if (k && k.equals("Set-Cookie")) {
if (v) {
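// headerFields values are lists; join all Set-Cookie values into a single Cookie header string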
cookieAuth = v.join(';')
}
}
}
}
}
} catch (Exception e) {
println "goApiLogin(${responseMap})发生异常, 异常信息为:" + e.getMessage()
}
println("最终构造cookieAuth为:" + cookieAuth)
return cookieAuth
}
/**
* Fuzzy search:
* simulate community searches against the existing character set
* @param cookieAuth
* @param city
* @return
*/
def static communitySearchAndSave(cookieAuth, city) {
try {
if (StringUtils.isNotEmpty(cookieAuth)) {
// Request headers
Map<String, String> headers = new LinkedHashMap<String, String>()
headers.put("Accept", "application/json, text/javascript, */*; q=0.01")
headers.put("Accept-Encoding", "gzip, deflate")
headers.put("Accept-Language", "zh-CN,zh;q=0.9")
headers.put("User-Agent", UserAgents.get())
headers.put("X-Requested-With", "XMLHttpRequest")
headers.put("Proxy-Connection", "keep-alive")
headers.put("Cookie", cookieAuth)
// Iterate over the candidate search characters
CharsUtils.charsCollect.each { chars ->
try {
// Build the search URL
def url = COMMUNITY_SEARCH_URL + '?q=' + URLEncoder.encode(chars, "utf-8") +
'&limit=150' + '&timestamp=' + System.currentTimeMillis() + '&userInput=' + URLEncoder.encode(chars, "utf-8") +
'&cityName=' + URLEncoder.encode(city, "utf-8") + '&accurateType=1&matchType=1'
// GET request
def responseMap = isProxyAndTryTimesToRequest(url, HttpClient.CHARSET, GLOBAL_TRY_TIMES, GLOBAL_REQUEST_GET, null, headers, false)
// Parse the response: community name, district, community id
def response = responseMap.get("response")
if (StringUtils.isNotEmpty(response)) {
JSONObject reJson = JSONObject.fromObject(response)
if (reJson.success) {
println "城市" + city + ", 当前匹配字符为:“" + chars + "”, 搜索获取小区列表成功, 响应JSON内容为:" + reJson
def residentialList = reJson.data.residentialList
if (residentialList != null && residentialList.size() > 0) {
residentialList.each { data ->
try {
println "匹配搜索解析遍历当前详情为:城市" + city + ", 地区" + data.districtName + ", 小区" + data.residentialName + ", communityId为" + data.communityId
//调用获取小区详情方法
getCommunityComIdByGuid(cookieAuth, city, data.districtName, data.residentialName, data.communityId)
} catch (Exception e) {
println "匹配搜索解析遍历当前详情为:城市" + city + ", 地区" + data.districtName + ", 小区" + data.residentialName + ", 发生异常, 异常信息为:" + e.getMessage()
}
}
}
}
}
} catch (Exception e) {
println "城市" + city + ", 匹配字符“" + chars + "”, 搜索获取小区列表时, 发生异常, 异常信息为:" + e.getMessage()
}
}
}
} catch (Exception e) {
println "communitySearchAndSave(${city}, ${cookieAuth})发生异常, 异常信息为:" + e.getMessage()
}
}
/**
* Fetch the community's COMID
* @param cookieAuth
* @param city
* @param districtName
* @param name
* @param communityId
* @return
*/
def static getCommunityComIdByGuid(cookieAuth, city, districtName, name, communityId) {
try {
// Build the COMID lookup URL
def url = COMMUNITY_COMID_GUID_URL + '?cityName=' + URLEncoder.encode(city, "utf-8") + '&guid=' + communityId
// Request headers
Map<String, String> headers = new LinkedHashMap<String, String>()
headers.put("Accept", "application/json, text/javascript, */*; q=0.01")
headers.put("Accept-Encoding", "gzip, deflate")
headers.put("Accept-Language", "zh-CN,zh;q=0.9")
headers.put("User-Agent", UserAgents.get())
headers.put("X-Requested-With", "XMLHttpRequest")
headers.put("Proxy-Connection", "keep-alive")
headers.put("Cookie", cookieAuth);
//get请求
def responseMap = isProxyAndTryTimesToRequest(url, HttpClient.CHARSET, GLOBAL_TRY_TIMES, GLOBAL_REQUEST_GET, null, headers, false)
// Parse the COMID response
def response = responseMap.get("response")
if (StringUtils.isNotEmpty(response)) {
JSONObject reJson = JSONObject.fromObject(response)
if (reJson.success) {
println "getCommunityComIdByGuid(${city}, ${districtName}, ${name}, ${communityId}, ${cookieAuth})获取小区COMID成功, 响应JSON内容为:" + reJson
def comId = reJson.data.comId
getCommunityDetails(cookieAuth, city, districtName, name, communityId, comId)
}
}
} catch (Exception e) {
println "getCommunityComIdByGuid(${city}, ${districtName}, ${name}, ${communityId}, ${cookieAuth})发生异常, 异常信息为:" + e.getMessage()
}
}
/**
* Fetch community details
* (some communities come back with no detail data)
* @param cookieAuth
* @param city
* @param districtName
* @param name
* @param communityId
* @param comId
* @return
*/
def static getCommunityDetails(cookieAuth, city, districtName, name, communityId, comId) {
try {
// Build the community-details URL
def url = COMMUNITY_JICHU_DETAILS_URL + '?CityName=' + URLEncoder.encode(city, "utf-8") + '&ResidentialAreaID=' + comId
// Request headers
Map<String, String> headers = new LinkedHashMap<String, String>()
headers.put("Accept", "application/json, text/javascript, */*; q=0.01")
headers.put("Accept-Encoding", "gzip, deflate")
headers.put("Accept-Language", "zh-CN,zh;q=0.9")
headers.put("User-Agent", UserAgents.get())
headers.put("X-Requested-With", "XMLHttpRequest")
headers.put("Proxy-Connection", "keep-alive")
headers.put("Cookie", cookieAuth);
//post请求
def responseMap = isProxyAndTryTimesToRequest(url, HttpClient.CHARSET, GLOBAL_TRY_TIMES, GLOBAL_REQUEST_POST, null, headers, false)
// Parse the details response
def response = responseMap.get("response")
if (StringUtils.isNotEmpty(response) && !response.equals("{}")) {
JSONObject reJson = JSONObject.fromObject(response)
println "getCommunityDetails(${city}, ${districtName} ,${name}, ${communityId}, ${comId}, ${cookieAuth})获取小区详情成功, 响应JSON内容为:" + reJson
//此处某网站改版后, 没有返回data和success, 无法用该判断标准(暂无判断标准), 直接调用解析方法
fjParser(url, city, districtName, name, communityId, reJson)
} else {
println "getCommunityDetails(${city}, ${districtName} ,${name}, ${communityId}, ${comId}, ${cookieAuth})获取小区详情为空或者为'{}'"
}
} catch (Exception e) {
println "getCommunityDetails(${city}, ${districtName} ,${name}, ${communityId}, ${comId}, ${cookieAuth})发生异常, 异常信息为:" + e.getMessage()
}
}
/**
* Parser for the site's detail response
* @param url
* @param city
* @param districtName
* @param name
* @param communityId
* @param reJson
* @return
*/
def static fjParser(url, city, districtName, name, communityId, reJson) {
try {
if (reJson != null) {
def region
def avg_price
// Use the district from reJson when present; otherwise fall back to districtName from the previous page
if (StringUtils.isNotEmpty(reJson.DistrictName)) {
region = reJson.DistrictName
} else {
region = districtName
}
// The site's JS currently renders the price directly, but it can also be "not available";
// there is no reliable way to detect that case yet, so wrap the parse in try/catch to keep saving data
try {
avg_price = (int) Double.parseDouble(String.valueOf(reJson.UnitPrice))
} catch (Exception e) {
avg_price = 0
}
DBObject obj = constructThirdPartyPriceInfoData(url, city, region, "", name, "", communityId, avg_price, S_DATE, GLOBAL_WEBSITE_SOURCE, new Date())
saveParserResult(obj)
}
} catch (Exception e) {
println "fangGuGuParser(${url}, ${city}, ${districtName}, ${name}, ${communityId}, ${reJson})发生异常, 异常信息为:" + e.getMessage()
saveAnalyLogs(url, GLOBAL_WEBSITE_SOURCE, e.getMessage())
}
}
/**
* Helper:
* build a third-party average-price document
* @param url
* @param city
* @param region
* @param block
* @param name
* @param address
* @param district_id
* @param avg_price
* @param sdate
* @param source
* @param cdate
* @return
*/
def static constructThirdPartyPriceInfoData(url, city, region, block, name, address, district_id, avg_price, sdate, source, cdate) {
DBObject obj = new BasicDBObject()
obj.put("url", url)
obj.put("city", city)
obj.put("region", region)
obj.put("block", block)
obj.put("name", name)
obj.put("address", address)
obj.put("district_id", district_id)
obj.put("avg_price", avg_price)
obj.put("s_date", sdate)
obj.put("source", source)
obj.put("c_date", cdate)
return obj
}
/**
* Save parsed results into the third_party_price collection
* @param data
* @return
*/
def static saveParserResult(data) {
// Unique key used to check whether the document already exists
DBObject query = new BasicDBObject()
query.put("s_date", data.s_date)
query.put("city", data.city)
query.put("region", data.region)
query.put("url", data.url)
query.put("source", data.source)
query.put("name", data.name)
// Document to insert or update
def update = new BasicDBObject()
update.append('$set', data)
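// upsert=true, multi=false: insert when nothing matches the unique key, otherwise update the single matching document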
thirdPartyColl.update(query, update, true, false)
println('Third-party average price saved to MongoDB, data: ' + data.toString())
}
/**
* Proxied GET/POST request wrapper; the try/catch inside the while loop keeps a failing
* request from aborting the whole city-list crawl.
* Encapsulates the retry mechanism for the problems typically met when scraping the site.
* @param url
* @param charset
* @param tryTimes
* @param requestType
* @param params
* @param headers
* @param isJson
* @return
*/
def static isProxyAndTryTimesToRequest(url, charset, tryTimes, requestType, params, headers, isJson) {
def responseMap = [:]
def _tryTimes_ = 0
// Retry (switching proxies) while the response is empty
while (_tryTimes_ < tryTimes) {
try {
// Attempt counter starts at 1; the first request is not really a retry
println("Request URL: " + url + ", proxy attempt " + (_tryTimes_ + 1))
if (StringUtils.isNotEmpty(requestType) && requestType.equals(GLOBAL_REQUEST_POST)) {
responseMap = ABuYunUtil.doPostResponse(url, params, headers, charset, isJson)
} else {
responseMap = ABuYunUtil.doGetResponse(url, params, headers, charset)
}
def response = responseMap.get("response")
if (StringUtils.isEmpty(response)) {
println('isProxyAndTryTimesToRequest: response is empty')
// Optional sleep between attempts
//Thread.sleep(sleepTimes);
} else {
// Optional sleep between attempts
//Thread.sleep(sleepTimes);
println('isProxyAndTryTimesToRequest: response received, content: ' + response)
break
}
} catch (Exception e) {
println("isProxyAndTryTimesToRequest请求发生异常, 请求url为: " + url + "请求异常信息为:" + e.getMessage())
} finally {
// Count every request made (for later statistics)
println("Total " + GLOBAL_WEBSITE_SOURCE + " crawl requests so far: " + count.incrementAndGet())
_tryTimes_++
}
}
responseMap.put("tryTimes", _tryTimes_)
responseMap.put("url", url)
return responseMap
}
}
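For reference, each record saved to third_party_price by constructThirdPartyPriceInfoData has roughly the following shape. This is a hypothetical sketch: every value below is an illustrative placeholder, and the s_date format is an assumption (the real value comes from Utils.getLastMonth()):

def sample = [
    url        : "http://xxx.com/jrgz/getXQJCXX?CityName=...&ResidentialAreaID=...", // detail request URL (placeholder)
    city       : "上海",               // the city bound to the crawling account
    region     : "SampleDistrict",    // DistrictName from the response, else the district from the search page
    block      : "",                  // always empty in this crawl
    name       : "SampleCommunity",   // residentialName from the search result
    address    : "",                  // always empty in this crawl
    district_id: "hypothetical-guid", // the communityId guid
    avg_price  : 12345,               // parsed UnitPrice, or 0 when it cannot be parsed
    s_date     : "2019-07",           // assumed format of Utils.getLastMonth()
    source     : "some website",
    c_date     : new Date()
]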
The output is not shown here!!!
The code above is for reference only; if anything is off, corrections are welcome!!!
For more posts like this, feel free to follow and contact me. I look forward to exchanging ideas and exploring technology with you all!!!