由于该项目还在运行当中,公司并未使用代理,此处只贴逻辑,关键代码我会打码,见谅
配置:
from DBUtils.PooledDB import PooledDB###sql连接池的一个包
import requests
import re
import time
import random
import json
import pymysql
from lxml import etree
import threading
b ={"user":"***",
"passwd":"***",
"host":"****",
"db":'*****',
"port":****,
"charset":'utf8'####解决中文编码
}
h=0####全局参数,城市数,由于线程并发争抢资源的问题,故需锁住
数据库部分:###init游标,封装线程池,sql查询,sql增删改
class Braud(threading.Thread):###继承线程类
__pool=None
_instance_lock = threading.Lock()
##初始化一些资源
def __init__(self, threadID, name, counter):
threading.Thread.__init__(self)
##接下来三个参数应为并没有处理记录线程情况作用不大
self.threadID = threadID
self.name = name
self.counter = counter
###时间
self.indata="2018-09-14"
self.outdata="2018-09-15"
#将数据库连接游标初始化,这个游标通过连接池中的连接每一次sql都是不同的连接
self.coon=Braud.getmysqlconn()
self.cur=self.coon.cursor(cursor=pymysql.cursors.DictCursor)
#sql连接池,init游标,此处与游标建立了联系达到全局都可以使用这个池子,即每条sql语句通过游标建立一个单独的连接
@staticmethod
def getmysqlconn():
if Braud.__pool is None:
#creater是以哪种包来创建pool,
#mincached最少的空闲连接数,如果空闲连接数小于这个数,pool会创建一个新的连接
#maxcached最大的空闲连接数,如果空闲连接数大于这个数,pool会关闭空闲连接
#当然这个包还有其他的东西:[精炼](https://www.cnblogs.com/Xjng/p/3437694.html)
__pool=PooledDB(creator=pymysql,mincached=3,maxcached=20,host=b["host"],user=b["user"],passwd=b["passwd"],db=b["db"],port=b["port"],charset=b["charset"])
return __pool.connection()
# 插入\更新\删除sql\##此处有个问题就是传一个sql,value是不能直接生成sql语句,所以单独写对应写入函数来拼接sql语句
def op_insert(self, sql):
print('op_insert', sql)
insert_num = self.cur.execute(sql)
print("修改成功")
self.coon.commit()
return insert_num
# 查询
def op_select(self, sql):
try:
print('op_select', sql)
self.cur.execute(sql) # 执行sql
select_res = self.cur.fetchall() # 返回结果为字典
print("查询成功")
return select_res
except pymysql.Error as e:
self.op_select(sql)
# 释放资源,此处我是加到所有线程结束,释放,并没有实测(开始是没加此函数)
def dispose(self):
self.coon.close()
self.cur.close()
线程run方法,此方法名称不能改,为线程模块中默认跑线程start只是开启
#多线程:城市每10条一取最大633个
def run(self):
global h
while True:
#在这个所得前后必须将终止条件都写一遍,不然就会有几个线程在达到终止条件但并不会停止
if h >640:
break
with Booking._instance_lock:
x = h
if h>630:
x=633
h += 10
if h >640:
break
sql = "select name,id,en from ctrip_city_all_true ORDER BY id LIMIT {0},10".format(y)
ress = self.op_select(sql)
for res in ress:
name=res['name']
cityid=res['id']
en=res['en']
# print(name, cityid, en)
self.braudid(name, cityid, en)
通过城市去找品牌的id,再通过
def braudid(self,name,cityid,en):
url_id=en+str(cityid)
url="*********".format(url_id)
header={
"Host":"hotels.ctrip.com",
"Connection":"keep-alive",
"Upgrade-Insecure-Requests":"1",
"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.89 Safari/537.36",
"Content-Type":"application/x-www-form-urlencoded",
"Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language":"zh-CN,zh;q=0.8",
}
data={
# "NOVIEWSTATE":"%2FwEPDwULLTE5MjE3MzkxNzRkZBVcXgJ7yI9Svz9QuO1U86UTlYjTGcROZNdJwAiK3Son",
# "__VIEWSTATEGENERATOR":"AEF82FC0",
"hotelChoose":"on",
# "CityName":"%E4%B8%8A%E6%B5%B7",
# "cityId":"2",
# "cityPY":"shanghai",
"SSH_CheckIn":self.indata,
# "HotelCityName":"%E4%B8%8A%E6%B5%B7",
"checkIn":self.indata,
"checkOut":self.outdata,
"hotellevel":"0",
"keywordNew":"",
"RoomGuestCount":"1%2C1%2C0",
"hasListForInitHotHotel":"",
"_releaseNo_":"2018-09-05-02-02-00",
"__action":"%2FDomestic%2FShowHotelList.aspx",
"positionArea":"",
"positionId":"",
"keyword":"",
"requestTravelMoney":"",
"isfromindex":"1",
}
time.sleep(random.choice([2,3]))
res = requests.post(url, headers=header, data=data)
response = etree.HTML(res.text)
#此处其实可以用我有一篇爬虫采坑的博客当中的一处用法:一个for循环加上枚举函数会在大量匹配情况下缩减代码的,当然看具体情况
all_braudid=response.xpath("//div[@data-brandtype]/@data-value")
all_braudname=response.xpath("//div[@data-brandtype]/text()")
#zip函数建议自行百度。很好用
for l,y in zip(all_braudid,all_braudname):
self.detail(name,l,y,url_id,cityid,en)
通过页码获取品牌对应的所有酒店id
def detail(self,name,l,y,url_id,cityid,en):
"""
:pages:单个品牌酒店总数
:x:单个品牌酒店的页码数
:is_running:记录单品牌的页码是否跑完
:return:
"""
url="*******"
header={
"Host":"hotels.ctrip.com",
"Connection":"keep-alive",
# "Content-Length":"2070",
"Cache-Control":"max-age=0",
"Origin":"http://hotels.ctrip.com",
# "If-Modified-Since":"Thu, 01 Jan 1970 00:00:00 GMT",
"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36",
"Content-Type":"application/x-www-form-urlencoded; charset=UTF-8",
"Accept":"*/*",
"Referer":"http://hotels.ctrip.com/hotel/{0}".format(url_id),
# "Accept-Encoding":"gzip, deflate",
"Accept-Language":"zh-CN,zh;q=0.8",
# "Cookie":"_abtest_userid=fce06a44-b08c-4bfd-a608-48aeb1c95a56; Union=SID=155952&AllianceID=4897&OUID=baidu81|index|||; Session=SmartLinkCode=U155952&SmartLinkKeyWord=&SmartLinkQuary=&SmartLinkHost=&SmartLinkLanguage=zh; traceExt=campaign=CHNbaidu81&adid=index; manualclose=1; HotelCityID=2split%E4%B8%8A%E6%B5%B7splitShanghaisplit2018-09-10split2018-09-11split0; ASP.NET_SessionId=to1zhofrettn2zt4mpad5lbk; OID_ForOnlineHotel=15365435022623io40i1536566681158102002; _RF1=58.216.216.82; _RSG=E7qlj5NbJ60lPuX_hOxb08; _RDG=286ed320a4997a2beb10590d3de6604848; _RGUID=01939571-6dbf-41ce-8025-ed8aff87866f; _ga=GA1.2.405546164.1536543507; _gid=GA1.2.1788700307.1536543507; Mkt_UnionRecord=%5B%7B%22aid%22%3A%224897%22%2C%22timestamp%22%3A1536566704126%7D%5D; __zpspc=9.3.1536566673.1536566704.3%231%7Cbaidu%7Ccpc%7Cbaidu81%7C%25E6%2590%25BA%25E7%25A8%258B%7C%23; _jzqco=%7C%7C%7C%7C1536543507350%7C1.1789717955.1536543507248.1536566683995.1536566704147.1536566683995.1536566704147.undefined.0.0.5.5; MKT_Pagesource=PC; _bfa=1.1536543502262.3io40i.1.1536543502262.1536566670788.2.9; _bfs=1.7; _bfi=p1%3D102002%26p2%3D102002%26v1%3D9%26v2%3D8",
}
pages = 1
x = 1
is_running=1
while True:
data = {
"__VIEWSTATEGENERATOR":"DB1FBB6D",
"cityName":name,
"StartTime":self.indata,
"DepTime":self.outdata,
"RoomGuestCount":"1%2C1%2C0",
"txtkeyword":"",
"Resource":"",
"Room":"",
"Paymentterm":"",
"BRev":"",
"Minstate":"",
"PromoteType":"",
"PromoteDate":"",
"operationtype":"NEWHOTELORDER",
"PromoteStartDate":"",
"PromoteEndDate":"",
"OrderID":"",
"RoomNum":"",
"IsOnlyAirHotel":"F",
"cityId":cityid,
"cityPY":en,
# "cityCode":"021",
# "cityLat":"31.2363508011",
# "cityLng":"121.4802384079",
"positionArea":"",
"positionId":"",
"hotelposition":"",
"keyword":"",
"hotelId":"",
"htlPageView":"0",
"hotelType":"F",
"hasPKGHotel":"F",
"requestTravelMoney":"F",
"isusergiftcard":"F",
"useFG":"F",
"HotelEquipment":"",
"priceRange":"-2",
"hotelBrandId":"",
"promotion":"F",
"prepay":"F",
"IsCanReserve":"F",
"OrderBy":"99",
"OrderType":"",
"k1":"",
"k2":"",
"CorpPayType":"",
"viewType":"",
# "checkIn":"2018-09-10",
# "checkOut":"2018-09-11",
"DealSale":"",
"ulogin":"",
"hidTestLat":"0%257C0",
# "AllHotelIds":"428365%252C16197084%252C8052290%252C12603011%252C18431318%252C419612%252C6081776%252C3792196%252C4399431%252C441585%252C8056609%252C17284331%252C436187%252C473770%252C4119594%252C1498006%252C6338488%252C1496646%252C436581%252C2895314%252C12782071%252C397163%252C19454410%252C1064521%252C15018773&psid=&isfromlist=T&ubt_price_key=htl_search_result_promotion&showwindow=&defaultcoupon=&isHuaZhu=False&hotelPriceLow=&htlFrom=hotellist&unBookHotelTraceCode=&showTipFlg=&traceAdContextId=v2_H4sIAAAAAAAAAB3IsQ3CMBAFUNMxAyWiQrH0v3M5n1NQsIgF9lEzCCXLZAKWYBiEX%252Fn22%252Bf1%252Fk6HSyKy1fZsVViUqXJNU545yrCoooyjJF3GUooB%252Bt9jOJ9udLt7R3R9tCgCRKMzqrHnxk6fM3ZruIYf1Q0fxXUAAAA%253D&allianceid=0&sid=0&hotelIds=428365_1_1%2C16197084_2_1%2C8052290_3_1%2C12603011_4_1%2C18431318_5_1%2C419612_6_1%2C6081776_7_1%2C3792196_8_1%2C4399431_9_1%2C441585_10_1%2C8056609_11_1%2C17284331_12_1%2C436187_13_1%2C473770_14_1%2C4119594_15_1%2C1498006_16_1%2C6338488_17_1%2C1496646_18_1%2C436581_19_1%2C2895314_20_1%2C12782071_21_1%2C397163_22_1%2C19454410_23_1%2C1064521_24_1%2C15018773_25_1",
"markType":"0",
"zone":"",
"location":"",
"feature":"",
"equip":"",
"bed":"",
"breakfast":"",
"other":"",
"star":"",
"sl":"",
"s":"",
"l":"",
"price":"",
"a":"0",
"keywordLat":"",
"keywordLon":"",
"contrast":"0",
"PaymentType":"",
"CtripService":"",
"promotionf":"",
"attachDistance":"0",
"contyped":"0",
"productcode":"",
"group":"",
"type":"",
"brand":l,
"page":x,
}
time.sleep(random.choice([2,3]))
res=requests.post(url,headers=header,data=data)
red=json.loads(res.text)
hotels=red.get("hotelPositionJSON")
for hotel in hotels:
# print(hotels)
ctripid=hotel.get('id',"0")
# print(ctripid,l,y,url_id)
self.braud_insert(ctripid,l,y)
page = red.get("hotelAmount")
# 计算出页码,参数i大于计算页码数终止循环
if page and page > 25:
pages = page // 25
page_add = page % 25
if page_add > 0:
pages = pages + 1
x += 1
if x > pages:
is_running = 2
break
x=x-1
self.braud_run_insert(en,y,page,is_running,x)
拼接sql语句
#此函数记录爬虫状态包括跑了哪个城市。多少页,未执行还是执行完毕
def braud_run_insert(self,en,y,page,is_running,x):
sql = "replace into cn_hotel_ctrip_braud_run_state (cityid,braud,pages,is_running,run_page) VALUES ('%s','%s',%s,%s,%s)"%(en,y,page,is_running,x)
self.op_insert(sql)
#此函数为写入数据
def braud_insert(self,ctripid,l,y):
sql = "replace into cn_hotel_ctrip_braud (ctripid,braudid,braudname) VALUES (%s,%s,'%s')"%(ctripid,l,y)
self.op_insert(sql)
主函数
if __name__=="__main__":
#此处没有使用queue是为了简单
threaList = []
for i in range(4):
thread="thread-%s"%i
threadlist=Braud(i,thread,i)
threaList.append(threadlist)
for i in threaList:
i.start()
#阻塞这一块了是必须加的不让很容造成线程在共享资源处出问题
for i in threaList:
i.join()
##这一段说句实话未测试应该是在所有线程结束释放sql资源
sp=Braud()
sp.dispose()
总结:其实这篇代码了,还有许多漏洞和低阶写法,但是目前这种状态下,已经足够使用了。最后还有一小处要解释的,在线程当中一般锁,只锁住关键(共享资源)较少行数的代码,不然和单线程并无太大区别。而且了在每个单条线程当中的局部变量是并不会相互争抢的,毕竟是有作用域的。全局参数就不行了,很容易造成重复读取,所以一般全局变量慎用
有错误请及时纠正