python同程爬取之sql连接池,多线程锁

本文介绍了使用Python进行同程网数据爬取时,如何利用SQL连接池和多线程技术。通过配置数据库连接,封装线程池和SQL操作,按城市获取品牌ID,并进一步获取酒店ID。在多线程中,关键代码使用锁来保护共享资源,避免并发问题。同时强调了在多线程环境下局部变量的作用域和全局变量的谨慎使用,以确保爬虫的稳定运行。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

由于该项目还在运行当中,公司并未使用代理,此处只贴逻辑,关键代码我会打码,见谅

配置:

from DBUtils.PooledDB import PooledDB###sql连接池的一个包
import requests
import re
import time
import random
import json
import pymysql
from lxml import etree
import threading
b ={"user":"***",
    "passwd":"***",
    "host":"****",
    "db":'*****',
    "port":****,
    "charset":'utf8'####解决中文编码
}
h=0####全局参数,城市数,由于线程并发争抢资源的问题,故需锁住

数据库部分:###init游标,封装线程池,sql查询,sql增删改

class Braud(threading.Thread):###继承线程类
    __pool=None
    _instance_lock = threading.Lock()
    ##初始化一些资源
    def __init__(self, threadID, name, counter):
        threading.Thread.__init__(self)
        ##接下来三个参数应为并没有处理记录线程情况作用不大
        self.threadID = threadID
        self.name = name
        self.counter = counter
        ###时间
        self.indata="2018-09-14"
        self.outdata="2018-09-15"
        #将数据库连接游标初始化,这个游标通过连接池中的连接每一次sql都是不同的连接
        self.coon=Braud.getmysqlconn()
        self.cur=self.coon.cursor(cursor=pymysql.cursors.DictCursor)

    #sql连接池,init游标,此处与游标建立了联系达到全局都可以使用这个池子,即每条sql语句通过游标建立一个单独的连接
    @staticmethod
    def getmysqlconn():
        if Braud.__pool is None:
	        #creater是以哪种包来创建pool,
	        #mincached最少的空闲连接数,如果空闲连接数小于这个数,pool会创建一个新的连接
	        #maxcached最大的空闲连接数,如果空闲连接数大于这个数,pool会关闭空闲连接
	        #当然这个包还有其他的东西:[精炼](https://www.cnblogs.com/Xjng/p/3437694.html)
            __pool=PooledDB(creator=pymysql,mincached=3,maxcached=20,host=b["host"],user=b["user"],passwd=b["passwd"],db=b["db"],port=b["port"],charset=b["charset"])
        return __pool.connection()

    # 插入\更新\删除sql\##此处有个问题就是传一个sql,value是不能直接生成sql语句,所以单独写对应写入函数来拼接sql语句
    def op_insert(self, sql):
        
        print('op_insert', sql)
        insert_num = self.cur.execute(sql)
        print("修改成功")
        self.coon.commit()
        return insert_num
       
    # 查询
    def op_select(self, sql):
        try:
            print('op_select', sql)
            self.cur.execute(sql)  # 执行sql
            select_res = self.cur.fetchall()  # 返回结果为字典
            print("查询成功")
            return select_res
        except pymysql.Error as e:
            self.op_select(sql)

    # 释放资源,此处我是加到所有线程结束,释放,并没有实测(开始是没加此函数)
    def dispose(self):
        self.coon.close()
        self.cur.close()

线程run方法,此方法名称不能改,为线程模块中默认跑线程start只是开启

#多线程:城市每10条一取最大633个
    def run(self):
        global h
        while True:
	        #在这个所得前后必须将终止条件都写一遍,不然就会有几个线程在达到终止条件但并不会停止
            if h >640:
                break
            with Booking._instance_lock:
                x = h
                if h>630:
                    x=633
                h += 10
            if h >640:
                break

            sql = "select name,id,en from ctrip_city_all_true ORDER BY id LIMIT {0},10".format(y)
            ress = self.op_select(sql)
            for res in ress:
                name=res['name']
                cityid=res['id']
                en=res['en']
                # print(name, cityid, en)
                self.braudid(name, cityid, en)

通过城市去找品牌的id,再通过

    def braudid(self,name,cityid,en):
        url_id=en+str(cityid)
        url="*********".format(url_id)
        header={
            "Host":"hotels.ctrip.com",
            "Connection":"keep-alive",
            "Upgrade-Insecure-Requests":"1",
            "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.89 Safari/537.36",
            "Content-Type":"application/x-www-form-urlencoded",
            "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language":"zh-CN,zh;q=0.8",
        }
        data={
            # "NOVIEWSTATE":"%2FwEPDwULLTE5MjE3MzkxNzRkZBVcXgJ7yI9Svz9QuO1U86UTlYjTGcROZNdJwAiK3Son",
            # "__VIEWSTATEGENERATOR":"AEF82FC0",
            "hotelChoose":"on",
            # "CityName":"%E4%B8%8A%E6%B5%B7",
            # "cityId":"2",
            # "cityPY":"shanghai",
            "SSH_CheckIn":self.indata,
            # "HotelCityName":"%E4%B8%8A%E6%B5%B7",
            "checkIn":self.indata,
            "checkOut":self.outdata,
            "hotellevel":"0",
            "keywordNew":"",
            "RoomGuestCount":"1%2C1%2C0",
            "hasListForInitHotHotel":"",
            "_releaseNo_":"2018-09-05-02-02-00",
            "__action":"%2FDomestic%2FShowHotelList.aspx",
            "positionArea":"",
            "positionId":"",
            "keyword":"",
            "requestTravelMoney":"",
            "isfromindex":"1",
        }
        time.sleep(random.choice([2,3]))
        res = requests.post(url, headers=header, data=data)
        response = etree.HTML(res.text)
        #此处其实可以用我有一篇爬虫采坑的博客当中的一处用法:一个for循环加上枚举函数会在大量匹配情况下缩减代码的,当然看具体情况
        all_braudid=response.xpath("//div[@data-brandtype]/@data-value")
        all_braudname=response.xpath("//div[@data-brandtype]/text()")
        #zip函数建议自行百度。很好用
        for l,y in zip(all_braudid,all_braudname):
            self.detail(name,l,y,url_id,cityid,en)

通过页码获取品牌对应的所有酒店id

    def detail(self,name,l,y,url_id,cityid,en):

        """
        :pages:单个品牌酒店总数
        :x:单个品牌酒店的页码数
        :is_running:记录单品牌的页码是否跑完
        :return:
        """
        url="*******"
        header={

            "Host":"hotels.ctrip.com",
            "Connection":"keep-alive",
            # "Content-Length":"2070",
            "Cache-Control":"max-age=0",
            "Origin":"http://hotels.ctrip.com",
            # "If-Modified-Since":"Thu, 01 Jan 1970 00:00:00 GMT",
            "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36",
            "Content-Type":"application/x-www-form-urlencoded; charset=UTF-8",
            "Accept":"*/*",
            "Referer":"http://hotels.ctrip.com/hotel/{0}".format(url_id),
            # "Accept-Encoding":"gzip, deflate",
            "Accept-Language":"zh-CN,zh;q=0.8",
            # "Cookie":"_abtest_userid=fce06a44-b08c-4bfd-a608-48aeb1c95a56; Union=SID=155952&AllianceID=4897&OUID=baidu81|index|||; Session=SmartLinkCode=U155952&SmartLinkKeyWord=&SmartLinkQuary=&SmartLinkHost=&SmartLinkLanguage=zh; traceExt=campaign=CHNbaidu81&adid=index; manualclose=1; HotelCityID=2split%E4%B8%8A%E6%B5%B7splitShanghaisplit2018-09-10split2018-09-11split0; ASP.NET_SessionId=to1zhofrettn2zt4mpad5lbk; OID_ForOnlineHotel=15365435022623io40i1536566681158102002; _RF1=58.216.216.82; _RSG=E7qlj5NbJ60lPuX_hOxb08; _RDG=286ed320a4997a2beb10590d3de6604848; _RGUID=01939571-6dbf-41ce-8025-ed8aff87866f; _ga=GA1.2.405546164.1536543507; _gid=GA1.2.1788700307.1536543507; Mkt_UnionRecord=%5B%7B%22aid%22%3A%224897%22%2C%22timestamp%22%3A1536566704126%7D%5D; __zpspc=9.3.1536566673.1536566704.3%231%7Cbaidu%7Ccpc%7Cbaidu81%7C%25E6%2590%25BA%25E7%25A8%258B%7C%23; _jzqco=%7C%7C%7C%7C1536543507350%7C1.1789717955.1536543507248.1536566683995.1536566704147.1536566683995.1536566704147.undefined.0.0.5.5; MKT_Pagesource=PC; _bfa=1.1536543502262.3io40i.1.1536543502262.1536566670788.2.9; _bfs=1.7; _bfi=p1%3D102002%26p2%3D102002%26v1%3D9%26v2%3D8",
        }
        pages = 1
        x = 1
        is_running=1
        while True:
            data = {
                "__VIEWSTATEGENERATOR":"DB1FBB6D",
                "cityName":name,
                "StartTime":self.indata,
                "DepTime":self.outdata,
                "RoomGuestCount":"1%2C1%2C0",
                "txtkeyword":"",
                "Resource":"",
                "Room":"",
                "Paymentterm":"",
                "BRev":"",
                "Minstate":"",
                "PromoteType":"",
                "PromoteDate":"",
                "operationtype":"NEWHOTELORDER",
                "PromoteStartDate":"",
                "PromoteEndDate":"",
                "OrderID":"",
                "RoomNum":"",
                "IsOnlyAirHotel":"F",
                "cityId":cityid,
                "cityPY":en,
                # "cityCode":"021",
                # "cityLat":"31.2363508011",
                # "cityLng":"121.4802384079",
                "positionArea":"",
                "positionId":"",
                "hotelposition":"",
                "keyword":"",
                "hotelId":"",
                "htlPageView":"0",
                "hotelType":"F",
                "hasPKGHotel":"F",
                "requestTravelMoney":"F",
                "isusergiftcard":"F",
                "useFG":"F",
                "HotelEquipment":"",
                "priceRange":"-2",
                "hotelBrandId":"",
                "promotion":"F",
                "prepay":"F",
                "IsCanReserve":"F",
                "OrderBy":"99",
                "OrderType":"",
                "k1":"",
                "k2":"",
                "CorpPayType":"",
                "viewType":"",
                # "checkIn":"2018-09-10",
                # "checkOut":"2018-09-11",
                "DealSale":"",
                "ulogin":"",
                "hidTestLat":"0%257C0",
                # "AllHotelIds":"428365%252C16197084%252C8052290%252C12603011%252C18431318%252C419612%252C6081776%252C3792196%252C4399431%252C441585%252C8056609%252C17284331%252C436187%252C473770%252C4119594%252C1498006%252C6338488%252C1496646%252C436581%252C2895314%252C12782071%252C397163%252C19454410%252C1064521%252C15018773&psid=&isfromlist=T&ubt_price_key=htl_search_result_promotion&showwindow=&defaultcoupon=&isHuaZhu=False&hotelPriceLow=&htlFrom=hotellist&unBookHotelTraceCode=&showTipFlg=&traceAdContextId=v2_H4sIAAAAAAAAAB3IsQ3CMBAFUNMxAyWiQrH0v3M5n1NQsIgF9lEzCCXLZAKWYBiEX%252Fn22%252Bf1%252Fk6HSyKy1fZsVViUqXJNU545yrCoooyjJF3GUooB%252Bt9jOJ9udLt7R3R9tCgCRKMzqrHnxk6fM3ZruIYf1Q0fxXUAAAA%253D&allianceid=0&sid=0&hotelIds=428365_1_1%2C16197084_2_1%2C8052290_3_1%2C12603011_4_1%2C18431318_5_1%2C419612_6_1%2C6081776_7_1%2C3792196_8_1%2C4399431_9_1%2C441585_10_1%2C8056609_11_1%2C17284331_12_1%2C436187_13_1%2C473770_14_1%2C4119594_15_1%2C1498006_16_1%2C6338488_17_1%2C1496646_18_1%2C436581_19_1%2C2895314_20_1%2C12782071_21_1%2C397163_22_1%2C19454410_23_1%2C1064521_24_1%2C15018773_25_1",
                "markType":"0",
                "zone":"",
                "location":"",
                "feature":"",
                "equip":"",
                "bed":"",
                "breakfast":"",
                "other":"",
                "star":"",
                "sl":"",
                "s":"",
                "l":"",
                "price":"",
                "a":"0",
                "keywordLat":"",
                "keywordLon":"",
                "contrast":"0",
                "PaymentType":"",
                "CtripService":"",
                "promotionf":"",
                "attachDistance":"0",
                "contyped":"0",
                "productcode":"",
                "group":"",
                "type":"",
                "brand":l,
                "page":x,
            }
            time.sleep(random.choice([2,3]))
            res=requests.post(url,headers=header,data=data)
            red=json.loads(res.text)
            hotels=red.get("hotelPositionJSON")
            for hotel in hotels:
                # print(hotels)
                ctripid=hotel.get('id',"0")
                # print(ctripid,l,y,url_id)
                self.braud_insert(ctripid,l,y)

            page = red.get("hotelAmount")
            # 计算出页码,参数i大于计算页码数终止循环
            if page and page > 25:
                pages = page // 25
                page_add = page % 25
                if page_add > 0:
                    pages = pages + 1
            x += 1
            if x > pages:
                is_running = 2
                break
        x=x-1
        self.braud_run_insert(en,y,page,is_running,x)

拼接sql语句

	#此函数记录爬虫状态包括跑了哪个城市。多少页,未执行还是执行完毕
    def braud_run_insert(self,en,y,page,is_running,x):
        sql = "replace into cn_hotel_ctrip_braud_run_state (cityid,braud,pages,is_running,run_page) VALUES ('%s','%s',%s,%s,%s)"%(en,y,page,is_running,x)
        self.op_insert(sql)
    #此函数为写入数据
    def braud_insert(self,ctripid,l,y):

        sql = "replace into cn_hotel_ctrip_braud (ctripid,braudid,braudname) VALUES (%s,%s,'%s')"%(ctripid,l,y)
        self.op_insert(sql)

主函数

if __name__=="__main__":
	#此处没有使用queue是为了简单
    threaList = []
    for i in range(4):
        thread="thread-%s"%i
        threadlist=Braud(i,thread,i)
        threaList.append(threadlist)
    for i in threaList:
        i.start()
    #阻塞这一块了是必须加的不让很容造成线程在共享资源处出问题
    for i in threaList:
        i.join()
    ##这一段说句实话未测试应该是在所有线程结束释放sql资源
    sp=Braud()
    sp.dispose()

总结:其实这篇代码了,还有许多漏洞和低阶写法,但是目前这种状态下,已经足够使用了。最后还有一小处要解释的,在线程当中一般锁,只锁住关键(共享资源)较少行数的代码,不然和单线程并无太大区别。而且了在每个单条线程当中的局部变量是并不会相互争抢的,毕竟是有作用域的。全局参数就不行了,很容易造成重复读取,所以一般全局变量慎用

有错误请及时纠正

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值