Crawl jokes from the Budejie (百思不得姐) text section with multiple threads and save the content to a CSV file.

Two thread pools cooperate through queues: Parse_url threads take page URLs from url_queue, extract each joke's author and content, and push one batch of rows per page onto data_queue; Save_data threads take those batches and append them to duanzi1.csv, holding a lock while writing so threads do not interleave in the file.

import requests
import csv
from lxml import etree
import threading
from queue import Queue
flock = threading.Lock()  # a lock that serializes writes to the CSV file
class Save_data(threading.Thread):
    def __init__(self, url_queue, data_queue, flock):
        super().__init__()  # the parent class's __init__ must run first
        self.url_queue = url_queue
        self.data_queue = data_queue
        self.flock = flock
    def run(self):
        while True:
            # stop when no URLs remain and no parsed batches are waiting;
            # this check can race with the parser threads (see the sentinel sketch after the listing)
            if self.url_queue.empty() and self.data_queue.empty():
                break
            items = self.data_queue.get()
            self.save_data(items)
    def save_data(self, items):
        title = ["作者", "内容"]
        self.flock.acquire()  # take the lock before touching the file
        # append mode, so each batch is added to the file instead of overwriting it
        with open('duanzi1.csv', 'a', encoding='utf8', newline='') as fp:
            writer = csv.DictWriter(fp, title)  # write rows as dictionaries
            if fp.tell() == 0:  # write the header row only once, while the file is still empty
                writer.writeheader()
            writer.writerows(items)
        self.flock.release()  # release the lock once the write is done
class Parse_url(threading.Thread):
    def __init__(self, url_queue, data_queue):
        super().__init__()  # the parent class's __init__ must run first
        self.url_queue = url_queue
        self.data_queue = data_queue
    def run(self):
        while True:
            if self.url_queue.empty():  # no URLs left to crawl, so this thread is done
                break
            url = self.url_queue.get()
            self.parse_url(url)
    def parse_url(self, url):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
        }
        res = requests.get(url, headers=headers).content.decode()
        text = etree.HTML(res)
        lists = text.xpath('//div[@class="j-r-list"]/ul/li')
        items = []  # collect this page's rows locally instead of in a shared global list
        for li in lists:  # `li` avoids shadowing the built-in name `list`
            zuozhe = li.xpath('.//div[1]/div[2]/a/text()')
            neirong = li.xpath('.//div[2]/div[1]/a/text()')
            # xpath() returns a list, so take the first match and strip whitespace
            item = {
                "作者": zuozhe[0].strip() if zuozhe else "",
                "内容": neirong[0].strip() if neirong else ""
            }
            items.append(item)
        self.data_queue.put(items)  # hand the whole page's rows to the saver threads
def main():
    url_queue = Queue()
    data_queue = Queue()
    for x in range(1, 11):  # pages 1-10; the site's page numbering starts at 1, so skip page 0
        url = 'http://www.budejie.com/text/%s' % x
        url_queue.put(url)
    for x in range(5):  # five parser (producer) threads
        t = Parse_url(url_queue, data_queue)
        t.start()
    for x in range(5):  # five saver (consumer) threads
        t1 = Save_data(url_queue, data_queue, flock)
        t1.start()
if __name__ == '__main__':
    main()
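
One caveat in the code above: a Save_data thread decides to exit by checking that both queues are empty, but that check can fire at a moment when the parsers have drained url_queue and have not yet pushed their results onto data_queue, so batches can be lost. A common fix is to join the parser threads first and then feed each saver thread a sentinel ("poison pill"). The following is a minimal sketch of that pattern, not the original post's code; the parser and saver bodies are stand-ins for the real parse_url and save_data logic.

import threading
from queue import Queue, Empty

def parser(url_queue, data_queue):
    while True:
        try:
            url = url_queue.get_nowait()  # non-blocking; an empty queue ends this thread
        except Empty:
            break
        # stand-in for the real page fetch and xpath parsing
        data_queue.put([{"作者": "demo", "内容": "parsed %s" % url}])

def saver(data_queue):
    while True:
        batch = data_queue.get()  # blocks until a batch or a sentinel arrives
        if batch is None:  # poison pill: no more data will ever come
            break
        print(batch)  # stand-in for the locked CSV write

def main():
    url_queue, data_queue = Queue(), Queue()
    for x in range(1, 11):
        url_queue.put('http://www.budejie.com/text/%s' % x)
    parsers = [threading.Thread(target=parser, args=(url_queue, data_queue)) for _ in range(5)]
    savers = [threading.Thread(target=saver, args=(data_queue,)) for _ in range(5)]
    for t in parsers + savers:
        t.start()
    for t in parsers:
        t.join()  # every page has been parsed and queued by this point
    for _ in savers:
        data_queue.put(None)  # one sentinel per saver thread
    for t in savers:
        t.join()

if __name__ == '__main__':
    main()

Joining the parsers before queuing the sentinels guarantees that every parsed batch is already on data_queue when the pills arrive, so no batch is dropped and no thread blocks forever.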
