import requests
import csv
from lxml import etree
import threading
from queue import Queue
flock = threading.Lock()  # module-wide lock: serializes CSV file writes across Save_data threads
# NOTE(review): shared mutable global — Parse_url threads append to this list and
# re-queue it, so it grows forever and batches duplicate earlier rows; looks like
# a bug (a per-page local list is probably intended) — confirm before removing.
items = []
class Save_data(threading.Thread):
    """Writer thread: drains parsed batches from data_queue and appends them to duanzi1.csv.

    Parameters (via __init__):
        url_queue:  queue of URLs still waiting to be crawled (used only to decide when to stop).
        data_queue: queue of batches, each a list of {"作者": ..., "内容": ...} dicts.
        flock:      lock shared by all writer threads, serializing file access.
    """

    def __init__(self, url_queue, data_queue, flock):
        super().__init__()  # threading.Thread must be initialized before use
        self.url_queue = url_queue
        self.data_queue = data_queue
        self.flock = flock

    def run(self):
        """Keep saving batches until both queues are empty, then exit."""
        while True:
            # NOTE(review): this empty/empty check races with parser threads that
            # have dequeued a URL but not yet queued their results — the writer
            # could exit early or block on get(); confirm intended shutdown logic.
            if self.url_queue.empty() and self.data_queue.empty():
                break
            batch = self.data_queue.get()
            self.save_data(batch)

    def save_data(self, items):
        """Append one batch of rows to duanzi1.csv.

        Bug fix: the original opened the file in 'w' mode for every batch, so
        each write overwrote all previous data and only the last batch survived.
        The file is now opened in append mode, and the header row is written
        only when the file is still empty.

        Args:
            items: list of dicts with keys "作者" and "内容".
        """
        fieldnames = ["作者", "内容"]
        with self.flock:  # hold the lock for the whole write so rows don't interleave
            with open('duanzi1.csv', 'a', encoding='utf8', newline='') as fp:
                writer = csv.DictWriter(fp, fieldnames)
                if fp.tell() == 0:  # fresh/empty file: emit the header exactly once
                    writer.writeheader()
                writer.writerows(items)
class Parse_url(threading.Thread):
    """Parser thread: fetches joke-listing pages from url_queue and pushes parsed rows to data_queue.

    Parameters (via __init__):
        url_queue:  queue of page URLs to crawl.
        data_queue: queue receiving one list of {"作者", "内容"} dicts per page.
    """

    def __init__(self, url_queue, data_queue):
        super().__init__()  # threading.Thread must be initialized before use
        self.url_queue = url_queue
        self.data_queue = data_queue

    def run(self):
        """Consume URLs until the URL queue is empty, then exit."""
        while True:
            if self.url_queue.empty():  # nothing left to crawl
                break
            url = self.url_queue.get()
            self.parse_url(url)

    def parse_url(self, url):
        """Download one listing page and queue its (author, content) rows.

        Bug fixes vs. the original:
        - rows are collected in a LOCAL list instead of the module-level
          ``items`` global, so each queued batch contains only this page's rows
          (the global grew forever and was re-queued with duplicates);
        - the loop variable no longer shadows the ``list`` builtin;
        - xpath() text results (lists of strings) are joined into plain strings
          so the CSV cells hold text rather than Python list reprs;
        - a request timeout prevents a dead server from hanging the thread.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
        }
        res = requests.get(url, headers=headers, timeout=10).content.decode()
        html = etree.HTML(res)
        rows = []
        for node in html.xpath('//div[@class="j-r-list"]/ul/li'):
            author = ''.join(node.xpath('.//div[1]/div[2]/a/text()')).strip()
            content = ''.join(node.xpath('.//div[2]/div[1]/a/text()')).strip()
            rows.append({"作者": author, "内容": content})
        self.data_queue.put(rows)
def main():
    """Build the work queues, start parser and writer threads, and wait for completion."""
    url_queue = Queue()
    data_queue = Queue()
    # NOTE(review): range(11) enqueues pages 0-10; page 0 may duplicate page 1
    # on this site — confirm the intended page range.
    for page in range(11):
        url_queue.put('http://www.budejie.com/text/%s' % page)
    workers = []
    for _ in range(5):
        parser = Parse_url(url_queue, data_queue)
        parser.start()
        workers.append(parser)
    for _ in range(5):
        writer = Save_data(url_queue, data_queue, flock)
        writer.start()
        workers.append(writer)
    # Improvement: join every worker so main() returns only after all crawling
    # and writing has finished (previously threads were started and forgotten).
    for worker in workers:
        worker.join()


if __name__ == '__main__':
    main()