4.8学习笔记
一、复习
from threading import Thread
import time
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor
from threading import current_thread
# 1.直接通过Thread创建子线程
# Thread(target=需要在子线程中调用的函数,args=(函数的实参列表))
def download(url):
    """Pretend to download *url*; sleeps 1s to stand in for network transfer."""
    # Show which (pool) thread picked up this task.
    worker = current_thread()
    print(f'=========={worker}=========')
    started = datetime.now()
    print(f'{url}开始下载:{started}')
    time.sleep(1)
    finished = datetime.now()
    print(f'{url}下载结束:{finished}')
# 1) 在主线程中下载三个电影:需要时间是三个电影的时间的叠加
# download('肖生克的救赎')
# download('摔跤吧,爸爸')
# download('霸王别姬')
# 2) 多线程下载三个电影
# t1 = Thread(target=download, args=('肖生克的救赎',))
# t2 = Thread(target=download, args=('摔跤吧,爸爸', ))
# t3 = Thread(target=download, args=('霸王别姬',))
# t1.start()
# t2.start()
# t3.start()
# 3) Thread pool: at most 2 downloads run at the same time.
# The `with` block is equivalent to calling pool.shutdown() (wait=True) at the end.
with ThreadPoolExecutor(max_workers=2) as pool:
    # a. Submit a single task (any callable can be the task).
    pool.submit(download, '肖生克的救赎')
    # b. Submit a batch of tasks in one call.
    pool.map(download, ['霸王别姬', '阿甘正传', '这个杀手不太冷', 'V字仇杀队', '沉默的羔羊'])
二、新知识
1.创建线程子类
from threading import Thread
from datetime import datetime
import time
class DownloadThread(Thread):
    """Thread subclass whose run() simulates downloading one movie."""

    def __init__(self, url):
        super().__init__()
        # Movie name this worker thread is responsible for.
        self.url = url

    def run(self) -> None:
        # Executed in the child thread once start() is called.
        print(f'{self.url}开始下载:{datetime.now()}')
        time.sleep(1)  # fake transfer time
        print(f'{self.url}下载结束:{datetime.now()}')
# Build one worker per movie, then launch all three concurrently.
t1, t2, t3 = (DownloadThread(movie)
              for movie in ('肖生克的救赎', '阿甘正传', '霸王别姬'))
for worker in (t1, t2, t3):
    worker.start()
2.数据分析岗位爬取
import requests
from re import search
import json
from concurrent.futures import ThreadPoolExecutor
from threading import Thread
from datetime import datetime
import csv
# 多线程的Queue
from queue import Queue
def an_data(html: str):
    """Extract the job list from a 51job search-result page.

    The page embeds its results as a JSON object assigned to
    ``window.__SEARCH_RESULT__`` inside a <script> tag; this pulls that
    object out with a regex and parses it.

    :param html: raw HTML of one search-result page
    :return: the list stored under the 'engine_search_result' key
    """
    # (?s) lets '.' cross newlines inside the JSON blob; the dots in the
    # variable name are escaped so they only match literal '.' characters
    # (the original pattern's bare dots would match any character).
    result = search(r'(?s)window\.__SEARCH_RESULT__\s*=(.+?)</script>', html)
    result_dict = json.loads(result.group(1))
    return result_dict['engine_search_result']
def get_data(url):
    """Fetch one 51job result page and return its parsed job list.

    Returns None (implicitly) when the response is not HTTP 200, which the
    caller uses as the "no more pages" signal.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36',
    }
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        # Report the failed response; fall through returning None.
        print(response)
        return None
    return an_data(response.text)
# Shared between the crawler (get_all) and the storage strategies:
# option 2 buffers pages in all_data; option 3 hands pages to save2 via q.
all_data = []
q = Queue()
def get_all():
    """Crawl 51job list pages concurrently until a page comes back empty."""
    pool = ThreadPoolExecutor(max_workers=200)
    page = 1
    print(f'start:{datetime.now()}')
    while page < 2000:
        fu = pool.submit(get_data, f'https://search.51job.com/list/090200,000000,0000,00,9,99,数据分析,2,{page}.html?lang=c&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=')
        if not fu.result():
            # An empty/failed page means we ran past the last page: stop.
            pool.shutdown()
            print(f'end:{datetime.now()}')
            # Option 2, step 2: write everything buffered in all_data at once,
            # then (option 3) push the 'end' sentinel for the consumer:
            # with open('files/数据分析2.csv', 'w', newline='', encoding='utf-8') as f:
            #     writer = csv.DictWriter(f, all_data[0].keys())
            #     writer.writeheader()
            #     writer.writerows(all_data)
            # q.put('end')
            break
        page += 1
        # Jobs parsed from the page just fetched.
        data = fu.result()
        # ============== storage strategies ==============
        # Option 1: reopen the file per page (extra CPU):      save1(data)
        # Option 2, step 1: buffer in memory (higher RAM use): all_data.extend(data)
        # Option 3: stream to the consumer thread:             q.put(data)
    pool.shutdown()
def save1(data):
    """Append one page of job dicts to the CSV (option 1: reopen per page)."""
    with open('files/数据分析.csv', 'a', newline='', encoding='utf-8') as f:
        print('打开文件')
        writer = csv.DictWriter(f, list(data[0].keys()))
        # NOTE(review): no writeheader() here — the file accumulates data rows only.
        writer.writerows(data)
        print('写入数据!')
def save2():
    """Consumer (option 3): drain global queue q into one open CSV file.

    Blocks on q.get() until the producer pushes pages; stops when the
    'end' sentinel arrives.
    """
    with open('files/数据分析3.csv', 'w', encoding='utf-8', newline='') as f:
        print('=======打开文件')
        # First page supplies the header columns.
        first = q.get()
        print('========写入文件')
        writer = csv.DictWriter(f, first[0].keys())
        writer.writeheader()
        writer.writerows(first)
        while True:
            data = q.get()
            if data == 'end':
                # Sentinel pushed by the producer once crawling finishes.
                break
            writer.writerows(data)
            print('========写入文件')
# Kick off the crawl; the commented storage options inside get_all decide
# whether save1/save2 participate.
get_all()
3.线程和进程间通信
from threading import Thread, current_thread
from multiprocessing import Process, current_process
# 同一个进程中的多个线程数据可以直接共享,不同进程中的数据不能直接共享
# =====================多线程之间的数据通信============================
# a = []
# def func1():
# a.append('A')
# def func2():
# a.append('B')
# t1 = Thread(target=func1)
# t2 = Thread(target=func2)
# t1.start()
# t2.start()
# t1.join()
# t2.join()
# print(a)
# =====================多进程之间的数据通信============================
# Shared list: visible to threads in this process, but each child *process*
# gets its own copy, so appends made in children never reach the parent.
a = []


def func1(data1, data2):
    """Worker for process 1 (the two args only demonstrate args= passing)."""
    print('进程1:', current_process())
    a.append('A')


def func2(data1):
    """Worker for process 2."""
    print('进程2:', current_process())
    a.append('B')
if __name__ == '__main__':
    print('主进程:', current_process())
    # Build both child processes (positional args are forwarded to the target).
    p1 = Process(target=func1, args=(100, 200))
    p2 = Process(target=func2, args=(111,))
    for child in (p1, p2):
        child.start()
    for child in (p1, p2):
        child.join()
    # Prints [] — the appends happened in the children's copies of `a`.
    print('任务全部完成:', a)
4.进程间有效通信
import time
from multiprocessing import Process
# 多进程的队列
from multiprocessing import Queue
# 使用多进程队列中需要注意:1.将队列定义全局变量 2.队列对象必须通过在子进程中调用的函数的参数传递到进程中去
def download(name, queue: Queue):
    """Simulate a download in a child process; report results via *queue*.

    The queue must be created in the parent and handed in as an argument —
    a module-level queue object would not be shared with the child process.
    """
    print(f'{name}开始下载!')
    time.sleep(1)  # fake transfer time
    print(f'{name}下载完成!')
    # Ship the "downloaded" data back to whoever holds the other end.
    queue.put([f'{name}10', f'{name}20'])
if __name__ == '__main__':
    q = Queue()
    # One queue shared by both children; each pushes one result list.
    p1 = Process(target=download, args=('肖生克的救赎', q))
    p2 = Process(target=download, args=('阿甘正传', q))
    for child in (p1, p2):
        child.start()
    # No join() needed: q.get() blocks until a child puts data
    # (use q.get(timeout=...) to avoid waiting forever).
    print(q.get())
    print(q.get())
5.线程间有效通信
# 线程间通信:定义一个全局的容器,在子线程的函数中直接在全局容器中添加数据
from queue import Queue
from threading import Thread
import time
def download(name):
    """Simulate a download in a child thread; push the result onto global q.

    Unlike processes, threads share the module's globals, so the queue can
    simply be a module-level object.
    """
    print(f'{name}开始下载')
    time.sleep(1)  # fake transfer time
    print(f'{name}下载结束')
    q.put(f'{name}数据')
if __name__ == '__main__':
    q = Queue()
    # Two worker threads feeding the same queue.
    t1 = Thread(target=download, args=('沉默的羔羊',))
    t2 = Thread(target=download, args=('怦然心动',))
    for worker in (t1, t2):
        worker.start()
    # join() is unnecessary: q.get() already blocks until data arrives.
    print(q.get())
    print(q.get())
6.进程池(multiprocessing.Pool)
from multiprocessing import Pool
import time
def download(name):
    """Fake download task executed inside a pool worker process."""
    print(f'{name}开始下载')
    time.sleep(1)  # stand-in for real transfer time
    print(f'{name}下载结束')
if __name__ == '__main__':
    # 1. Create the pool. NOTE(review): multiprocessing.Pool is a *process*
    #    pool (despite the section title); maxtasksperchild=3 recycles each
    #    worker after 3 tasks, and the worker count defaults to os.cpu_count().
    pool = Pool(maxtasksperchild=3)
    # 2. Queue tasks.
    #    apply       - submit and run synchronously
    #    apply_async - submit and run asynchronously
    pool.apply_async(download, args=('肖生克的救赎',))
    pool.apply_async(download, args=('触不可及',))
    # Batch-submission alternative:
    # pool.map(download, ['V字仇杀队', '恐怖游轮', '林中小屋'])
    # 3. Stop accepting new tasks (required before join()).
    pool.close()
    # pool.apply(download, args=('摔跤吧,爸爸',))
    # 4. Block until every queued task has finished.
    pool.join()
    print('===========全部完成===========')
三、多网站数据爬取
import requests
import multiprocessing as mu
from multiprocessing import Process, Queue
from threading import Thread
import time
import json
from re import search
from concurrent.futures import ThreadPoolExecutor
# Workaround for multiprocessing/Queue problems on macOS: force fork-based
# child processes. NOTE(review): set_start_method raises if called twice in
# one process, and 'fork' is unavailable on Windows.
mu.set_start_method('fork')
# 第一个进程:爬51解析51
def an_51job(html: str):
    """Parse one 51job search page: pull the embedded JSON results out.

    :param html: raw HTML of one 51job search-result page
    :return: the list stored under the 'engine_search_result' key
    """
    # (?s) lets '.' cross newlines; the dots in the variable name are escaped
    # so they match only literal '.' (the original pattern's bare dots would
    # match any character).
    result = search(r'(?s)window\.__SEARCH_RESULT__\s*=(.+?)</script>', html)
    result_dict = json.loads(result.group(1))
    return result_dict['engine_search_result']
def get_one_51_page(page):
    """Fetch page *page* of 51job results; return the parsed job list.

    Returns None (implicitly) on a non-200 response, which the caller treats
    as "no more pages".
    """
    url = f'https://search.51job.com/list/090200,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590,2,{page}.html?lang=c&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36',
    }
    response = requests.get(url, headers=headers)
    response.encoding = 'UTF-8'
    if response.status_code != 200:
        print(response)
        return None
    return an_51job(response.text)
def get_51job_data(queue):
    """Process 1 entry point: crawl every 51job page, feeding results into *queue*."""
    pool = ThreadPoolExecutor(max_workers=100)
    page = 1
    while True:
        fu = pool.submit(get_one_51_page, page)
        if not fu.result():
            # An empty/failed page means we're past the last page: stop.
            pool.shutdown()
            break
        page += 1
        print('第一个进程获取到数据!')
        # Hand this page's jobs to the saver process.
        queue.put(fu.result())
# 第二个进程:爬猎聘和解析猎聘
def get_liepin_data(queue: Queue):
    """Process 2 entry point: crawl liepin.com (unfinished — only the URL so far)."""
    # TODO: fetch, parse, and queue.put() the results like get_51job_data does.
    url = 'https://www.liepin.com/zhaopin/?compkind=&dqs=&pubTime=&pageSize=40&salary=&compTag=&sortFlag=°radeFlag=0&compIds=&subIndustry=&jobKind=&industries=&compscale=&key=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90&siTag=ZFDYQyfloRvvhTxLnVV_Qg%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_fp&d_ckId=2a6eebb9379e46503e6d3fe03889d956&d_curPage=0&d_pageSize=40&d_headId=2a6eebb9379e46503e6d3fe03889d956&curPage=0'
# 第三个进程:保存所有数据
def save_all_data(queue: Queue):
    """Process 3 entry point: consume and print whatever the crawlers enqueue.

    Runs forever; queue.get() blocks while the queue is empty.
    """
    while True:
        print('第三个进程:', queue.get())
if __name__ == '__main__':
    # One shared queue: two producers (51job, liepin) and one consumer (saver).
    q = Queue()
    print('------------')
    p1 = Process(target=get_51job_data, args=(q,))
    p2 = Process(target=get_liepin_data, args=(q,))
    p3 = Process(target=save_all_data, args=(q,))
    for proc in (p1, p2, p3):
        proc.start()