代码
# coding=utf-8
import multiprocessing
import os
from pyquery import PyQuery as pq
import requests
import csv
import time
def action(key):
    """Scrape the consultant listings for one keyword and append them to a CSV.

    Requests pages 1 through 49 of
    ``https://www.yuanjisong.com/consultant/allcity/<key>/page<N>``, selects
    every listing panel on each page, splits each panel's text on newlines and
    appends one CSV row per panel to ``I:/<key>.csv``. A progress message is
    printed after each page has been written.

    Args:
        key: Keyword used both as the URL path segment and as the CSV file
            name (for example ``'python'``).

    Raises:
        requests.exceptions.RequestException: If a page request fails or
            exceeds the timeout.
        OSError: If the CSV file cannot be opened for appending.
    """
    head = {
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    # The output path depends only on the keyword, so build it once.
    path = 'I:/' + key + '.csv'
    for i in range(1, 50):
        url = 'https://www.yuanjisong.com/consultant/allcity/' + key + '/page' + str(i)
        # Without a timeout a stalled connection would block this worker forever.
        response = requests.get(url, headers=head, timeout=30)
        doc = pq(response.text)
        boxs = doc('.weui_panel.weui_panel_access.weui_panel_access_adapt.db_adapt.margin-top-2').items()
        # One row per listing panel; each line of the panel text becomes a cell.
        page = [box.text().split('\n') for box in boxs]
        # newline='' lets the csv module control line endings itself; without
        # it every row is followed by an empty line on Windows.
        with open(path, 'a', encoding='utf-8-sig', newline='') as f:
            writer = csv.writer(f)
            writer.writerows(page)
        print('保存' + key + '第' + str(i) + '页!')
单进程测试代码:
if __name__ == '__main__':
    # Baseline measurement: scrape both keywords one after the other in the
    # current process and report the wall-clock time taken.
    started = time.time()
    for keyword in ('python', 'java'):
        action(keyword)
    print('it\'over!')
    elapsed = time.time() - started
    print('单进程花费时间', elapsed)
花费时间:
双进程测试代码:
if __name__ == '__main__':
    # Two-process measurement: scrape each keyword in its own process and
    # report the wall-clock time until both have finished.
    a = time.time()
    mp1 = multiprocessing.Process(target=action, args=('python',))
    mp1.start()
    mp2 = multiprocessing.Process(target=action, args=('java',))
    mp2.start()
    # Wait for BOTH workers. Joining only mp2 would print the elapsed time
    # while mp1 may still be running, under-reporting the real duration.
    mp1.join()
    mp2.join()
    print('it\'over!')
    print('多进程花费时间', time.time() - a)
花费时间:
多进程池测试代码(共 17 个关键词,数据量约为上面两个测试的 8.5 倍):
if __name__ == '__main__':
    # Pool measurement: scrape every keyword in parallel, one worker process
    # per keyword, and report the wall-clock time for the whole batch.
    started = time.time()
    keywords = [
        'java', 'php', 'python', 'node', 'net', 'ruby', 'cplus',
        'quanzhan', 'ios', 'android', 'qianduan', 'houduan', 'ui',
        'chanpin', 'yunying', 'ceshi', 'other',
    ]
    # The amount of data to fetch is multiplied compared with the earlier runs.
    worker_count = len(keywords)
    pool = multiprocessing.Pool(processes=worker_count)
    pool.map(action, keywords)
    # No further tasks will be submitted; wait for the workers to exit.
    pool.close()
    pool.join()
    print('it\'s over!')
    print('多进程花费时间', time.time() - started)
花费时间: