# -*- coding: utf-8 -*-
"""
Created on Sat Jan 20 18:08:21 2018
@author: Administrator
"""
import requests
from bs4 import BeautifulSoup
import time
from multiprocessing import Pool
# Desktop-Chrome User-Agent: anjuke.com serves a block/captcha page to the
# default "python-requests" agent, so every request sends this header.
headers={
'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
}
def get_loupan(url):
    """Fetch one anjuke listing page and return its property records.

    Parameters
    ----------
    url : str
        Listing-page url, e.g. 'https://sh.fang.anjuke.com/loupan/all/p1/'.

    Returns
    -------
    list[dict] on success — one dict per property with keys
    标题/地址/地区/面积(全)/面积/价格.  '' on any request or parse
    failure (the original error sentinel, kept for compatibility).

    Note: the original returned from INSIDE the zip loop, so only the
    first listing of each page ever came back; all listings are now
    collected.
    """
    try:
        res = requests.get(url, headers=headers)
        return _parse_loupan(res.text)
    except (requests.RequestException, AttributeError, IndexError):
        # Narrowed from a bare `except:`; covers network errors plus the
        # parse failures (missing <p>, short '\xa0' split) the old code
        # silently swallowed.
        return ''


def _parse_loupan(html):
    """Parse listing-page HTML into a list of property-info dicts."""
    soup = BeautifulSoup(html, 'html.parser')
    titles = [t.text for t in soup.find_all('span', class_='items-name')]
    # One query for 'list-map' (old code ran it twice): the span text is
    # "地址\xa0地区…", so the full text is the address and the second
    # '\xa0' field is the district.
    addresses = [m.text for m in soup.find_all('span', class_='list-map')]
    districts = [a.split('\xa0')[1] for a in addresses]
    # Likewise one query for 'huxing': full text vs. last tab-separated
    # field (the area figure).
    areas_full = [a.text for a in soup.find_all('a', class_='huxing')]
    areas = [a.split('\t')[-1].strip() for a in areas_full]
    prices = [a.p.text for a in soup.find_all('a', class_='favor-pos')]
    return [
        {'标题': tit,
         '地址': addr,
         '地区': dist,
         '面积(全)': area_full,
         '面积': area,
         '价格': price}
        for tit, addr, dist, area_full, area, price
        in zip(titles, addresses, districts, areas_full, areas, prices)
    ]
if __name__ == '__main__':
    # Pages 1-10 of the Shanghai listing index; built once and reused by
    # every timing section below.
    urls = ['https://sh.fang.anjuke.com/loupan/all/p{}/'.format(i)
            for i in range(1, 11)]

    # --- 1) serial baseline: one page per second --------------------------
    start_1 = time.time()
    for url in urls:
        get_loupan(url)
        time.sleep(1)
    end_1 = time.time()
    print('串行爬虫:', end_1 - start_1)

    # --- 2) pool of 2 processes ------------------------------------------
    start_2 = time.time()
    pool = Pool(processes=2)
    for url in urls:
        # BUG FIX: the original called pool.map(get_loupan, url) with url
        # a *string*, so map() iterated over its characters and spawned
        # one task per character.  Wrap in a list to submit the whole url.
        pool.map(get_loupan, [url])
        time.sleep(1)
    pool.close()   # release worker processes (original leaked the pool)
    pool.join()
    end_2 = time.time()
    print('两个进程:', end_2 - start_2)

    # --- 3) pool of 4 processes ------------------------------------------
    start_3 = time.time()
    pool = Pool(processes=4)
    for url in urls:
        pool.map(get_loupan, [url])   # same per-character fix as above
        time.sleep(1)
    pool.close()
    pool.join()
    end_3 = time.time()
    print('四个进程:', end_3 - start_3)

    # --- 4) pool of 4, single map over the whole url list ----------------
    start_4 = time.time()
    pool = Pool(processes=4)
    pool.map(get_loupan, urls)
    time.sleep(1)
    pool.close()
    pool.join()
    end_4 = time.time()
    print('(四)个进程:', end_4 - start_4)
python多进程爬取安居客
最新推荐文章于 2025-02-01 04:14:22 发布
本文介绍了一个使用Python的Requests和BeautifulSoup库实现的安居客房产信息爬虫。该爬虫可以抓取上海地区各楼盘的基本信息,包括标题、地址、地区、面积及价格等,并通过多进程方式加速数据抓取。
2609

被折叠的评论
为什么被折叠?



