本项目主要爬取链家网北京、上海、广州、深圳四个城市的全部租房房源数据,并据此分析租金分布,给出租房时需要考虑的因素等建议。
首先奉上爬虫 demo;如果需要直接获取数据,请在评论区留言,我会分享。
import os
import re
import time
import requests
from pymongo import MongoClient
from info import rent_type, city_info
class Rent(object):
"""
初始化函数,获取租房类型(整租、合租)、要爬取的城市分区信息以及连接mongodb数据库
"""
def __init__(self):
    """
    Set up the crawler state and the MongoDB handle.

    Stores the module-level ``rent_type`` and ``city_info`` mappings on the
    instance, builds the MongoDB URL from the ``MONGODB_HOST`` /
    ``MONGODB_PORT`` environment variables (defaulting to a local server on
    port 27017), selects the database named by ``MONGODB_DATABASE``
    (default ``Lianjia``) and creates a unique index on ``m_url`` in the
    ``zufang`` collection.
    """
    env = os.environ
    self.rent_type, self.city_info = rent_type, city_info

    # Connection settings come from the environment, falling back to a local database.
    db_name = env.get('MONGODB_DATABASE', 'Lianjia')
    server_url = 'mongodb://{}:{}'.format(
        env.get('MONGODB_HOST', '127.0.0.1'),
        env.get('MONGODB_PORT', '27017'),
    )
    self.db = MongoClient(server_url)[db_name]

    # The mobile-site link acts as the primary key used for de-duplication.
    listings = self.db['zufang']
    listings.create_index('m_url', unique=True)
def get_data(self, max_retries=5, retry_delay=1.0):
    """
    Crawl rental listings for every rent type, city, district and business circle.

    For each district the mobile-site page is fetched and its business-circle
    keys are extracted with a regular expression. The keys are handed to
    ``self._write_bc`` and the list to crawl is read back from
    ``self._read_bc``. Each business circle is then paged through the app
    API, 30 listings per request, and every page is passed to
    ``self._parse_record`` together with its city / type / district tags.

    Crawling goes through business circles rather than whole districts
    because a district-level query returns at most 2000 listings.

    :param max_retries: number of consecutive retries allowed for one page
        before the rest of that business circle is skipped
    :param retry_delay: seconds to wait before retrying a failed page
    :return: None
    """
    page_size = 30
    request_timeout = 10
    district_url = 'https://m.lianjia.com/chuzu/{}/zufang/{}/'
    list_url = ('https://app.api.lianjia.com/Rentplat/v1/house/list?city_id={}&condition={}'
                '/rt{}&limit={}&offset={}&request_ts={}&scene=list')
    # Compiled once instead of on every district page.
    bc_pattern = re.compile(r"data-type=\"bizcircle\" data-key=\"(.*)\" class=\"oneline \">")

    for ty, type_code in self.rent_type.items():  # whole rent / shared rent
        for city, info in self.city_info.items():  # city and its per-city info
            city_id, city_py, districts = info[0], info[1], info[2]
            for dist, dist_py in districts.items():  # district and its pinyin
                # A timeout keeps one stalled connection from hanging the whole crawl.
                res_bc = requests.get(district_url.format(city_py, dist_py),
                                      timeout=request_timeout)
                self._write_bc(bc_pattern.findall(res_bc.text))
                bc_list = self._read_bc()
                if not bc_list:
                    continue

                for bc_name in bc_list:
                    idx = 0
                    failures = 0
                    while True:
                        url = list_url.format(city_id, bc_name, type_code, page_size,
                                              idx * page_size, int(time.time()))
                        try:
                            res = requests.get(url=url, timeout=request_timeout)
                            # Decode once and read both fields before storing anything,
                            # so a malformed payload cannot leave a half-processed page.
                            payload = res.json()['data']
                            records = payload['list']
                            total = payload['total']
                            item = {'city': city, 'type': ty, 'dist': dist}
                            self._parse_record(records, item)
                        except Exception:
                            # Unlike a bare ``except``, this lets KeyboardInterrupt and
                            # SystemExit through, so the crawl can still be stopped.
                            failures += 1
                            if failures > max_retries:
                                print('{}市{}-{}的{}第{}页连续失败{}次,跳过该商圈!'.format(
                                    city, dist, bc_name, ty, idx + 1, failures))
                                break
                            print('链接访问不成功,正在重试!')
                            time.sleep(retry_delay)
                            continue

                        failures = 0
                        print('成功爬取{}市{}-{}的{}第{}页数据!'.format(city, dist, bc_name, ty, idx + 1))
                        idx += 1
                        if total / page_size <= idx:
                            break
def _parse_record(self, data