from bs4 import BeautifulSoup
import requests
import csv
import time
from fake_useragent import UserAgent
ua = UserAgent()  # random User-Agent string
url = "http://cd.58.com/pinpaigongyu/pn/{page}/?minprice=600_1000"  # price range 600-1000
# number of pages fetched so far; starts at 0
page = 0
# open the CSV file first
csv_file = open("rent.csv", "w", newline="", encoding="utf-8")
csv_writer = csv.writer(csv_file, delimiter=',')
headers = {
    'User-Agent': str(ua.random),
    'Referer': 'https://cd.58.com/pinpaigongyu/?minprice=600_1000',
}
while True:
    page += 1
    print('Crawling page %s...' % page)
    print("fetch:", url.format(page=page))
    response = requests.get(url.format(page=page), headers=headers)
    html = BeautifulSoup(response.text, "html.parser")
    house_list = html.select(".list > li")
    # the loop ends when a page yields no new listings
    if not house_list:
        break
    for house in house_list:
        house_title = house.select("h2")[0].string  # listing title
        house_url = "http://cd.58.com/%s" % house.select("a")[0]["href"]  # listing URL
        house_info_list = house_title.split()
        # if the second field is the apartment/brand name, use the first field as the location
        if "公寓" in house_info_list[1] or "青年社区" in house_info_list[1]:
            house_location = house_info_list[0]
        else:
            house_location = house_info_list[1]
        house_money = house.select(".money")[0].select("b")[0].string  # price
        csv_writer.writerow([house_title, house_location, house_money, house_url])  # append one row to the CSV
    time.sleep(1)
csv_file.close()  # close the CSV file at the end
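
As a side check, here is a minimal, self-contained test of the CSS selectors used above. The markup in this snippet is hypothetical, written only to mirror the structure the crawler expects on the listing page, so it runs without touching the site:

from bs4 import BeautifulSoup

sample = """
<ul class="list">
  <li><a href="pinpaigongyu/123.shtml">
    <h2>XX公寓 建设路店 1室</h2>
    <div class="money"><b>980</b></div>
  </a></li>
</ul>
"""
html = BeautifulSoup(sample, "html.parser")
li = html.select(".list > li")[0]
print(li.select("h2")[0].string)                      # XX公寓 建设路店 1室
print(li.select("a")[0]["href"])                      # pinpaigongyu/123.shtml
print(li.select(".money")[0].select("b")[0].string)   # 980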
1、Three files: crawl.py, rent.csv, and index.html
2、Pipeline: analyze the page -- crawl the listings (py) -- store them to a file (csv) -- display them on a map (html)
2.1 Analyzing the page URL
https://cd.58.com/pinpaigongyu/pn/3/?minprice=1000_1500
Pattern: hover over a pagination link and its URL shows up; look at a few of them and the pattern becomes clear.
https://cd.58.com/pinpaigongyu/pn/{page}/?minprice={min_rent}_{max_rent}
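
To confirm the template, filling it in reproduces the sample URL above (min_rent and max_rent are just illustrative names for the two numbers in the price filter):

url_tpl = "https://cd.58.com/pinpaigongyu/pn/{page}/?minprice={min_rent}_{max_rent}"
print(url_tpl.format(page=3, min_rent=1000, max_rent=1500))
# https://cd.58.com/pinpaigongyu/pn/3/?minprice=1000_1500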
3、Register for AMap (高德地图), Web platform (JS API)
Serve the files locally to view the result: python3 -m http.server 3000
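
Then open http://localhost:3000/index.html; serving over HTTP (rather than opening the file directly) lets the page fetch rent.csv without running into file:// restrictions. Before wiring up the map, a quick sanity check of the CSV, assuming the column order written by the crawler (title, location, price, url):

import csv

with open("rent.csv", newline="", encoding="utf-8") as f:
    for title, location, price, link in csv.reader(f):
        print(location, price, title)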
……