Main tools: requests and XPath (lxml).
The crawl follows Anjuke's city list; only rental listings for cities whose names start with the letters N through S are collected here.
The basic idea of the crawler:
1. Request a page by URL and get the page content back.
2. Parse the page and extract the data we need.
The approach for Anjuke rental listings:
1. From the city-list URL, collect every city's URL.
2. From each city URL, get that city's rental-listing link and join it into a complete URL.
(Note: not every city has rental listings, so handle or check for that case.)
3. Once we have all the city rental links, request each one and get the page content back.
4. Extract the fields we need with XPath or regular expressions (a minimal sketch of this request-and-parse flow follows).
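As a rough illustration of steps 3 and 4, the sketch below fetches a single rental page and pulls out a couple of fields with XPath. The URL is only an assumed example of the paged rental address that the full script builds later, and the XPath expressions are the same ones the full script uses; the prices printed here will still contain obfuscated glyphs until the font is decoded (see the next note).

import requests
from lxml import etree

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36'}
# Assumed example of a paged rental URL (city rental link + 'fangyuan/p1/')
url = 'https://nj.zu.anjuke.com/fangyuan/p1/'
page_text = requests.get(url, headers=headers, timeout=10).text

tree = etree.HTML(page_text)
for item in tree.xpath('//div[@class="zu-itemmod"]'):   # one div per listing
    title = item.xpath('./div[@class="zu-info"]//b/text()')
    price = item.xpath('./div[@class="zu-side"]//b/text()')
    print(title, price)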
On Anjuke's rental pages (the second-hand listing pages are unaffected), most numeric content is obfuscated with a custom web font and cannot be extracted directly; it has to be decoded first, which is worth keeping in mind. The code below shows how to decode it with fontTools.
from fontTools.ttLib import TTFont
import base64
import io
import html
from lxml import etree
# key='''
# AAEAAAALAIAAAwAwR1NVQiCLJXoAAAE4AAAAVE9TLzL4XQjtAAABjAAAAFZjbWFwq75/aQAAAhAAAAIuZ2x5ZuWIN0cAAARYAAADdGhlYWQWkilPAAAA4AAAADZoaGVhCtADIwAAALwAAAAkaG10eC7qAAAAAAHkAAAALGxvY2ED7gSyAAAEQAAAABhtYXhwARgANgAAARgAAAAgbmFtZTd6VP8AAAfMAAACanBvc3QFRAYqAAAKOAAAAEUAAQAABmb+ZgAABLEAAAAABGgAAQAAAAAAAAAAAAAAAAAAAAsAAQAAAAEAAOXFRuRfDzz1AAsIAAAAAADZhW8fAAAAANmFbx8AAP/mBGgGLgAAAAgAAgAAAAAAAAABAAAACwAqAAMAAAAAAAIAAAAKAAoAAAD/AAAAAAAAAAEAAAAKADAAPgACREZMVAAObGF0bgAaAAQAAAAAAAAAAQAAAAQAAAAAAAAAAQAAAAFsaWdhAAgAAAABAAAAAQAEAAQAAAABAAgAAQAGAAAAAQAAAAEERAGQAAUAAAUTBZkAAAEeBRMFmQAAA9cAZAIQAAACAAUDAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFBmRWQAQJR2n6UGZv5mALgGZgGaAAAAAQAAAAAAAAAAAAAEsQAABLEAAASxAAAEsQAABLEAAASxAAAEsQAABLEAAASxAAAEsQAAAAAABQAAAAMAAAAsAAAABAAAAaYAAQAAAAAAoAADAAEAAAAsAAMACgAAAaYABAB0AAAAFAAQAAMABJR2lY+ZPJpLnjqeo59kn5Kfpf//AACUdpWPmTyaS546nqOfZJ+Sn6T//wAAAAAAAAAAAAAAAAAAAAAAAAABABQAFAAUABQAFAAUABQAFAAUAAAACAAHAAMABQAKAAIABAAGAAEACQAAAQYAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADAAAAAAAiAAAAAAAAAAKAACUdgAAlHYAAAAIAACVjwAAlY8AAAAHAACZPAAAmTwAAAADAACaSwAAmksAAAAFAACeOgAAnjoAAAAKAACeowAAnqMAAAACAACfZAAAn2QAAAAEAACfkgAAn5IAAAAGAACfpAAAn6QAAAABAACfpQAAn6UAAAAJAAAAAAAAACgAPgBmAJoAvgDoASQBOAF+AboAAgAA/+YEWQYnAAoAEgAAExAAISAREAAjIgATECEgERAhIFsBEAECAez+6/rs/v3IATkBNP7S/sEC6AGaAaX85v54/mEBigGB/ZcCcwKJAAABAAAAAAQ1Bi4ACQAAKQE1IREFNSURIQQ1/IgBW/6cAicBWqkEmGe0oPp7AAEAAAAABCYGJwAXAAApATUBPgE1NCYjIgc1NjMyFhUUAgcBFSEEGPxSAcK6fpSMz7y389Hym9j+nwLGqgHButl0hI2wx43iv5D+69b+pwQAAQAA/+YEGQYnACEAABMWMzI2NRAhIzUzIBE0ISIHNTYzMhYVEAUVHgEVFAAjIiePn8igu/5bgXsBdf7jo5CYy8bw/sqow/7T+tyHAQN7nYQBJqIBFP9uuVjPpf7QVwQSyZbR/wBSAAACAAAAAARoBg0ACgASAAABIxEjESE1ATMRMyERNDcjBgcBBGjGvv0uAq3jxv58BAQOLf4zAZL+bgGSfwP8/CACiUVaJlH9TwABAAD/5gQhBg0AGAAANxYzMjYQJiMiBxEhFSERNjMyBBUUACEiJ7GcqaDEx71bmgL6/bxXLPUBEv7a/v3Zbu5mswEppA4DE63+SgX42uH+6kAAAAACAAD/5gRbBicAFgAiAAABJiMiAgMzNjMyEhUUACMiABEQACEyFwEUFjMyNjU0JiMiBgP6eYTJ9AIFbvHJ8P7r1+z+8wFhASClXv1Qo4eAoJeLhKQFRj7+ov7R1f762eP+3AFxAVMBmgHjLfwBmdq8lKCytAAAAAABAAAAAARNBg0ABgAACQEjASE1IQRN/aLLAkD8+gPvBcn6NwVgrQAAAwAA/+YESgYnABUAHwApAAABJDU0JDMyFhUQBRUEERQEIyIkNRAlATQmIyIGFRQXNgEEFRQWMzI2NTQBtv7rAQTKufD+3wFT/un6zf7+AUwBnIJvaJLz+P78/uGoh4OkAy+B9avXyqD+/osEev7aweXitAEohwF7aHh9YcJlZ/7qdNhwkI9r4QAAAAACAAD/5gRGBicAFwAjAAA3FjMyEhEGJwYjIgA1NAAzMgAREAAhIicTFBYzMjY1NCYjIga5gJTQ5QICZvHD/wABGN/nAQT+sP7Xo3FxoI16pqWHfaTSSgFIAS4CAsIBDNbkASX+lf6l/lP+MjUEHJy3p3en274AAAAAABAAxgABAAAAAAABAA8AAAABAAAAAAACAAcADwABAAAAAAADAA8AFgABAAAAAAAEAA8AJQABAAAAAAAFAAsANAABAAAAAAAGAA8APwABAAAAAAAKACsATgABAAAAAAALABMAeQADAAEECQABAB4AjAADAAEECQACAA4AqgADAAEECQADAB4AuAADAAEECQAEAB4A1gADAAEECQAFABYA9AADAAEECQAGAB4BCgADAAEECQAKAFYBKAADAAEECQALACYBfmZhbmdjaGFuLXNlY3JldFJlZ3VsYXJmYW5nY2hhbi1zZWNyZXRmYW5nY2hhbi1zZWNyZXRWZXJzaW9uIDEuMGZhbmdjaGFuLXNlY3JldEdlbmVyYXRlZCBieSBzdmcydHRmIGZyb20gRm9udGVsbG8gcHJvamVjdC5odHRwOi8vZm9udGVsbG8uY29tAGYAYQBuAGcAYwBoAGEAbgAtAHMAZQBjAHIAZQB0AFIAZQBnAHUAbABhAHIAZgBhAG4AZwBjAGgAYQBuAC0AcwBlAGMAcgBlAHQAZgBhAG4AZwBjAGgAYQBuAC0AcwBlAGMAcgBlAHQAVgBlAHIAcwBpAG8AbgAgADEALgAwAGYAYQBuAGcAYwBoAGEAbgAtAHMAZQBjAHIAZQB0AEcAZQBuAGUAcgBhAHQAZQBkACAAYgB5ACAAcwB2AGcAMgB0AHQAZgAgAGYAcgBvAG0AIABGAG8AbgB0AGUAbABsAG8AIABwAHIAbwBqAGUAYwB0AC4AaAB0AHQAcAA6AC8ALwBmAG8AbgB0AGUAbABsAG8ALgBjAG8AbQAAAAIAAAAAAAAAFAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACwECAQMBBAEFAQYBBwEIA
QkBCgELAQwAAAAAAAAAAAAAAAAAAAAA
# '''
# data = base64.b64decode(key)  # decode the base64 string
# fonts = TTFont(io.BytesIO(data))  # load the font from the decoded bytes
# fonts.save('fangchan-secret.ttf')
# cmap = fonts.getBestCmap()  # decimal code point -> glyph name
# for char in cmap:
#     print(char, hex(char), chr(char), cmap[char])
# 38006 0x9476 鑶 glyph00008
# 38287 0x958f 閏 glyph00007
# 39228 0x993c 餼 glyph00003
# 39499 0x9a4b 驋 glyph00005
# 40506 0x9e3a 鸺 glyph00010
# 40611 0x9ea3 麣 glyph00002
# 40804 0x9f64 齤 glyph00004
# 40850 0x9f92 龒 glyph00006
# 40868 0x9fa4 龤 glyph00001
# 40869 0x9fa5 龥 glyph00009
# Batch-replace the obfuscated characters using a mapping dict
import re
def multReplace(response, chrMapNum):
rx = re.compile('|'.join(map(re.escape, chrMapNum)))
return rx.sub(lambda match:chrMapNum[match.group(0)], response)
def decodeAnjuke(response,key):
glyphdict = {
'glyph00001': '0',
'glyph00002': '1',
'glyph00003': '2',
'glyph00004': '3',
'glyph00005': '4',
'glyph00006': '5',
'glyph00007': '6',
'glyph00008': '7',
'glyph00009': '8',
'glyph00010': '9'
}
    data = base64.b64decode(key)  # decode the base64-encoded font data
    try:
        fonts = TTFont(io.BytesIO(data))  # load the TTF font from the decoded bytes
    except Exception:
        return response  # the font could not be parsed; return the page unchanged
    cmap = fonts.getBestCmap()  # decimal code point -> glyph name, e.g. {38006: 'glyph00008', ...}
    chrMapNum = {}  # will become {obfuscated char: digit}, e.g. {'龥': '8', ...}
    for asc in cmap:
        chrMapNum[chr(asc)] = glyphdict[cmap[asc]]
    return multReplace(response, chrMapNum)
def result(response):
    key = ""
    text = html.unescape(response)  # turn HTML entities such as &#x958f; back into the obfuscated characters
    # key = re.findall(r"base64,(.*?)'\).format", text)[0]  # extract the base64 key (AAE..AAA) with a regex
    try:
        key = re.findall(r"charset=utf-8;base64,(.*?)'\)", text)[0]
    except IndexError:
        pass
    # Alternative: XPath extraction
    # tree = etree.HTML(response)
    # script_text = tree.xpath('//head/script/text()')[0]
    # key = re.findall(r"charset=utf-8;base64,(.*?)'\) format", script_text, re.I)
    # key = "".join(key)
    # Alternative: regex extraction
    # key = re.findall(r'base64,(.*?)', text, re.S)[0]  # extract the base64 key (AAE..AAA) with a regex
dehtml = decodeAnjuke(text, key)
return dehtml
# Local test: extract the font key from a previously saved page
# with open('ajk_sh.html', 'r', encoding='utf-8') as f:
#     # text = html.unescape(f.read())  # turn HTML entities back into the obfuscated characters
#     # key = re.findall(r"base64,(.*)'\).format", text)[0]  # extract the base64 key (AAE..AAA) with a regex
#     # dehtml = decodeAnjuke(text, key)
#     # print(dehtml)
#     content = f.read()
#     tree = etree.HTML(content)
#     script_text = tree.xpath('//head/script/text()')[0]
#     pattern = re.compile(r"base64,(.*?)'")
#     key = re.findall(r"charset=utf-8;base64,(.*?)'\) format", script_text, re.I)[0]
#     print(key)
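To tie the helpers above together, a minimal (assumed) usage is to load or fetch a listing page, decode the obfuscated digits with result(), and only then run XPath over it. The file name ajk_sh.html comes from the commented test above; the price XPath matches the one used in the full script further down.

# Minimal usage sketch for the helpers above
with open('ajk_sh.html', 'r', encoding='utf-8') as f:
    raw_html = f.read()

decoded = result(raw_html)   # obfuscated digits are now real digits
tree = etree.HTML(decoded)
prices = tree.xpath('//div[@class="zu-side"]//b/text()')
print(prices[:5])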
The main fields collected are:
# Fields:
image (listing photo URL)
title
bedroom_num (number of bedrooms, numeric)
living_room_num (number of living rooms, numeric)
area
floor
floors (total number of floors)
agent
neighborhood (community name)
city_area (district)
bussiness_area (business area; spelling kept to match the code)
address
rent_way (rental type)
face_direction (orientation)
subline (subway line)
price
# Since the crawled data will ultimately be stored in a database for later maintenance and querying, a save-time field is added as well (a sketch of a matching table definition follows this list):
save_time (time the record was saved)
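The script inserts into a MySQL table named zufang (see the Down_Mysql class below), so a compatible table has to exist first. The original post does not show the table definition, so the column types below are only an assumption; the connection parameters mirror the ones used in the script.

import pymysql

# Assumed schema: everything stored as text except save_time; adjust types and lengths as needed.
CREATE_ZUFANG = """
CREATE TABLE IF NOT EXISTS zufang (
    id INT AUTO_INCREMENT PRIMARY KEY,
    image VARCHAR(255), title VARCHAR(255),
    bedroom_num VARCHAR(10), living_room_num VARCHAR(10),
    area VARCHAR(20), floor VARCHAR(20), floors VARCHAR(10),
    agent VARCHAR(50), neighborhood VARCHAR(100),
    city_area VARCHAR(50), bussiness_area VARCHAR(50),
    address VARCHAR(100), rent_way VARCHAR(20),
    face_direction VARCHAR(20), subline VARCHAR(50),
    price VARCHAR(20), save_time DATETIME
) DEFAULT CHARSET=utf8
"""

connect = pymysql.connect(host='localhost', db='anjuke', user='root', password='123', charset='utf8')
with connect.cursor() as cursor:
    cursor.execute(CREATE_ZUFANG)
connect.commit()
connect.close()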
Complete code (it still has some rough edges: some exceptions are not handled thoroughly, and there is the IP-proxy problem, so a long-running crawl may eventually error out and stop. Anyone with stronger crawling skills is welcome to improve it, and multithreading could also be added; a possible threading sketch is shown after the code):
import re
import io
import csv
import time
import json
import base64
import random
import pymysql
import datetime
import requests
from lxml import etree
from fontTools.ttLib import TTFont
from fake_useragent import UserAgent
class Down_Mysql():
def __init__(self):
self.connect = pymysql.connect(
host='localhost',
db='anjuke',
user='root',
password='123',
charset = 'utf8'
)
self.cursor = self.connect.cursor() # 创建游标
    # save one record (a rental listing) to the database
def save_mysql(self,image,title,bedroom_num,living_room_num,area,floor,floors,agent,neighborhood,city_area,bussiness_area,address,rent_way,face_direction,subline,price,save_time):
sql = "insert into zufang(image,title,bedroom_num,living_room_num,area,floor,floors,agent,neighborhood,city_area,bussiness_area,address,rent_way,face_direction,subline,price,save_time) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
try:
self.cursor.execute(sql, (image,title,bedroom_num,living_room_num,area,floor,floors,agent,neighborhood,city_area,bussiness_area,address,rent_way,face_direction,subline,price,save_time))
self.connect.commit()
print('数据插入成功')
except Exception as e:
print(e)
print("插入数据失败")
class Anjuke():
    # return the first element of a list, or an empty string if it is empty
def is_empty(self, data):
if len(data) > 0:
return data[0]
else:
return ''
    # write one record to a CSV file
def write_csv(self,dict_info, filename):
        '''
        Write one record to the CSV file; a header row is written first if the file is still empty.
        '''
list_info = ['image','title','bedroom_num','living_room_num','area','floor','floors','agent','neighborhood',
'city_area','bussiness_area','address','rent_way','face_direction','subline','price','save_time']
with open(filename, 'a+', encoding='utf-8', newline="") as csv_info:
            # create a CSV writer
csv_w = csv.writer(csv_info)
info = [str(dict_info.get(item)) for item in list_info]
print(info)
with open(filename, "r", encoding='utf-8', newline="") as csv_info:
                # create a CSV reader (used to check whether the header row already exists)
reader = csv.reader(csv_info)
if not [row for row in reader]:
csv_w.writerow(list_info)
csv_w.writerow(info)
else:
csv_w.writerow(info)
    # font decoding
def fonts_parse(self,response):
        # mapping from hex code point to digit, e.g. {'0x9476': 7, ...}
newmap = dict()
try:
            # extract the base64-encoded font string
key_str = re.search(r"base64,(.*?)'\)",response).group(1)
            # base64-decode it
b = base64.b64decode(key_str)
            # load the font from the decoded bytes
font = TTFont(io.BytesIO(b))
            # decimal code point -> glyph name, e.g. {38006: 'glyph00008', ...}
bestcmap = font['cmap'].getBestCmap()
            # fill newmap: hex code point -> digit
for key in bestcmap.keys():
value = int(re.search(r'(\d+)', bestcmap[key]).group(1)) - 1 # 对应数字0-9
key = hex(key) # 十进制转换为十六进制
newmap[key] = value # 保存到字典
# print(newmap)
except:
pass
        # replace the custom-font characters in the page with normal digits
response_ = response
for key,value in newmap.items():
            # build the HTML-entity form found in the scraped page, e.g. &#x9f92; or &#x9fa5;
key_ = key.replace('0x','&#x') + ';'
if key_ in response_:
response_ = response_.replace(key_,str(value))
return response_
    # request a URL and return the HTML text
def get_url(self,url):
        # optional: random User-Agent pool
# ua = UserAgent()
# headers = {
# 'User-Agent': ua.random
# }
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
}
proxies_list = [
{
'http': 'http://122.114.222.93:8888',
},
]
proxies = random.choice(proxies_list)
print(proxies)
response = requests.get(url,headers=headers,proxies=proxies).text
# tree = etree.HTML(response)
return response
    # get the links of Anjuke cities whose initials are N through S
def request_city_list(self):
print('''
1.官网获取城市链接
2.本地文件获取城市链接
''')
        num = input("请选择获取城市链接的方式:")
while True:
global response
if num == '1':
url = 'https://www.anjuke.com/sy-city.html'
response = self.get_url(url)
break
elif num == '2':
with open("city_list.html",'r',encoding='utf-8')as fp:
response = fp.read()
break
# print(response)
        # parse the HTML into an element tree
tree = etree.HTML(response)
        # take letters N (13th) through S (17th) from the letter-indexed city list
city_list_ul = tree.xpath('//div[@class="letter_city"]//li[position()<18][position()>12]')
city_dict = {}
for city_list in city_list_ul:
# name = city_list.xpath('./label/text()')
city_name_list = city_list.xpath('.//a/text()')
city_name_url_list = city_list.xpath('.//a/@href')
for city,city_url in zip(city_name_list,city_name_url_list):
# print(city,city_url)
city_dict[city] = city_url
# print(city_dict)
return city_dict
    # for each city URL, check whether the city has rental listings; return a dict of city rental links
def get_zufang_url(self):
city_zufang_dict = {}
        # get the dict of N-S city URLs from the method above
city_dict = self.request_city_list()
for city in city_dict.keys():
            # city URL
city_url = city_dict[city]
            # request the city page
response = self.get_url(city_url)
tree = etree.HTML(response)
            # extract the city's rental-listing URL
zufang_url = None
house_list = tree.xpath('//li[@class="li_single li_itemsnew li_unselected"][a="租 房"]/a/@href')
if house_list:
zufang_url = "".join(house_list)
city_zufang_dict[city] = zufang_url
print("已抓取到[%s]的租房链接"%city)
else:
zufang_url = "[%s]无租房信息"%city
print(zufang_url)
time.sleep(1)
# with open('zufang_url_list.json', 'w',encoding='utf-8') as fp:
# json.dump(city_zufang_dict,fp,ensure_ascii=False)
print(len(city_zufang_dict)) # 71
return city_zufang_dict
    # extract the listing information on one page
def get_fangyuan(self,tree,city):
        # one div per listing
div_list = tree.xpath('//div[@class="zu-itemmod"]')
# print(len(div_list))
div_page = tree.xpath('//div[@class="page-content"]//i[@class="iNxt"]/text()')
# print(div_page)
for div in div_list:
data_info = {}
            # 1. listing photo URL
image_url = div.xpath('.//img[@class="thumbnail"]/@lazy_src')[0]
# print(image_url)
            # 2. title
title = div.xpath('./div[@class="zu-info"]//b/text()')[0]
print(title)
            # 3. bedrooms (室)
bedroom_num = div.xpath('.//p[@class="details-item tag"]/b[1]/text()')[0]
# bedroom_num = int(bedroom_num)
print(bedroom_num)
            # 4. living rooms (厅)
living_room_num_list = div.xpath('.//p[@class="details-item tag"]/b[2]/text()')
if living_room_num_list:
living_room_num = living_room_num_list[0]
# living_room_num = int(living_room_num)
else:
living_room_num = None
# print(living_room_num)
            # 5. area
area = div.xpath('.//p[@class="details-item tag"]/b[last()]/text()')[0]
area = area + "m²"
# print(area)
            # 6. floor
floor = div.xpath('.//p[@class="details-item tag"]/text()')[4].strip()
floor1 = floor.split('(',)
if len(floor1) > 1:
floor1 = floor1[0]
else:
floor1 = None
# print(floor1)
            # 7. total floors
floors = re.findall(r'\d+',floor)[0]
# print(floors)
            # 8. agent
try:
agent = div.xpath('.//p[@class="details-item tag"]/text()')
agent = agent[len(agent)-1].strip()
except:
agent = ""
print(agent)
# print(agent)
            # 9. neighborhood (community) name
neighborhood = div.xpath('.//address/a/text()')
neigh = "".join(neighborhood)
# print(neigh)
            # 10. district
try:
city_area_list = div.xpath('.//address/text()')[1].strip().split(' ')
city_area_lst = city_area_list[0].split('-')
city_area = city_area_lst[0]
# print(city_area)
except:
city_area = ""
            # 11. business area
try:
bussiness_area = city_area_lst[1]
# print(bussiness_area)
except:
bussiness_area = ""
            # 12. address
try:
address = city_area_list[1]
except:
address = ""
# print(address)
            # 13. rental type
rent_way = div.xpath('.//p[@class="details-item bot-tag"]/span[1]/text()')
rent_way = self.is_empty(rent_way)
# print(rent_way)
            # 14. orientation
face_direction = div.xpath('.//p[@class="details-item bot-tag"]/span[2]/text()')
face_direction = self.is_empty(face_direction)
# print(face_direction)
            # 15. subway line
subline = div.xpath('.//p[@class="details-item bot-tag"]//span[contains(text(),"线")]/text()')
subline = "".join(subline)
# print(subline)
            # 16. price
price = div.xpath('./div[@class="zu-side"]//b/text()')
price = self.is_empty(price) + "元/月"
# print(price)
            # 17. record save time
save_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            # store as a CSV file (alternative)
# data_info['image'] = image_url
# data_info['title'] = title
# data_info['bedroom_num'] = bedroom_num
# data_info['living_room_num'] = living_room_num
# data_info['area'] = area
# data_info['floor'] = floor1
# data_info['floors'] = floors
# data_info['agent'] = agent
# data_info['neighborhood'] = neigh
# data_info['city_area'] = city_area
# data_info['bussiness_area'] = bussiness_area
# data_info['address'] = address
# data_info['rent_way'] = rent_way
# data_info['face_direction'] = face_direction
# data_info['subline'] = subline
# data_info['price'] = price
# data_info['save_time'] = save_time
# self.write_csv(data_info,"%s_fangyuan.csv"%city)
            # store in MySQL: instantiate the database helper
db = Down_Mysql()
db.save_mysql(image_url,title,bedroom_num,living_room_num,area,floor1,floors,agent,neigh,city_area,bussiness_area,address,rent_way,face_direction,subline,price,save_time)
return div_page
    # build the complete paged rental URLs and crawl the listings
def split_page_url(self):
page_url = {}
print('''
1.官网获取城市租房链接
2.本地文件获取城市租房链接
''')
num = input("请选择获取城市租房链接的方式:")
while True:
global zufang_url_dict
if num == '1':
                # option 1: fetch from the site
zufang_url_dict = self.get_zufang_url()
break
elif num == '2':
                # option 2: load from the local JSON file
with open('zufang_url_list.json','r',encoding='utf-8')as fp:
zufang_url_dict = json.load(fp)
break
# print(zufang_url_dict)
for city in zufang_url_dict.keys():
            # the city's rental link
city_zufang_url = zufang_url_dict[city]
print(city_zufang_url)
            # build the complete paged rental URL
for page in range(1,51):
city_zufang_page_url = city_zufang_url + 'fangyuan/p{}/'.format(page)
print(f"========================正在爬取{city}第{page}页数据==================================")
# page_url[city] = city_zufang_page_url
page_response = self.get_url(city_zufang_page_url)
                # decode the obfuscated font digits
parse_response = self.fonts_parse(page_response)
parse_tree = etree.HTML(parse_response)
                # extract the listings; the return value is used below to stop at the last page
next = self.get_fangyuan(parse_tree,city)
if next:
break
print(f"========================成功爬取{city}第{page}页数据==================================")
time.sleep(1)
if __name__ == '__main__':
anjuke = Anjuke()
anjuke.split_page_url()
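As noted above, multithreading could be added. The sketch below is one possible, untested way to do it: it assumes the per-city page loop from split_page_url is factored into a standalone function, with one city crawled per worker thread (the existing code already opens a fresh MySQL connection for every record, so the threads do not share a connection). The names crawl_city and crawl_all are hypothetical helpers, not part of the original script.

from concurrent.futures import ThreadPoolExecutor
import time
from lxml import etree

def crawl_city(anjuke, city, zufang_url, max_pages=50):
    # Hypothetical helper: the per-city page loop, mirroring the body of split_page_url()
    for page in range(1, max_pages + 1):
        page_url = zufang_url + 'fangyuan/p{}/'.format(page)
        response = anjuke.get_url(page_url)
        tree = etree.HTML(anjuke.fonts_parse(response))
        last_page = anjuke.get_fangyuan(tree, city)
        if last_page:  # same stop condition as in split_page_url()
            break
        time.sleep(1)

def crawl_all(zufang_url_dict, max_workers=4):
    # Hypothetical helper: one worker thread per city
    anjuke = Anjuke()
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = [pool.submit(crawl_city, anjuke, city, url)
                   for city, url in zufang_url_dict.items()]
        for future in futures:
            future.result()  # surface any exception raised inside a worker

# Possible usage, reusing the saved link file:
# import json
# with open('zufang_url_list.json', 'r', encoding='utf-8') as fp:
#     crawl_all(json.load(fp))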
Sample of the results saved to the data table:

Sample of the results saved to the database:

This post walked through a crawler for Anjuke rental listings, covering how to use Python's requests and XPath (lxml) to scrape each city's rental data (photo, title, price, and so on), how to decode the font-obfuscated numbers, and how to save the results to a database.