# coding: utf-8
"""Scrape rental listings from Lianjia Guangzhou (gz.lianjia.com/zufang).

Crawls pages 1-99 for each district, extracts per-listing fields
(title, district, subway stop, compound, size, layout, price, detail URL)
and appends them as rows to 租房.csv.
"""
import csv
import random
import re  # kept from original file; not used in this script  # noqa: F401

import requests
from bs4 import BeautifulSoup
from w3lib.html import remove_tags  # kept from original file; not used  # noqa: F401

# Pool of desktop/mobile User-Agent strings; one is picked at random per run
# so repeated crawls don't all present the same client fingerprint.
USER_AGENTS = [
    "Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Version/3.1 Safari/525.13",
    "Mozilla/5.0 (iPhone; U; CPU like Mac OS X) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 Mobile/4A93 ",
    "Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 ",
    "Mozilla/5.0 (Linux; U; Android 3.2; ja-jp; F-01D Build/F0001) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13 ",
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_1 like Mac OS X; ja-jp) AppleWebKit/532.9 (KHTML, like Gecko) Version/4.0.5 Mobile/8B117 Safari/6531.22.7",
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_2_1 like Mac OS X; da-dk) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5 ",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-US) AppleWebKit/530.9 (KHTML, like Gecko) Chrome/ Safari/530.9 ",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Ubuntu/11.10 Chromium/27.0.1453.93 Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36",
]

HEADERS = {
    "User-Agent": random.choice(USER_AGENTS),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0",
    "Accept-Encoding": "gzip, deflate,br",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}

# Guangzhou district slugs used in the listing URL path.
# A tuple (not a set, as the original had) keeps crawl order deterministic.
AREAS = (
    'tianhe', 'yuexiu', 'liwan', 'haizhu', 'panyu', 'baiyun',
    'huangpugz', 'conghua', 'zengcheng', 'huadou', 'nansha',
)


def _extract_size_and_layout(des_tag):
    """Pull the floor-area and room-layout strings out of the description tag.

    The original code walked the tag's children with a counter and took the
    children at positions 10 and 14 — this is fragile and tied to the exact
    page markup; preserved as-is.  TODO(review): confirm against current
    gz.lianjia.com markup.

    Returns (mianji, tishi): area like '80㎡' and layout like '3室2厅'.
    """
    picked = []
    counter = 1
    for child in des_tag:
        counter += 1
        if counter == 10:       # floor area (面积)
            picked.append(child)
        if counter == 14:       # room layout (庭室)
            picked.append(child)
    mianji = str(picked[0]).replace('\n ', '')
    tishi = str(picked[1]).replace('\n ', '')
    return mianji, tishi


def scrape(outfile='租房.csv'):
    """Crawl every district, pages 1-99, appending one CSV row per listing.

    The file is opened once (append mode, UTF-8, newline='' as the csv
    module requires) instead of once per row as the original did.
    Any parsing error on a page is reported and the crawl moves on to the
    next page (best-effort, matching the original's intent).
    """
    with open(outfile, 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, delimiter=',')
        for area in AREAS:
            count = 0  # per-district running row number, written as column 1
            for page in range(1, 100):
                url = "https://gz.lianjia.com/zufang/" + area + "/pg" + str(page)
                response = requests.get(url=url, headers=HEADERS)
                soup = BeautifulSoup(response.text, 'html.parser')
                # One <div> per listing on the results page.
                houses = soup.find_all('div', class_='content__list--item--main')
                try:
                    for house in houses:
                        title_link = house.find(
                            'p', {'class': 'content__list--item--title twoline'}
                        ).find('a')
                        region = title_link.get_text()              # listing title
                        detail_url = 'https://gz.lianjia.com' + title_link['href']
                        des = house.find('p', {'class': 'content__list--item--des'})
                        links = des.findAll('a')
                        zone = links[0].get_text()                  # district
                        ditie = links[1].get_text()                 # nearest subway
                        xiaoqu = links[2].get_text()                # compound
                        mianji, tishi = _extract_size_and_layout(des)
                        price = house.find(
                            'span', {'class': 'content__list--item-price'}
                        ).find('em').get_text() + '元/月'
                        print('房子简介:', region, ';所属地区:', zone, ';小区:', xiaoqu,
                              ';庭室:', tishi, ';面积:', mianji, ';价格:', price,
                              ';临近地铁为:', ditie, ':详情连接:', detail_url)
                        count += 1
                        writer.writerow([
                            str(count), str(region), str(zone), str(xiaoqu),
                            str(tishi), str(mianji), str(price), str(ditie),
                            str(detail_url),
                        ])
                except Exception as ex:
                    # Best-effort: report and move on to the next page.
                    print("出现如下异常%s" % ex)
                    continue


if __name__ == '__main__':
    scrape()