读入网站
import requests
from bs4 import BeautifulSoup

# Fetch the listings page; a browser-like User-Agent avoids being blocked as a bot.
r = requests.get(
    "http://www.pyclass.com/real-estate/rock-springs-wy/LCWYROCKSPRINGS/",
    headers={"User-agent": 'Mozilla/5.0(X11;Ubuntu;Linux x86_64;rv:61.0)Gecko/20100101 Firefox/61.0'},
    timeout=30,  # fail fast instead of hanging forever on a dead connection
)
# Explicit parser avoids bs4's GuessedAtParserWarning and platform-dependent parsing.
soup = BeautifulSoup(r.content, "html.parser")
soup保存网站数据
提取div——class;
提取h4——class;
删除多余的\n和空格;
import requests
from bs4 import BeautifulSoup

# Fetch the first results page; a browser-like User-Agent avoids bot blocking.
r = requests.get(
    "http://www.pyclass.com/real-estate/rock-springs-wy/LCWYROCKSPRINGS/",
    headers={"User-agent": 'Mozilla/5.0(X11;Ubuntu;Linux x86_64;rv:61.0)Gecko/20100101 Firefox/61.0'},
    timeout=30,  # avoid hanging indefinitely
)
soup = BeautifulSoup(r.content, "html.parser")  # explicit parser silences bs4's warning
# NOTE: `all` shadows the builtin all(); the name is kept because later snippets reference it.
all = soup.find_all("div", {"class": "propertyRow"})  # one div per listing (list-like ResultSet)
# len(all)  # 10 listings per page
price = all[0].find_all("h4", {"class": "propPrice"})  # list of price <h4> tags
price
all[0].find("h4", {"class": "propPrice"}).text  # raw text, padded with \n and spaces
all[0].find("h4", {"class": "propPrice"}).text.replace("\n", "").replace(" ", '')  # cleaned price string
try/except:
得到每个房子的info;
bed、area、bath等;
从span+class中提取;
try——except防止无数据报错;
# Print price, address, and room/area info for every listing row.
for i in all:  # one propertyRow div per listing
    # Price: strip the newlines and spaces that pad the h4 text.
    print(i.find("h4", {"class": "propPrice"}).text.replace("\n", "").replace(" ", ''))
    print(i.find_all("span", {"class": "propAddressCollapse"})[0].text)  # street address
    print(i.find_all("span", {"class": "propAddressCollapse"})[1].text)  # locality
    # Number of beds. find() returns None when the span is absent, so
    # .find("b") raises AttributeError — catch exactly that, not a bare except.
    try:
        print(i.find("span", {"class": "infoBed"}).find("b").text)
    except AttributeError:
        print(None)
    # Area (sq ft)
    try:
        print(i.find("span", {"class": "infoSqFt"}).find("b").text)
    except AttributeError:
        print(None)
    # Full baths
    try:
        print(i.find("span", {"class": "infoValueFullBath"}).find("b").text)
    except AttributeError:
        print(None)
    # Half baths
    try:
        print(i.find("span", {"class": "infoValueHalfBath"}).find("b").text)
    except AttributeError:
        print(None)
    print(" ")  # blank-ish separator between listings
    # print(i.find_all("span", {"class": "infoBed"}))
添加代码,zip遍历两个list,得到feature信息
若增加条件判断,可得到特定条件的feature信息;
# Pull feature data for the current listing `i`: each columnGroup div holds
# parallel featureGroup (label) and featureName (value) spans.
for group_div in i.find_all("div", {"class": "columnGroup"}):
    labels = group_div.find_all("span", {"class": "featureGroup"})
    values = group_div.find_all("span", {"class": "featureName"})
    # Walk the label/value spans in lockstep.
    for label_span, value_span in zip(labels, values):
        # Only the "Lot Size" feature is of interest; print its value.
        if "Lot Size" in label_span.text:
            print(value_span.text)
将数据放入DataFrame中
建立数组,存放字典;
字典为 feature:value
# Build one dictionary per listing and collect them in a list for pandas.
l = []
for i in all:  # one propertyRow div per listing
    d = {}
    d["Address"] = i.find_all("span", {"class": "propAddressCollapse"})[0].text
    # Some rows lack the second address span; record None instead of crashing
    # (mirrors the guard used in the paginated version of this loop).
    try:
        d["Locality"] = i.find_all("span", {"class": "propAddressCollapse"})[1].text
    except IndexError:
        d["Locality"] = None
    # Price: strip the newlines and spaces padding the h4 text.
    d["Price"] = i.find("h4", {"class": "propPrice"}).text.replace("\n", "").replace(" ", '')
    # For each optional field, find() returns None when the span is absent,
    # so .find("b") raises AttributeError — catch exactly that.
    try:
        d["Beds"] = i.find("span", {"class": "infoBed"}).find("b").text
    except AttributeError:
        d["Beds"] = None
    try:
        d["Area"] = i.find("span", {"class": "infoSqFt"}).find("b").text
    except AttributeError:
        d["Area"] = None
    try:
        d["Full Baths"] = i.find("span", {"class": "infoValueFullBath"}).find("b").text
    except AttributeError:
        d["Full Baths"] = None
    try:
        d["Half Baths"] = i.find("span", {"class": "infoValueHalfBath"}).find("b").text
    except AttributeError:
        d["Half Baths"] = None
    # "Lot Size" lives in featureGroup/featureName span pairs inside columnGroup divs.
    for column_group in i.find_all("div", {"class": "columnGroup"}):
        for feature_group, feature_name in zip(
            column_group.find_all("span", {"class": "featureGroup"}),
            column_group.find_all("span", {"class": "featureName"}),
        ):
            if "Lot Size" in feature_group.text:
                d["Lot Size"] = feature_name.text
    l.append(d)
    # print(i.find_all("span", {"class": "infoBed"}))
将字典转为DataFrame:
to_csv保存为csv格式的文件;
import pandas

# Turn the list of per-listing dicts into a table and export it as CSV
# (the file can be opened directly in Excel).
df = pandas.DataFrame(data=l)
df.to_csv("out_put.csv")
进行翻页抓取
requests.get得到网页html代码;
soup将代码格式化;
find_all得到代码中的指定标签(类似list类型);
page_nr为页码数;
import requests
from bs4 import BeautifulSoup

# Fetch the first page to discover how many result pages exist.
r = requests.get(
    "http://www.pyclass.com/real-estate/rock-springs-wy/LCWYROCKSPRINGS/",
    headers={"User-agent": 'Mozilla/5.0(X11;Ubuntu;Linux x86_64;rv:61.0)Gecko/20100101 Firefox/61.0'},
    timeout=30,  # avoid hanging indefinitely
)
soup = BeautifulSoup(r.content, "html.parser")  # explicit parser silences bs4's warning
all = soup.find_all("div", {"class": "propertyRow"})  # listing rows (NOTE: shadows builtin all())
# The last pagination link's text is the total page count, e.g. "3".
page_nr = soup.find_all("a", {"class": "Page"})[-1].text
翻页抓取:
通过for循环得到每次的url;
每次传入url得到all;
对all进行遍历抓取;
# Scrape every result page: build the page URL, fetch, parse, and extract
# one dict per listing into the list `l`. Requires `page_nr` from the
# previous snippet.
l = []
base_url = "http://www.pyclass.com/real-estate/rock-springs-wy/LCWYROCKSPRINGS/t=0&s="
# The site's listing offset advances by 10 per page: s=0, s=10, s=20, ...
for page in range(0, int(page_nr) * 10, 10):
    print(base_url + str(page) + ".html")
    r = requests.get(
        base_url + str(page) + ".html",
        headers={"User-agent": 'Mozilla/5.0(X11;Ubuntu;Linux x86_64;rv:61.0)Gecko/20100101 Firefox/61.0'},
        timeout=30,  # avoid hanging indefinitely on a slow page
    )
    c = r.content
    soup = BeautifulSoup(c, "html.parser")
    # print(soup.prettify())
    all = soup.find_all("div", {"class": "propertyRow"})  # listing rows (shadows builtin all(); name kept)
    for i in all:
        d = {}
        d["Address"] = i.find_all("span", {"class": "propAddressCollapse"})[0].text
        # Some rows lack the second address span — record None instead of crashing.
        try:
            d["Locality"] = i.find_all("span", {"class": "propAddressCollapse"})[1].text
        except IndexError:
            d["Locality"] = None
        d["Price"] = i.find("h4", {"class": "propPrice"}).text.replace("\n", "").replace(" ", '')
        # Optional fields: find() returns None when absent, so .find("b")
        # raises AttributeError — catch exactly that, not a bare except.
        try:
            d["Beds"] = i.find("span", {"class": "infoBed"}).find("b").text
        except AttributeError:
            d["Beds"] = None
        try:
            d["Area"] = i.find("span", {"class": "infoSqFt"}).find("b").text
        except AttributeError:
            d["Area"] = None
        try:
            d["Full Baths"] = i.find("span", {"class": "infoValueFullBath"}).find("b").text
        except AttributeError:
            d["Full Baths"] = None
        try:
            d["Half Baths"] = i.find("span", {"class": "infoValueHalfBath"}).find("b").text
        except AttributeError:
            d["Half Baths"] = None
        # "Lot Size" lives in featureGroup/featureName span pairs.
        for column_group in i.find_all("div", {"class": "columnGroup"}):
            for feature_group, feature_name in zip(
                column_group.find_all("span", {"class": "featureGroup"}),
                column_group.find_all("span", {"class": "featureName"}),
            ):
                if "Lot Size" in feature_group.text:
                    d["Lot Size"] = feature_name.text
        l.append(d)
保存至csv文件
# Convert the collected list of dicts `l` to a DataFrame and write it to CSV.
# (The original comment mentioned a numpy array, but no numpy is involved —
# pandas.DataFrame consumes the list of dicts directly.)
import pandas

df = pandas.DataFrame(l)
df.to_csv("out_put.csv")  # CSV output, openable in Excel
完整代码
import requests
from bs4 import BeautifulSoup
import pandas

# Complete scraper: discover the page count, iterate all result pages,
# extract one dict per listing, and save everything to CSV.

HEADERS = {"User-agent": 'Mozilla/5.0(X11;Ubuntu;Linux x86_64;rv:61.0)Gecko/20100101 Firefox/61.0'}
base_url = "http://www.pyclass.com/real-estate/rock-springs-wy/LCWYROCKSPRINGS/t=0&s="

# BUG FIX: the original "complete code" referenced page_nr without ever
# defining it, which raises NameError when run standalone. Fetch the first
# page and read the total page count from the last pagination link.
first = requests.get(
    "http://www.pyclass.com/real-estate/rock-springs-wy/LCWYROCKSPRINGS/",
    headers=HEADERS,
    timeout=30,
)
page_nr = BeautifulSoup(first.content, "html.parser").find_all("a", {"class": "Page"})[-1].text

l = []  # one dict per listing
# The site's listing offset advances by 10 per page: s=0, s=10, s=20, ...
for page in range(0, int(page_nr) * 10, 10):
    print(base_url + str(page) + ".html")
    r = requests.get(base_url + str(page) + ".html", headers=HEADERS, timeout=30)
    c = r.content
    soup = BeautifulSoup(c, "html.parser")  # explicit parser silences bs4's warning
    # print(soup.prettify())
    all = soup.find_all("div", {"class": "propertyRow"})  # listing rows (shadows builtin all(); name kept)
    for i in all:
        d = {}
        d["Address"] = i.find_all("span", {"class": "propAddressCollapse"})[0].text
        # Some rows lack the second address span — record None instead of crashing.
        try:
            d["Locality"] = i.find_all("span", {"class": "propAddressCollapse"})[1].text
        except IndexError:
            d["Locality"] = None
        d["Price"] = i.find("h4", {"class": "propPrice"}).text.replace("\n", "").replace(" ", '')
        # Optional fields: find() returns None when absent, so .find("b")
        # raises AttributeError — catch exactly that, not a bare except.
        try:
            d["Beds"] = i.find("span", {"class": "infoBed"}).find("b").text
        except AttributeError:
            d["Beds"] = None
        try:
            d["Area"] = i.find("span", {"class": "infoSqFt"}).find("b").text
        except AttributeError:
            d["Area"] = None
        try:
            d["Full Baths"] = i.find("span", {"class": "infoValueFullBath"}).find("b").text
        except AttributeError:
            d["Full Baths"] = None
        try:
            d["Half Baths"] = i.find("span", {"class": "infoValueHalfBath"}).find("b").text
        except AttributeError:
            d["Half Baths"] = None
        # "Lot Size" lives in featureGroup/featureName span pairs.
        for column_group in i.find_all("div", {"class": "columnGroup"}):
            for feature_group, feature_name in zip(
                column_group.find_all("span", {"class": "featureGroup"}),
                column_group.find_all("span", {"class": "featureName"}),
            ):
                if "Lot Size" in feature_group.text:
                    d["Lot Size"] = feature_name.text
        l.append(d)

# Export the scraped data; the CSV can be opened directly in Excel.
df = pandas.DataFrame(l)
df.to_csv("out_put.csv")

本文介绍了一种使用Python的requests和BeautifulSoup库进行网页爬取的方法,详细展示了如何从房地产网站上抓取房屋信息,包括地址、价格、卧室数量、面积等,并将这些数据整理成DataFrame格式,最后保存为CSV文件。此外,还讲解了如何处理多页数据抓取。
906

被折叠的评论
为什么被折叠?



