import requests
from bs4 import BeautifulSoup  # the 'lxml' parser used below requires the lxml package to be installed
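# Scrape the Beijing Lianjia second-hand listing page (ershoufang), follow each
# listing's detail link, and append the parsed fields to Lian_info.txt.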
header = {
    "Host": "bj.lianjia.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400",
}
def get_url(url):
    res = requests.get(url, headers=header)
    if res.status_code == 200:
        return res.text
    else:
        print(res.status_code, url)
        return None
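# Collect the detail-page URL of every listing on the list page.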
def get_all_url(html):
    soup = BeautifulSoup(html, 'lxml')
    titles = soup.select('div.info.clear .title a')
    all_url = []
    for i in titles:
        href = i.get('href')
        all_url.append(href)
    return all_url
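# Parse one listing detail page into a single comma/newline-delimited record.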
def parser_info(info_html):
    soup = BeautifulSoup(info_html, 'lxml')
    title = soup.select('.title .main')[0].text        # listing title
    total = soup.select('span.total')                   # total price (万)
    pv = soup.select('.unitPriceValue')                 # unit price
    name = soup.select('a.info')                        # community name
    base = soup.select('.base .content')                # basic-attributes panel
    l_trans = soup.select('.transaction .content li')   # transaction-panel rows
    data = ', '.join([
        title,
        total[0].text + "万",
        pv[0].text,
        name[0].text,
        base[0].text.strip() + '\n',
        l_trans[0].text.strip() + '\n',
        l_trans[1].text.strip() + '\n',
        l_trans[2].text.strip() + '\n',
        l_trans[3].text.strip() + '\n',
        l_trans[4].text.strip() + '\n',
        l_trans[5].text.strip() + '\n',
        l_trans[6].text.split()[0] + '\n',
        l_trans[6].text.split()[1] + '\n',
        l_trans[7].text.strip() + '\n' + '\n',
    ])
    return data
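# Append one record to Lian_info.txt (GBK-encoded).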
def save_f(data):
    with open('Lian_info.txt', 'a', encoding='GBK') as f:
        f.write(data)
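# Crawl the list page, then fetch, parse, and save every listing it links to.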
def main(url):
    html = get_url(url)
    if html is None:
        return
    all_urls = get_all_url(html)
    for info_url in all_urls:
        info_html = get_url(info_url)
        if info_html is None:
            continue
        data = parser_info(info_html)
        save_f(data)

if __name__ == '__main__':
    url = 'https://bj.lianjia.com/ershoufang/'
    main(url)