Python -- Web Scraping
Fetch page content with requests, parse it with BeautifulSoup
- Example: scraping Lianjia rental listings:
import requests
from bs4 import BeautifulSoup
url = "https://bj.lianjia.com/zufang/"
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
# requests.head(url)
response = requests.get(url)
response
<Response [403]>
- The 403 above is the site's anti-scraping check; workarounds follow:
a. Fetch the page
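(Side note: in the snippet above, the headers dict was built but never passed to requests.get, so the request went out without a browser User-Agent. Passing it along usually clears the 403 without switching libraries; a minimal sketch:)
response = requests.get(url, headers=headers)
response.status_code  # typically 200 once a browser User-Agent is sent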
from urllib.request import urlopen
from urllib import request
# 1 --- urlopen(req).read().decode('utf-8')
url = "https://bj.lianjia.com/zufang/"
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
req = request.Request(url, headers=headers)
#print(urlopen(req).read().decode('utf-8'))
# parse into a soup object
soup = BeautifulSoup(urlopen(req),'lxml')
soup
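A quick sanity check that the fetch and parse worked, e.g. printing the page title:
print(soup.title.get_text())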
b. Parse the page:
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib import error
def get_page(url):
    '''
    desc -- fetch the page at url, return a soup object (None on failure)
    '''
    # a browser User-Agent can be sent via request.Request(url, headers=headers)
    # if plain urlopen gets blocked
    try:
        req = urlopen(url)
        if req.getcode() == 200:
            soup = BeautifulSoup(req, 'lxml')
        else:
            soup = None
    except error.URLError as e:
        if hasattr(e, 'code'):
            print("HTTPError", e.code)
        elif hasattr(e, 'reason'):
            print("URLError", e.reason)
        return None
    except ValueError:
        return None
    return soup
# 2 --- soup = BeautifulSoup(urlopen(url),'lxml')
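A quick usage check for get_page against the listing URL used above:
soup = get_page('https://bj.lianjia.com/zufang/')
print(soup is not None)  # True when the fetch returned HTTP 200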
def get_links(link_url):
    '''
    desc -- collect every rental detail-page link from one listing page
    '''
    soup = get_page(link_url)
    if soup is None:
        return []  # empty list keeps the caller's for-loop safe
    links_div = soup.find_all('a', attrs={'class': 'content__list--item--aside'})
    links = ['https://bj.lianjia.com' + div.get('href') for div in links_div]
    return links
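For example, pulling the links off the first listing page (the CSS class is taken from Lianjia's markup at the time of writing and may change):
links = get_links('https://bj.lianjia.com/zufang/pg1/')
print(len(links), links[:2])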
def get_house_info(house_info_url):
    '''
    desc -- scrape one detail page into a dict (None on failure)
    '''
    soup = get_page(house_info_url)
    if soup is None:
        return None
    # price = soup.find('p',attrs={'class':'content__aside--title'}).find('span').get_text()
    price = soup.find('p', attrs={'class': 'content__aside--title'}).get_text()
    house_info = soup.find('p', attrs={'class': 'content__article__table'}).find_all('span')
    # basic house information
    house_info2 = soup.find('div', attrs={'class': 'content__article__info'}).find_all('li')
    # amenities
    # house_info3 = soup.find('div',attrs={'class':'content__article__info2'}).find_all('li')
    # listing description
    house_info4 = soup.find('div', attrs={'class': 'content__article__info3'}
                            ).find('p', attrs={'data-el': 'houseComment'})
    if house_info4 is None:
        subway = None
        community = None
    else:
        house_info4 = house_info4.get_text()
        subway = house_info4.strip().split('\n')[0].split(" ")[1]
        community = house_info4.strip().split('\n')[-1].split(" ")[-1]
    # address and transit
    house_info5 = soup.find('div', attrs={'class': 'content__article__info4'}
                            ).find_all('li')
    location_info = soup.find('div', attrs={'class': 'bottom__list'}
                              ).find('span').get_text().strip().split('\n')
    location = ''.join([x.strip(' ') for x in location_info])
    area = house_info[2].get_text()
    floor = house_info2[7].get_text()[3:]
    layout = house_info[1].get_text() + ' ' + house_info[0].get_text()
    create_time = house_info2[1].get_text()[3:]
    direction = house_info[-1].get_text()
    # location = [(x.get_text().strip().strip('\n')) for x in house_info5]
    agent = soup.find('div', attrs={'class': 'desc'}
                      ).find('a', attrs={'class': 'name'}).get_text()
    agent_phone = soup.find('div', attrs={'class': 'desc'}
                            ).find('div', attrs={'class': 'phone'}).get_text()
    info = {
        '价格': price,
        '户型': layout,
        '面积': area,
        '楼层': floor,
        '朝向': direction,
        '地铁': subway,
        '小区': community,
        '位置': location,
        '发布日期': create_time,
        '经纪人名称': agent,
        '经纪人电话': agent_phone
    }
    return info
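Trying it on a single detail page (reusing the links list from get_links above; note the positional indexing into house_info/house_info2 is tied to Lianjia's current page layout and will break if it changes):
house = get_house_info(links[0])
print(house)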
c. Connect to the database:
import pymysql
import time
DATABASE = {
    "host": "127.0.0.1",
    "database": "Examination",
    "user": "root",
    "password": "root",
    "charset": "utf8"  # avoid garbled Chinese text
}
def get_db(setting):
    return pymysql.connect(**setting)
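The insert below assumes a `house` table already exists. A one-time setup sketch with an assumed schema (the column list mirrors the insert statement; all values are stored as strings, adjust types and lengths as needed):
def create_table(db):
    # hypothetical helper -- schema is an assumption, not taken from the source
    sql = """
        create table if not exists `house` (
            `id` int auto_increment primary key,
            `price` varchar(32), `layout` varchar(32), `area` varchar(32),
            `floor` varchar(32), `direction` varchar(32), `subway` varchar(64),
            `community` varchar(64), `location` varchar(128), `create_time` varchar(32),
            `agent` varchar(32), `agent_phone` varchar(32)
        ) default charset=utf8
    """
    cursor = db.cursor()
    cursor.execute(sql)
    db.commit()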
def insert(db, house_info):
    # parameterized query: the driver handles quoting/escaping,
    # and None values (e.g. missing subway) become SQL NULL
    sql = """
        insert into `house` (`price`,`layout`,`area`,`floor`,`direction`,
        `subway`,`community`,`location`,`create_time`,`agent`,`agent_phone`)
        values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
    """
    values = (house_info['价格'], house_info['户型'], house_info['面积'],
              house_info['楼层'], house_info['朝向'], house_info['地铁'],
              house_info['小区'], house_info['位置'], house_info['发布日期'],
              house_info['经纪人名称'], house_info['经纪人电话'])
    cursor = db.cursor()
    result = cursor.execute(sql, values)
    print(result, '-------------result----------')
    db.commit()
db = get_db(DATABASE)
for x in range(2, 101):  # listing pages 2..100
    url = "https://bj.lianjia.com/zufang/" + "pg" + str(x) + "/"
    links = get_links(url)
    for link in links:
        time.sleep(2)  # throttle requests to be polite to the server
        print(link, '====link')
        house = get_house_info(link)
        print(house, end='\r')
        if house is None:
            continue
        insert(db, house)
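Once the crawl finishes, close the connection (added here for completeness):
db.close()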