Python Applications -- Web Scraping

This article shows how to fetch page content with Python's requests library and parse it with BeautifulSoup, using Lianjia rental listings as the example, and walks through the steps for dealing with the site's anti-scraping checks.


Fetch page content with requests, parse it with BeautifulSoup.

  1. Scraping Lianjia data as an example:
import requests
from bs4 import BeautifulSoup
url = "https://bj.lianjia.com/zufang/"
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
# requests.head(url)
response = requests.get(url)   # note: the headers defined above are not sent here
response

<Response [403]>
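A side note (an assumption, not verified against the site): the headers dict above is defined but never sent, so the server sees the default python-requests User-Agent. Simply passing it to requests.get is sometimes already enough to get a 200; the workaround below switches to urllib instead.

# hedged sketch: retry the same request, this time sending the browser User-Agent defined above
response = requests.get(url, headers=headers)
print(response.status_code)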

  2. The request above is blocked by the site's anti-scraping check; the workaround is as follows:
    a. Fetch the page
from urllib.request import urlopen
from urllib import request
from bs4 import BeautifulSoup
# approach 1 --- urlopen(req).read().decode('utf-8')
url = "https://bj.lianjia.com/zufang/"
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
# build a Request that carries the browser User-Agent so the site serves the page
req = request.Request(url, headers=headers)
#print(urlopen(req).read().decode('utf-8'))
# parse the response into a soup object
soup = BeautifulSoup(urlopen(req),'lxml')
soup
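To confirm the fetch worked, a quick sanity check (a sketch) is to count the listing anchors on the page, reusing the class name that get_links() relies on later:

# sketch: the listing cards should yield a non-empty list of detail-page anchors
items = soup.find_all('a', attrs={'class': 'content__list--item--aside'})
print(len(items))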

b. Parse the pages:

from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib import error
def get_page(url):
    '''
    desc -- fetch the page at url and return a soup object (0 on failure)
    '''
#     headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
#     req = request.Request(url, headers=headers)
    soup = 0  # 0 is used as the failure sentinel throughout
    try:
        req = urlopen(url)
        if req.getcode() == 200:
            print(req.getcode())
            soup = BeautifulSoup(req, 'lxml')
    #     soup = BeautifulSoup(urlopen(req).read().decode('utf-8'),'lxml')
    except error.URLError as e:
        if hasattr(e, 'code'):
            print("HTTPError")
            print(e.code)
        elif hasattr(e, 'reason'):
            print("URLError")
            print(e.reason)
    except ValueError:
        return 0
    return soup
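A quick check of get_page (a sketch; remember that 0 is the failure sentinel):

# sketch: fetch the listing page and print its title if a soup came back
soup = get_page("https://bj.lianjia.com/zufang/")
if soup != 0:
    print(soup.title.get_text())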

# approach 2 --- soup = BeautifulSoup(urlopen(url),'lxml')
def get_links(link_url):
    '''
    desc -- collect the detail-page links of every rental listed on a listing page
    '''
    soup = get_page(link_url)
    if soup == 0:
        return 0
    # each listing card exposes its detail link on this anchor
    links_div = soup.find_all('a', attrs={'class': 'content__list--item--aside'})
    links = ['https://bj.lianjia.com' + div.get('href') for div in links_div]
    return links
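Usage sketch for get_links (the page number here is arbitrary):

# sketch: collect the detail links from one listing page
links = get_links("https://bj.lianjia.com/zufang/pg2/")
if links != 0:
    print(len(links))
    print(links[:3])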




def get_house_info(house_info_url):
    '''
    desc -- scrape one rental detail page and return its fields as a dict
    '''
    soup = get_page(house_info_url)
    if soup == 0:
        return
    # price = soup.find('p',attrs={'class':'content__aside--title'}).find('span').get_text()
    price = soup.find('p', attrs={'class': 'content__aside--title'}).get_text()

    # headline attributes (layout, area, direction, ...)
    house_info = soup.find('p', attrs={'class': 'content__article__table'}).find_all('span')
    # basic house information
    house_info2 = soup.find('div', attrs={'class': 'content__article__info'}).find_all('li')
    # facilities
    # house_info3 = soup.find('div',attrs={'class':'content__article__info2'}).find_all('li')
    # listing description
    house_info4 = soup.find('div', attrs={'class': 'content__article__info3'}
                            ).find('p', attrs={'data-el': 'houseComment'})
    if house_info4 is None:
        subway = None
        community = None
    else:
        house_info4 = house_info4.get_text()
        subway = house_info4.strip().split('\n')[0].split(" ")[1]
        community = house_info4.strip().split('\n')[-1].split(" ")[-1]
    # address and transportation
    house_info5 = soup.find('div', attrs={'class': 'content__article__info4'}
                            ).find_all('li')
    location_info = soup.find('div', attrs={'class': 'bottom__list'}
                              ).find('span').get_text().strip().split('\n')
    location = ''.join([x.strip(' ') for x in location_info])
    area = house_info[2].get_text()
    floor = house_info2[7].get_text()[3:]
    layout = house_info[1].get_text() + ' ' + house_info[0].get_text()
    create_time = house_info2[1].get_text()[3:]
    direction = house_info[-1].get_text()

    # location = [(x.get_text().strip().strip('\n')) for x in house_info5]
    agent = soup.find('div', attrs={'class': 'desc'}
                      ).find('a', attrs={'class': 'name'}).get_text()
    agent_phone = soup.find('div', attrs={'class': 'desc'}
                            ).find('div', attrs={'class': 'phone'}).get_text()
    info = {
        '价格': price,
        '户型': layout,
        '面积': area,
        '楼层': floor,
        '朝向': direction,
        '地铁': subway,
        '小区': community,
        '位置': location,
        '发布日期': create_time,
        '经纪人名称': agent,
        '经纪人电话': agent_phone
    }
    return info
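Chaining the two helpers gives a single-listing test before launching the full crawl (a sketch; the first returned link is used only as an illustration):

# sketch: scrape one detail page end to end
links = get_links("https://bj.lianjia.com/zufang/")
if links != 0 and len(links) > 0:
    info = get_house_info(links[0])
    print(info)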

c. Connect to the database:
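The code in this step assumes a local MySQL instance with a database named Examination that already contains a `house` table matching the columns used in insert() below. A minimal schema sketch (the column types and lengths are assumptions, adjust as needed):

# sketch: create the table that insert() expects
import pymysql

ddl = """
CREATE TABLE IF NOT EXISTS `house` (
    `id` INT AUTO_INCREMENT PRIMARY KEY,
    `price` VARCHAR(64),
    `layout` VARCHAR(64),
    `area` VARCHAR(32),
    `floor` VARCHAR(32),
    `direction` VARCHAR(32),
    `subway` VARCHAR(128),
    `community` VARCHAR(128),
    `location` VARCHAR(255),
    `create_time` VARCHAR(32),
    `agent` VARCHAR(64),
    `agent_phone` VARCHAR(32)
) DEFAULT CHARSET=utf8
"""
conn = pymysql.connect(host="127.0.0.1", database="Examination",
                       user="root", password="root", charset="utf8")
with conn.cursor() as cursor:
    cursor.execute(ddl)
conn.commit()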

import pymysql
import time
DATABASE = {
    "host" : "127.0.0.1",
    "database" : "Examination",
    "user" : "root",
    "password" : "root",
    "charset" : "utf8"# 防止乱码
}
def get_db(setting):
    return pymysql.connect(**setting)

def insert(db,house_info):
    # build 11 quoted placeholders: '{}','{}', ... ,'{}'
    values = "'{}',"*10+"'{}'"
    sql_values = values.format(house_info['价格'],house_info['户型'],house_info['面积'],
                              house_info['楼层'],house_info['朝向'],house_info['地铁'],
                              house_info['小区'],house_info['位置'],house_info['发布日期'],
                              house_info['经纪人名称'],house_info['经纪人电话'])
    sql = """
        insert into `house` (`price`,`layout`,`area`,`floor`,`direction`,
        `subway`,`community`,`location`,`create_time`,`agent`,`agent_phone`) 
        values({})
    """.format(sql_values)
    print(sql,'----sql')
    cursor = db.cursor()
    re = cursor.execute(sql)
    print(re,'-------------result----------')
    db.commit()
    

db = get_db(DATABASE)
for x in range(2, 101):
    url = "https://bj.lianjia.com/zufang/" + "pg" + str(x) + "/"
    links = get_links(url)
    if links == 0:  # the listing page could not be fetched, skip it
        continue
    for link in links:
        time.sleep(2)  # throttle requests
        print(link, '====link')
        house = get_house_info(link)
        print(house, end='\r')
        if house is None:
            continue
        insert(db, house)
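One caveat about insert(): building the SQL by string formatting breaks as soon as a field contains a quote, and is unsafe in general. A hedged alternative sketch using pymysql's parameter binding (same table and columns; the function name is made up):

def insert_safe(db, house_info):
    # sketch: same insert as above, but let pymysql escape the values
    sql = """
        insert into `house` (`price`,`layout`,`area`,`floor`,`direction`,
        `subway`,`community`,`location`,`create_time`,`agent`,`agent_phone`)
        values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
    """
    params = (house_info['价格'], house_info['户型'], house_info['面积'],
              house_info['楼层'], house_info['朝向'], house_info['地铁'],
              house_info['小区'], house_info['位置'], house_info['发布日期'],
              house_info['经纪人名称'], house_info['经纪人电话'])
    with db.cursor() as cursor:
        cursor.execute(sql, params)
    db.commit()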