Day 3 - Proxies and a CSS-selector parsing library

This post shows how to scrape Douban movie rankings through proxy IPs and how to parse the HTML with the BeautifulSoup library: get_proxy_ips fetches a batch of proxies, and get_net_data cycles through them while requesting and parsing the pages.


01 - Getting proxy IPs

import requests

def get_proxy_ips():
    api = 'http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=3ee6f035175f4b508d8a825da0fb3833&count=5&expiryDate=0&format=2&newLine=3'
    response = requests.get(api)
    if response.status_code == 200:
        # the API answers with a JSON error object when we pull too often,
        # otherwise with a plain-text proxy list
        if response.text[0] == '{':
            print('Failed to get proxies: extracting too frequently')
        else:
            # the plain text is one 'ip:port' per line with a trailing newline,
            # so drop the final empty element
            return response.text.split('\n')[:-1]
    else:
        print('Request failed!')


def get_net_data():
    url = 'https://movie.douban.com/top250'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36 Edg/90.0.818.66'
    }
    ips = get_proxy_ips()
    if ips:
        proxies = {
            'http': ips[0],   # format: 'ip:port'
            'https': ips[0]
        }
        response = requests.get(url, headers=headers, proxies=proxies)

        print(response.text)
    else:
        print('Failed to get any proxies')


if __name__ == '__main__':
    # get_proxy_ips()
    get_net_data()
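
The [:-1] slice in get_proxy_ips only works because the API ends its plain-text response with a newline, which split('\n') turns into a trailing empty string. A minimal sketch of that parsing step, with made-up addresses:

# Sketch of the proxy-list parsing step; the sample addresses are made up.
sample = '1.2.3.4:8080\n5.6.7.8:3128\n'
print(sample.split('\n'))        # ['1.2.3.4:8080', '5.6.7.8:3128', '']
print(sample.split('\n')[:-1])   # ['1.2.3.4:8080', '5.6.7.8:3128']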

02 - Optimizing the crawler with proxies

import requests
import time
def get_proxy_ips():
    api = 'http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=3ee6f035175f4b508d8a825da0fb3833&count=4&expiryDate=0&format=2&newLine=3'
    response = requests.get(api)
    if response.status_code == 200:
        if response.text[0] == '{':
            print('Failed to get proxies: extracting too frequently')
        else:
            return response.text.split('\n')[:-1]
    else:
        print('Request failed!')


def get_net_data():
    url = 'https://movie.douban.com/top250'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36 Edg/90.0.818.66'
    }
    while True:
        # get a batch of proxy IPs (the count is set in the API URL)
        ips = get_proxy_ips()
        # if none were returned, wait a second and try again
        if not ips:
            time.sleep(1)
            continue
        # alternatively, turn the list into an iterator and pull IPs with next()
        # ips = iter(ips)

        for ip in ips:
        # ip = next(ips)
            proxies = {
                'http': ip,   # format: 'ip:port'
                'https': ip
            }
            try:
                response = requests.get(url, headers=headers, proxies=proxies)
                if response.status_code == 200:
                    # print(response.text)
                    return response.text
                else:
                    print('Data request failed!')
            except requests.exceptions.ProxyError:
                print('Proxy error, trying the next one')


if __name__ == '__main__':
    # get_proxy_ips()
    result = get_net_data()
    print(result)
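
One gap in the retry loop above: a proxy that accepts the connection but never answers will hang the request, and requests.exceptions.ProxyError alone does not catch timeouts. A minimal sketch of a more defensive request step, reusing the url, headers and proxies names from the loop:

# Sketch: same request step with a timeout; RequestException is the base
# class covering ProxyError as well as connect/read timeouts.
try:
    response = requests.get(url, headers=headers, proxies=proxies, timeout=5)
    if response.status_code == 200:
        return response.text
except requests.exceptions.RequestException as e:
    print('Request via this proxy failed:', e)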

03 - Using bs4

"""
Time:2021/5/26  11:28
Author:Spectre
"""
import requests
from bs4 import BeautifulSoup

def get_net_data(url):
    # fetch the page source
    # headers = {
    #     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36 Edg/90.0.818.66'
    # }
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    else:
        print(response)


def analysis_data(data: str):
    # 1. create the parser object
    # BeautifulSoup(html string to parse, parser name)
    bs = BeautifulSoup(data, 'lxml')  # represents the whole page
    print(bs)
    # 2. get tags with CSS selectors
    # select(css selector)      - get all tags matched by the selector
    # select_one(css selector)  - get the first tag matched by the selector
    result = bs.select('#p1')
    print(result, len(result), type(result[0]))
    # [<p class="title" id="p1" name="dromouse"><b>The Dormouse's story</b></p>] 1 <class 'bs4.element.Tag'>
    result1 = bs.select('p')
    print(result1, len(result1), type(result1[0]))
    # [<p class="title" id="p1" name="dromouse"><b>The Dormouse's story</b></p>, ...] 5 <class 'bs4.element.Tag'>

    result2 = bs.select_one('p')
    print(result2, len(result2))  # len() of a Tag counts its direct children

    # 3. get tag content
    # 1) tag.string     - the tag's text; None if it has more than one child. Returns a string.
    # 2) tag.get_text() - the tag's text, including text from nested tags. Returns a string.
    # 3) tag.contents   - the tag's children (text and tags) as a list.


    p1 = bs.select_one('#p1')
    print(p1.string)   # The Dormouse's story

    p2 = bs.select_one('div>p')
    print(p2.get_text())   # 这是一个段部落

    p3 = bs.select_one('#p3')
    print(p3.contents)  # ['这是', <b>一</b>, '个段部落']


    # 4. get tag attributes
    img = bs.select_one('img')
    print(img.attrs)  # {'src': './01.jpg'}   - a dict
    print(img.attrs['src'])  # ./01.jpg

    a = bs.select('a')  # select returns a list
    # print(a)
    print(a[0].attrs['href'])  # http://example.com/elsie

    # 5. get child tags within a given tag
    # tag.select(css selector)     - all tags matched by the selector inside this tag
    # tag.select_one(css selector) - the first tag matched by the selector inside this tag
    print('All p tags:', len(bs.select('p')))  # All p tags: 5

    div = bs.select_one('div')
    result = div.select('p')
    print('p tags inside the div:', len(result))   # p tags inside the div: 2


if __name__ == '__main__':
    # data = get_net_data('')
    data = """
    <html>
        <head>
            <title>The Dormouse's story</title>
        </head>
        <body>
            <p id='p1' class="title" name="dromouse"><b>The Dormouse's story</b></p>
            <img src='./01.jpg' />
            <p class="story">Once upon a time there were three little sisters; and their names were
            <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
            <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
            <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
            and they lived at the bottom of a well.</p>
            <p class="story">...</p>
            <div>
                <p id='p3'>这是<b>一</b>个段部落</p>
                <p>这是一个段部落</p>
            </div>
                
        </body>
    </html>
        
    """
    # missing tags like </body></html> get auto-completed by the parser
    if data:
        analysis_data(data)
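
The example above sticks to tag and id selectors. As a supplementary sketch, the same document also answers class, descendant and attribute selectors, and a Tag can be indexed directly as a shorthand for attrs (these lines assume the bs object inside analysis_data):

# Supplementary sketch: more CSS selector forms against the same bs object.
print(bs.select('p.story'))      # tag + class: the two "story" paragraphs
print(bs.select_one('a#link2'))  # tag + id
print(bs.select('div p'))        # descendant selector: both p tags in the div
print(bs.select('a[href]'))      # attribute selector: all a tags with an href
img = bs.select_one('img')
print(img['src'])                # ./01.jpg - indexing a Tag reads its attrs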

04 - Beike (ke.com) new-home listings

"""
Time:2021/5/26  15:17
Author:Spectre
"""


import requests
from bs4 import BeautifulSoup

def get_net_data(url):
    response = requests.get(url)
    # result = response.text
    # print(result)
    if response.status_code == 200:
        return response.text
    else:
        print(response)


def analysis_data(data: str):
    bs = BeautifulSoup(data, 'lxml')  # represents the whole page

    # the li tags, one per property listing
    house_li = bs.select('.resblock-list-wrapper>li')
    # print(house_li)
    all_house = []
    for li in house_li:
        house = {}

        img_src = li.select_one('.lj-lazy').attrs['data-original']
        house['img'] = img_src
        name = li.select_one('.name').get_text()
        house['name'] = name
        # strip newlines and non-breaking spaces from the price text
        price = li.select_one('.main-price').get_text().replace('\n', '').replace('\xa0', '')
        house['price'] = price
        location = li.select_one('.resblock-location').get_text().strip()  # strip leading/trailing whitespace
        house['location'] = location

        all_house.append(house)
        # print(location)

    print(all_house)


if __name__ == '__main__':
    data = get_net_data('https://cd.fang.ke.com/loupan/pg1/')
    if data:
        analysis_data(data)
    else:
        print('error')
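
The page number is encoded in the URL path (pg1), so crawling more listing pages is a small loop. A sketch, assuming the pgN pattern holds for later pages:

# Sketch: crawl the first three listing pages; pause between requests to be polite.
import time

for page in range(1, 4):
    page_data = get_net_data(f'https://cd.fang.ke.com/loupan/pg{page}/')
    if page_data:
        analysis_data(page_data)
    time.sleep(1)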

05 - CSV file operations

"""
Time:2021/5/26  16:58
Author:Spectre
"""

import csv

# ============ Writing data to a csv file ============
# 1) provide the rows as lists
# 1. create a writer
# csv.writer()     - writes one row at a time from a list
# csv.DictWriter() - writes one row at a time from a dict

# newline='' prevents blank lines between rows
with open('files/test.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)

    # 2. write the data
    writer.writerow(['姓名', '性别', '年龄', '分数'])
    writer.writerows([
        ['张三', '男', 28, 98],
        ['小明', '男', 28, 98],
        ['小花', '女', 18, 88],
        ['小张', '男', 8, 98],
    ])


# 2) provide the rows as dicts
with open('files/test02.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, ['name', 'gender', 'age', 'score'])
    # header row
    writer.writerow({'name': '姓名', 'gender': '性别', 'age': '年龄', 'score': '分数'})
    # writer.writeheader()  # writes the dict keys themselves as the header
    # write a single row
    # writer.writerow({'name': '张三', 'gender': '男', 'age': '18', 'score': '98'})
    # write several rows at once
    writer.writerows([
        {'name': '张三', 'gender': '男', 'age': '18', 'score': '98'},
        {'name': 'ded', 'gender': '男', 'age': '18', 'score': '98'},
        {'name': 'rfe', 'gender': '男', 'age': '18', 'score': '98'},
        {'name': 'frrw', 'gender': '男', 'age': '18', 'score': '98'}
    ])



# ============ Reading a csv file ============
# Note: any csv file can be read either row-as-list or row-as-dict
# 1. one list per row
# with open('files/test02.csv', 'r', newline='', encoding='utf-8') as f:
#     # reader is an iterator whose elements are one list per row
#     reader = csv.reader(f)
#     # print(next(reader))  # ['姓名', '性别', '年龄', '分数']
#     next(reader)  # skip the header row
#     print(list(reader))

# 2. one dict per row
with open('files/test.csv', 'r', newline='', encoding='utf-8') as f:
    reader = csv.DictReader(f)  # uses the first row as the dict keys
    for x in reader:
        print(dict(x))
    print(list(reader))  # empty: the iterator was already exhausted by the loop

Homework - Scraping the Douban movie chart

"""
Time:2021/5/26  15:11
Author:Spectre
"""
import requests
from bs4 import BeautifulSoup
import csv

def get_net_data(url):
    # fetch the page source
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36 Edg/90.0.818.66'
    }
    response = requests.get(url, headers=headers)
    # result = response.text
    # print(result)
    if response.status_code == 200:
        return response.text
    else:
        print(response)


def analysis_data(data: str):
    bs = BeautifulSoup(data, 'lxml')  # represents the whole page
    movie_li = bs.select('.indent .item')
    # print(movie_li)
    all_movie = []
    for li in movie_li:
        movie = {}
        img_src = li.select_one('img').attrs['src']
        movie['img'] = img_src
        name = li.select_one('.pl2>a').get_text().strip().replace(' ', '').replace('\n', '')
        movie['name'] = name
        a = li.select_one('a').attrs['href']
        movie['link'] = a
        intro = li.select_one('.pl').get_text()
        movie['intro'] = intro
        star = li.select_one('.rating_nums').get_text()
        movie['star'] = star
        comments = li.select_one('.star>.pl').get_text()[1:-1]  # drop the surrounding parentheses
        movie['comments'] = comments

        all_movie.append(movie)
    return all_movie


if __name__ == '__main__':
    data = get_net_data('https://movie.douban.com/chart')
    if data:
        result = analysis_data(data)
    else:
        print('error')
        result = []  # avoid a NameError below when the request fails

    with open('files/doubanmovie.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, ['img', 'name', 'link', 'intro', 'star', 'comments'])
        # header row
        writer.writerow({'img': '电影封面', 'name': '电影名称', 'link': '电影链接', 'intro': '上映时间/主演', 'star': '评分', 'comments': '评论数'})

        # write one row per movie
        for x in result:
            writer.writerow(x)

    with open('files/doubanmovie.csv', 'r', newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for x in reader:
            print(dict(x))