Proxies and CSS parsing libraries
-
Getting proxy IPs
```python
import requests


# Fetch a batch of proxy IPs from the API
def get_proxy_ips():
    api = 'http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=3ee6f035175f4b508d8a825da0fb3833&count=5&expiryDate=0&format=2&newLine=3'
    res = requests.get(api)
    # print(res.text)
    if res.status_code == 200:
        # The API returns a JSON error object when IPs are extracted too frequently
        if res.text[0] == '{':
            print('Failed to get proxies: extracting too frequently')
        else:
            # One 'ip:port' per line; drop the trailing empty element
            return res.text.split('\n')[:-1]
    else:
        print('Proxy request failed')


# Fetch the page data through a proxy
def get_net_data():
    url = 'https://movie.douban.com/top250'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36'
    }
    # Proxies
    ips = get_proxy_ips()
    if ips:
        proxies = {
            'http': ips[0],    # 'http': 'ip:port'
            'https': ips[1]
        }
        res = requests.get(url, headers=headers, proxies=proxies, timeout=2)
        if res.status_code == 200:
            print(res.text)
        else:
            print('Request failed')
    else:
        print('Did not get any proxies')


if __name__ == '__main__':
    # Calling get_proxy_ips() separately here would waste one extraction and
    # can trigger the "too frequent" error, so only get_net_data() is called.
    get_net_data()
```
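A note on the `proxies` dict: requests expects each value to be a full proxy URL, so when the API returns bare `ip:port` strings it can help to prepend the scheme. A minimal sketch; the helper name `format_proxies` is mine, not from the code above:

```python
def format_proxies(ip_port: str) -> dict:
    """Build a requests-style proxies dict from a bare 'ip:port' string."""
    # requests routes both http and https traffic through the given proxy URL
    return {
        'http': f'http://{ip_port}',
        'https': f'http://{ip_port}',
    }


# Usage (example address, not a live proxy):
# proxies = format_proxies('117.26.88.160:4573')
```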
-
Optimizing the program that uses proxies
```python
import requests
import time


def get_proxy_ips():
    api = 'http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=3ee6f035175f4b508d8a825da0fb3833&count=5&expiryDate=0&format=2&newLine=3'
    res = requests.get(api)
    if res.status_code == 200:
        if res.text[0] == '{':
            print('Failed to get proxies: extracting too frequently')
        else:
            return res.text.split('\n')[:-1]
    else:
        print('Proxy request failed')


def get_net_data(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36'
    }
    while True:
        ips = get_proxy_ips()
        if not ips:
            print('Failed to get IPs')
            time.sleep(1)
            continue
        # Try each of the five proxies in turn until one succeeds;
        # if all of them fail, the while loop fetches a fresh batch
        for ip in ips:
            proxies = {
                'http': ip,    # 'http': 'ip:port'
                'https': ip
            }
            try:
                res = requests.get(url, headers=headers, proxies=proxies, timeout=2)
                if res.status_code == 200:
                    print(res.text)
                    return res.text
                else:
                    print('Request through this IP failed!')
            except requests.exceptions.ConnectTimeout:
                print('Timed out, trying the next proxy')


if __name__ == '__main__':
    get_net_data('https://lib.swpu.edu.cn/')
```
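In practice a dead proxy can fail in more ways than a connect timeout: requests also raises `ProxyError` and `ReadTimeout`. A hedged variant of the try block that treats all of these as "move on to the next proxy" (this broadens the original, which only catches `ConnectTimeout`):

```python
import requests


def fetch_via_proxy(url, headers, proxies):
    """Return the page text, or None if this proxy should be skipped."""
    try:
        res = requests.get(url, headers=headers, proxies=proxies, timeout=2)
        if res.status_code == 200:
            return res.text
        print('Request through this proxy failed!')
    except (requests.exceptions.ConnectTimeout,
            requests.exceptions.ReadTimeout,
            requests.exceptions.ProxyError) as err:
        # Any of these usually means the proxy is dead or too slow
        print(f'Proxy error, trying the next one: {err}')
    return None
```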
-
Using bs4
```python
import requests
from bs4 import BeautifulSoup


def get_net_data(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.text
    else:
        print('Request failed')


def analysis_data(data: str):
    # 1. Create the parser object
    # BeautifulSoup(html string to parse, 'lxml')
    bs = BeautifulSoup(data, 'lxml')
    # print(bs)

    # 2. Get tags with a CSS selector
    # select(css selector)     -- returns all tags matched by the selector
    # select_one(css selector) -- returns the first tag matched by the selector
    result = bs.select('div>p')
    res = bs.select_one('div>p')
    print(result)
    print(res)

    # 3. Get tag content
    # tag.string     -- the tag's text (None if the tag contains several child tags,
    #                   or a mix of text and child tags); returns a string
    # tag.get_text() -- the tag's text, including the text of any child tags; returns a string
    # tag.contents   -- the tag's text and child tags; returns a list
    p1 = bs.select_one('div>p')
    print('p1-string:', p1.string)      # p1-string: 11111
    print('p1-text:', p1.get_text())    # p1-text: 11111
    print('contents:', p1.contents)     # contents: ['11111']

    p2 = bs.select_one('#p1')
    print(p2)                           # <p class="title" id="p1" name="dromouse"><b>The Dormouse's story</b></p>
    print('p2-string:', p2.string)      # p2-string: The Dormouse's story
    print('p2-text:', p2.get_text())    # p2-text: The Dormouse's story
    print('p2-contents:', p2.contents)  # p2-contents: [<b>The Dormouse's story</b>]

    p3 = bs.select_one('#p2')
    print('p3:', p3.string)             # p3: None
    print('p3-text:', p3.get_text())    # p3-text: 文字1文字2
    print('p3-contents:', p3.contents)  # p3-contents: ['文字1', <b>文字2</b>]

    # 4. Get tag attributes
    # tag.attrs            -- a dict of attribute names and values
    # tag.attrs[attr name] -- the value of one attribute
    img = bs.select_one('img')
    print(img.attrs)            # {'src': 'E:\\liudehua.jpg'}
    print(img.attrs['src'])     # E:\liudehua.jpg
    a = bs.select('a')
    print(a[-1].attrs['href'])  # www.baidu.com

    # 5. Search for child tags inside a given tag
    # tag.select(css selector)     -- all tags matched by the selector inside this tag
    # tag.select_one(css selector) -- the first tag matched by the selector inside this tag
    # print('All p tags:', bs.select('p'))
    div = bs.select_one('div')
    result = div.select('p')
    print('p tags inside div:', result)
    # p tags inside div: [<p>11111</p>, <p>22222</p>, <p id="p2">文字1<b>文字2</b></p>]


if __name__ == '__main__':
    # get_net_data('https://lib.swpu.edu.cn/')
    data = """
    <html>
        <head>
            <title>The Dormouse's story</title>
        </head>
        <body>
            <p id="p1" class="title" name="dromouse"><b>The Dormouse's story</b></p>
            <p class="story">Once upon a time there were three little sisters; and their names were
            <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
            <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
            <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
            and they lived at the bottom of a well.</p>
            <p class="story">...</p>
            <div>
                <p>11111</p>
                <p>22222</p>
                <a href="www.baidu.com">百度一下</a>
                <img src="E:\liudehua.jpg">
                <p id="p2">文字1<b>文字2</b></p>
            </div>
        </body>
    </html>
    """
    analysis_data(data)
```
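Beyond the id and child selectors used above, `select()` accepts most common CSS selectors. A small self-contained sketch with class, attribute, and combined selectors:

```python
from bs4 import BeautifulSoup

html = """
<div>
  <p class="story">Once upon a time ...</p>
  <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
  <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
</div>
"""
bs = BeautifulSoup(html, 'lxml')

# Class selector: every tag with class="sister"
print(bs.select('.sister'))

# Attribute selector: <a> tags whose href starts with http://example.com
print(bs.select('a[href^="http://example.com"]'))

# Combined selector: an id inside a tag context
print(bs.select_one('div a#link2').get_text())   # Lacie
```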
-
CSV file operations
```python
import csv

# ======================= Writing data to a csv file =======================
# 1. Create a writer
# csv.writer(file object) -- writes one row at a time from a list
# csv.DictWriter          -- writes one row at a time from a dict
with open('files/test.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    # 2. Write the data
    writer.writerow(['姓名', '性别', '年龄', '分数'])
    writer.writerows([
        ['张三', '男', 26, 89],
        ['小红', '女', 22, 78],
        ['李四', '男', 21, 90]
    ])

# 2. Providing the data as dicts
with open('files/test2.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, ['name', 'age', 'gender', 'score'])
    # Header row
    # Option 1: write it by hand
    writer.writerow({'name': '姓名', 'age': '年龄', 'gender': '性别', 'score': '分数'})  # '姓名','年龄','性别','分数'
    # Option 2: write the field names
    writer.writeheader()  # 'name', 'age', 'gender', 'score'
    # Write one row
    writer.writerow({'name': 'andy', 'age': 18, 'gender': '男', 'score': 96})
    writer.writerows([
        {'name': 'july', 'age': 18, 'gender': '男', 'score': 96},
        {'name': 'peter', 'age': 22, 'gender': '男', 'score': 88},
        {'name': 'jack', 'age': 19, 'gender': '男', 'score': 90}
    ])

# ======================= Reading a csv file =======================
# Note: any csv file can be read either as lists or as dicts
# 1. One list per row
with open('files/test.csv', 'r', newline='', encoding='utf-8') as f:
    # reader is an iterator whose elements are lists, one per row
    reader = csv.reader(f)
    next(reader)  # skip the header row
    print(list(reader))
    # [['张三', '男', '26', '89'], ['小红', '女', '22', '78'], ['李四', '男', '21', '90']]

# 2. One dict per row
with open('files/test.csv', 'r', newline='', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    print(next(reader))  # {'姓名': '张三', '性别': '男', '年龄': '26', '分数': '89'}
```
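To tie the three topics together, here is a minimal sketch that parses movie titles from the Douban Top 250 page with bs4 and writes them to a csv file. The selector `div.hd` / `span.title` is an assumption about Douban's current markup and may need adjusting; proxies are omitted for brevity.

```python
import csv

import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36'
}
res = requests.get('https://movie.douban.com/top250', headers=headers)
bs = BeautifulSoup(res.text, 'lxml')

# NOTE: these selectors are an assumption about Douban's markup, not verified here
rows = []
for item in bs.select('div.hd'):
    title = item.select_one('span.title').get_text()
    link = item.select_one('a').attrs['href']
    rows.append([title, link])

with open('files/top250.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['标题', '链接'])
    writer.writerows(rows)
```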