笔记 2021-05-26:代理和 CSS 解析库

代理和css解析库

  1. 获取代理ip

    import requests
    # 获取代理
    def get_proxy_ips():
        """Request a batch of proxy addresses from the proxy provider's API.

        Returns:
            A list with one stripped, non-empty line of the response body per
            entry (presumably 'ip:port' strings -- the exact format is decided
            by the API, confirm against its documentation). Returns None when
            the request raises, the status is not 200, or the body starts
            with '{', which this code treats as an error payload.
        """
        # NOTE(review): the appKey is embedded in the URL; consider loading it
        # from configuration instead of keeping it in source.
        api = 'http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=3ee6f035175f4b508d8a825da0fb3833&count=5&expiryDate=0&format=2&newLine=3'
        try:
            # Without a timeout a stalled server would block this call forever.
            res = requests.get(api, timeout=5)
        except requests.exceptions.RequestException:
            print('代理获取失败')
            return None

        if res.status_code != 200:
            print('代理获取失败')
            return None

        body = res.text.strip()
        # startswith() is safe on an empty body, unlike indexing body[0].
        if body.startswith('{'):
            print('获取代理失败,提取频繁')
            return None

        # splitlines() copes with '\n' and '\r\n' alike and keeps the final
        # entry even when the body has no trailing newline.
        return [line.strip() for line in body.splitlines() if line.strip()]
    
    # 解析数据
    def get_net_data():
        """Fetch the Douban Top 250 page through a proxy and print its HTML.

        Prints the response body on HTTP 200, '请求失败' when the status is
        anything else or the request raises, and '没有成功取到代理' when
        get_proxy_ips() yields no usable address. Returns None in every case.
        """
        url = 'https://movie.douban.com/top250'
        headers = {
            'User-Agent': 'Mozilla/5.0(Windows NT 6.1;Win64;x64) AppleWebKit/537.36(KHTML, likeGecko) Chrome/81.0.4044.129Safari/537.36'
        }

        ips = get_proxy_ips()
        if not ips:
            print('没有成功取到代理')
            return

        # Proxy values have the form 'ip:port'. Use a second address for https
        # when one is available; otherwise reuse the first so that a
        # single-entry list does not raise IndexError.
        proxies = {
            'http': ips[0],
            'https': ips[1] if len(ips) > 1 else ips[0],
        }
        try:
            res = requests.get(url, headers=headers, proxies=proxies, timeout=2)
        except requests.exceptions.RequestException:
            # With a 2-second timeout an unresponsive proxy raises instead of
            # returning a status code; report it like any other failure.
            print('请求失败')
            return

        if res.status_code == 200:
            print(res.text)
        else:
            print('请求失败')
    
    
    if __name__ == '__main__':
        # get_net_data() requests its own proxies, so calling get_proxy_ips()
        # separately here would only spend an extra API request whose result
        # is thrown away.
        get_net_data()
    
  2. 使用代理的程序优化

    import requests
    import time
    def get_proxy_ips():
        """Request a batch of proxy addresses from the proxy provider's API.

        Returns:
            A list with one stripped, non-empty line of the response body per
            entry (presumably 'ip:port' strings -- the exact format is decided
            by the API, confirm against its documentation). Returns None when
            the request raises, the status is not 200, or the body starts
            with '{', which this code treats as an error payload.
        """
        # NOTE(review): the appKey is embedded in the URL; consider loading it
        # from configuration instead of keeping it in source.
        api = 'http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=3ee6f035175f4b508d8a825da0fb3833&count=5&expiryDate=0&format=2&newLine=3'
        try:
            # Without a timeout a stalled server would block this call forever.
            res = requests.get(api, timeout=5)
        except requests.exceptions.RequestException:
            print('代理获取失败')
            return None

        if res.status_code != 200:
            print('代理获取失败')
            return None

        body = res.text.strip()
        # startswith() is safe on an empty body, unlike indexing body[0].
        if body.startswith('{'):
            print('获取代理失败,提取频繁')
            return None

        # splitlines() copes with '\n' and '\r\n' alike and keeps the final
        # entry even when the body has no trailing newline.
        return [line.strip() for line in body.splitlines() if line.strip()]
    
    
    def get_net_data(url):
        """Fetch *url* through a proxy, retrying until one proxy succeeds.

        Asks get_proxy_ips() for a batch of proxies and tries each one in
        turn. When the whole batch fails, a new batch is requested.

        Args:
            url: Address of the page to download.

        Returns:
            The body text of the first response with status 200.

        Note:
            There is no retry limit: the loop runs until a request succeeds.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0(Windows NT 6.1;Win64;x64) AppleWebKit/537.36(KHTML, likeGecko) Chrome/81.0.4044.129Safari/537.36'
        }
        while True:
            ips = get_proxy_ips()
            if not ips:
                print('ip获取失败')
                # Pause briefly before asking the proxy API again.
                time.sleep(1)
                continue

            # Try every address of the batch; a proxy value is 'ip:port'.
            for ip in ips:
                proxies = {
                    'http': ip,
                    'https': ip,
                }
                try:
                    res = requests.get(url, headers=headers, proxies=proxies, timeout=2)
                except requests.exceptions.RequestException:
                    # Catch the base class: a bad proxy can fail with a read
                    # timeout, a proxy error or a connection error, not only
                    # with ConnectTimeout. Move on to the next address.
                    print('超时继续获取')
                    continue

                if res.status_code == 200:
                    print(res.text)
                    return res.text
                print('ip请求失败!')
    
    
    if __name__ == '__main__':
        # Script entry point: download the sample page through a proxy.
        target_url = 'https://lib.swpu.edu.cn/'
        get_net_data(target_url)
    
  3. bs4的使用

    import requests
    from bs4 import BeautifulSoup
    def get_net_data(url):
        """Download *url* and return its body text.

        Args:
            url: Address of the page to download.

        Returns:
            The response text when the status code is 200. Otherwise prints
            '失败' and returns None.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0(Windows NT 6.1;Win64;x64) AppleWebKit/537.36(KHTML, likeGecko) Chrome/81.0.4044.129Safari/537.36'
        }
        # Pass the headers so the request actually carries the browser-like
        # User-Agent defined above.
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text

        print('失败')
        return None
    
    
    def analysis_data(data: str):
        """Walk through the basic BeautifulSoup operations on an HTML string.

        Parses *data* with the 'lxml' parser and prints the result of each
        demonstrated operation: CSS-selector lookup, reading tag content,
        reading tag attributes, and searching inside one specific tag.

        Every selector used here ('div>p', '#p1', '#p2', 'img', 'a', 'div')
        has to match something in *data*; select_one() returns None for a
        miss and the following attribute access would then fail.
        """
        # 1. Create the parser object: BeautifulSoup(html_string, 'lxml').
        soup = BeautifulSoup(data, 'lxml')

        # 2. Find tags with CSS selectors.
        #    select(selector)     -- all tags matched by the selector
        #    select_one(selector) -- only the first matched tag
        matched_all = soup.select('div>p')
        matched_first = soup.select_one('div>p')
        print(matched_all)
        print(matched_first)

        # 3. Read the content of a tag.
        #    tag.string     -- the text as a string; None when the tag has
        #                      several children or mixes text with child tags
        #    tag.get_text() -- the text as a string, including the text of
        #                      nested child tags
        #    tag.contents   -- list of the text nodes and child tags
        plain_p = soup.select_one('div>p')
        print('p1-string', plain_p.string)   # p1-string 11111
        print('p1-text:', plain_p.get_text())   # p1-text: 11111
        print('contents:', plain_p.contents)   # contents: ['11111']

        title_p = soup.select_one('#p1')
        print(title_p)   # <p class="title" id="p1" name="dromouse"><b>The Dormouse's story</b></p>
        print('p2-string:', title_p.string)  # p2-string: The Dormouse's story
        print('p2-text:', title_p.get_text())   # p2-text: The Dormouse's story
        print('p2-contents:', title_p.contents)   # p2-contents: [<b>The Dormouse's story</b>]

        mixed_p = soup.select_one('#p2')
        print('p3:', mixed_p.string)  # None
        print('p3-text', mixed_p.get_text())  # 文字1文字2
        print('p3-contents:', mixed_p.contents)   # p3-contents: ['文字1', <b>文字2</b>]

        # 4. Read the attributes of a tag.
        #    tag.attrs       -- dict of attribute names and values
        #    tag.attrs[name] -- value of a single attribute
        image = soup.select_one('img')
        print(image.attrs)   # {'src': 'E:\\liudehua.jpg'}
        print(image.attrs['src'])   # E:\liudehua.jpg

        links = soup.select('a')
        print(links[-1].attrs['href'])   # www.baidu.com

        # 5. Search for child tags inside a specific tag.
        #    tag.select(selector)     -- all matches inside that tag
        #    tag.select_one(selector) -- first match inside that tag
        container = soup.select_one('div')
        inner_paragraphs = container.select('p')
        print('div中p标签:', inner_paragraphs)   # div中p标签: [<p>11111</p>, <p>22222</p>, <p id="p2">文字1<b>文字2</b></p>]
    
    
    
    
    if __name__ == '__main__':
        # A live page can be parsed instead by passing the result of
        # get_net_data('https://lib.swpu.edu.cn/') to analysis_data().
        #
        # Sample document for the demo. The backslash in the img path is
        # written as '\\' because '\l' is not a valid escape sequence; the
        # resulting string still contains a single backslash.
        data = """
                    <html>
                        <head>
                            <title>The Dormouse's story</title>
                        </head>
                        <body>
                            <p id="p1" class="title" name="dromouse"><b>The Dormouse's story</b></p>
                            <p class="story">Once upon a time there were three little sisters; and their names were
                            <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
                            <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
                            <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
                            and they lived at the bottom of a well.</p>
                            <p class="story">...</p>
                            <div>
                                <p>11111</p>
                                <p>22222</p>
                                <a href="www.baidu.com">百度一下</a>
                                <img src="E:\\liudehua.jpg">
                                <p id="p2">文字1<b>文字2</b></p>
                            </div>
    """
        analysis_data(data)
    
  4. csv文件操作

    import csv
    import os

    # Both demo files are written below 'files/'. Create the directory first
    # so the script also runs where it does not exist yet.
    os.makedirs('files', exist_ok=True)

    # ======================== Writing data to a CSV file ========================
    # 1. Create a writer.
    #    csv.writer(file_object)                 -- writes one row per list
    #    csv.DictWriter(file_object, fieldnames) -- writes one row per dict
    with open('files/test.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)

        # 2. Write the data: one header row, then several rows at once.
        writer.writerow(['姓名', '性别', '年龄', '分数'])
        writer.writerows([
            ['张三', '男', 26, 89],
            ['小红', '女', 22, 78],
            ['李四', '男', 21, 90]
        ])

    # 3. Supply the rows as dicts instead.
    with open('files/test2.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, ['name', 'age', 'gender', 'score'])
        # Two ways to write the header row. Both are executed here for
        # demonstration, so test2.csv ends up with two header rows; real code
        # would keep only one of them.
        # Way 1: write custom header labels as an ordinary row.
        writer.writerow({'name':'姓名','age':'年龄', 'gender':'性别', 'score':'分数' })   # '姓名','年龄','性别','分数'
        # Way 2: write the field names themselves.
        writer.writeheader()   # 'name', 'age', 'gender', 'score'

        # Write a single row.
        writer.writerow({'name':'andy','age':18, 'gender':'男', 'score':96})

        # Write several rows at once.
        writer.writerows([
            {'name':'july','age':18, 'gender':'男', 'score':96},
            {'name':'peter','age':22, 'gender':'男', 'score':88},
            {'name':'jack','age':19, 'gender':'男', 'score':90}
        ])


    # ======================== Reading a CSV file ========================
    # Note: any CSV file can be read either as lists or as dicts.
    # 1. One list per row.
    with open('files/test.csv', 'r', newline='', encoding='utf-8') as f:
        # reader is an iterator whose items are the rows, each as a list of
        # strings.
        reader = csv.reader(f)
        # Skip the header row.
        next(reader)
        print(list(reader))   # [['张三', '男', '26', '89'], ['小红', '女', '22', '78'], ['李四', '男', '21', '90']]


    # 2. One dict per row; the first row of the file supplies the keys.
    with open('files/test.csv', 'r', newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        print(next(reader))   # {'姓名': '张三', '性别': '男', '年龄': '26', '分数': '89'}
    
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值