简单的python爬虫方法（1）

最新推荐文章于 2023-09-05 10:53:45 发布

原创 · 最新推荐文章于 2023-09-05 10:53:45 发布 · 419 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#python #爬虫 #开发语言

该代码示例使用Python的urllib和gzip库进行网络请求和gzip压缩数据的处理，然后用BeautifulSoup解析HTML，通过正则表达式提取图片链接，最终可以保存图片。程序旨在演示网页图片的抓取流程。

import urllib.request, urllib.error

import gzip
def ungzip(data):
    """Return *data* gunzipped, or unchanged if it is not valid gzip.

    Args:
        data: Raw bytes, possibly gzip-compressed.

    Returns:
        The decompressed bytes when *data* is a complete gzip stream;
        otherwise *data* itself (best-effort fallback).
    """
    # Local import so this block does not depend on a new file-level import.
    import zlib

    try:
        return gzip.decompress(data)
    except (OSError, EOFError, zlib.error):
        # OSError covers gzip.BadGzipFile (wrong magic number), EOFError a
        # truncated stream, zlib.error a corrupt deflate payload. Anything
        # else (e.g. KeyboardInterrupt) is deliberately allowed to propagate.
        return data

def obtainHtml(url):
    """Fetch *url* with browser-like headers and return the body as text.

    The body is passed through ``ungzip`` and decoded as UTF-8.

    Args:
        url: The address to request.

    Returns:
        The decoded page text, or "" when ``urlopen`` raises ``URLError``
        (which includes ``HTTPError``); the error code and/or reason are
        printed in that case.
    """
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36',
        'Cookie': '  ',  # Copy this from your browser; it only exists after logging in to a Baidu account
        'Accept': 'image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8',
        # Only advertise gzip: ungzip() cannot undo deflate or brotli, so a
        # body in either encoding would fail in the UTF-8 decode below.
        'Accept-Encoding': 'gzip',
        'Accept-Language': 'zh-CN,zh;q=0.9'
        }

    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # The context manager closes the connection even if read/decode fails.
        with urllib.request.urlopen(request) as response:
            html = ungzip(response.read()).decode("utf-8")
    except urllib.error.URLError as e:
        # HTTPError carries a status code; a plain URLError only has a reason.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html

from bs4 import BeautifulSoup
import re
# Matching rule: (...) is a capture group holding the src attribute value;
# .*? is a non-greedy run of zero or more characters, and re.S lets "." also
# match newlines.
# NOTE(review): "<balanceWeightImg" is not a standard HTML tag name —
# presumably "<img" was intended; confirm before relying on matches.
findImg = re.compile(r'<balanceWeightImg .*?src="(.*?)".*?>', re.S)
def getData(html):
    """Extract image source strings from the ``<div id="wrapper">`` elements.

    Each matching div is rendered back to text and scanned with the
    module-level ``findImg`` pattern. For every div with at least one match,
    the first match is reduced by a second pattern to the text between
    ``//`` and the next single quote.

    Args:
        html: HTML text of the page to scan.

    Returns:
        A list of the extracted strings, one per div that yielded a usable
        match (possibly empty). The matched divs are also printed.
    """
    soup = BeautifulSoup(html, "html.parser")
    # find_all(tag, attributes, recursive, text, limit, keywords)
    # tag is the tag name: a single one, find_all('h1'), or several,
    # find_all(['h1', 'h2', 'h3'])
    items = soup.find_all("div", {'id': 'wrapper'})
    print(items)

    # One list of src matches per wrapper div.
    data = [findImg.findall(str(item)) for item in items]

    # Captures what lies between "//" and the following single quote.
    # NOTE(review): a bare URL contains no "'", so this may never match —
    # confirm the intended input format.
    strChange = re.compile(r".*?//(.*?)'.*?")
    results = []
    for sources in data:
        if not sources:
            continue
        cleaned = strChange.findall(str(sources[0]))
        if not cleaned:
            # Skip instead of raising IndexError when the pattern misses.
            continue
        results.append(cleaned[0])
        # Downloading and saving the image was left disabled in the original.
    return results

if __name__ == "__main__":
    # Baidu image search URL; the "word" query value is percent-encoded and
    # the URL itself declares ie=gb18030.
    search_url = (
        "https://image.baidu.com/search/index"
        "?tn=baiduimage&ct=201326592&lm=-1&cl=2&ie=gb18030"
        "&word=%C6%BD%BA%E2%C5%E4%D6%D8%BF%E9%B1%EA%CA%B6"
        "&fr=ala&ala=1&alatpl=normal&pos=0"
        "&dyTabStr=MCwzLDUsMiwxLDYsNCw3LDgsOQ%3D%3D"
    )
    # Download the page, then hand the text to the extractor.
    page = obtainHtml(search_url)
    getData(page)