Python爬虫入门（四）re库及某宝实例

最新推荐文章于 2022-10-22 10:37:53 发布

咸鱼hao

最新推荐文章于 2022-10-22 10:37:53 发布

阅读量408

点赞数

CC 4.0 BY-SA版权

分类专栏： Python 文章标签： Python re requests

本文链接：https://blog.youkuaiyun.com/weixin_43868436/article/details/89706745

Python 专栏收录该内容

13 篇文章

订阅专栏

一、re库

import re

# re库基本使用
# 原生字符串类型（raw string）：其中的反斜杠不会被当作转义符处理


def test_regex():
    """Demonstrate the six most commonly used functions of the ``re`` module.

    Every step applies the same six-digit postal-code pattern and prints the
    result to standard output.
    """
    # compile: build the pattern object once and reuse it for every call.
    zip_code = re.compile(r'[1-9]\d{5}')
    sample = 'BIT100081 TSU100084'

    # search: first match found anywhere in the string.
    found = zip_code.search('BIT 100081 100081')
    if found is not None:
        print(found.group(0))

    # match: succeeds only when the pattern matches at the very beginning.
    found = zip_code.match('100081 BIT 100081')
    if found is not None:
        print(found.group(0))

    # findall: every matching substring, returned as a list.
    print(zip_code.findall(sample))

    # split: cut the string at each match and return the remaining pieces;
    # maxsplit limits how many cuts are made.
    print(zip_code.split(sample))
    print(zip_code.split(sample, maxsplit=1))

    # finditer: iterator that yields one match object per hit.
    for hit in zip_code.finditer(sample):
        print(hit.group(0))

    # sub: replace every match and return the new string.
    print(zip_code.sub(':zipcode', sample))


# test_regex()


def test_match_object():
    """Print the attributes and method results of a single match object."""
    found = re.search(r'[1-9]\d{5}', 'BIT100081 TSU100084')
    details = (
        found.string,    # text that was searched
        found.re,        # pattern object that produced the match
        found.pos,       # index where the search started
        found.endpos,    # index where the search stopped
        found.group(0),  # the matched text itself
        found.start(),   # index of the first matched character
        found.end(),     # index just past the last matched character
        found.span(),    # (start, end) as a tuple
    )
    for detail in details:
        print(detail)


# test_match_object()


def greed_and_min_match():
    """Print a greedy match and then a minimal match of the same text."""
    text = 'pynabncdenfn'
    # '*' is greedy by default and takes the longest possible match;
    # appending '?' makes it stop at the shortest possible match.
    for pattern in (r'py.*n', r'py.*?n'):
        print(re.search(pattern, text).group(0))


# Run the greedy vs. minimal matching demo; the other demos above are
# left commented out.
greed_and_min_match()

二、淘宝定向爬虫实例

import requests
import re

# 淘宝搜索定向爬虫
# 学会正则表达式
# cookies和ip问题


def find_html(url, kv, cookies):
    """Fetch ``url`` and return the decoded response body.

    Args:
        url: Address to request.
        kv: Mapping sent as the HTTP request headers.
        cookies: Mapping of cookies sent along with the request.

    Returns:
        The response text, or an empty string when the request fails or the
        server answers with an HTTP error status.
    """
    try:
        r = requests.get(url, cookies=cookies, headers=kv, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures are treated as "no page". Anything else
        # (KeyboardInterrupt, programming errors) is allowed to propagate.
        return ""
    # Decode with the encoding detected from the body itself.
    r.encoding = r.apparent_encoding
    return r.text


def parse_page(ilt, html):
    """Extract ``[price, title]`` pairs from a result page into ``ilt``.

    Prices are taken from ``"view_price":"..."`` fields and titles from
    ``"raw_title":"..."`` fields; the n-th price is paired with the n-th
    title. If the two lists differ in length, the surplus entries of the
    longer one are ignored.

    Args:
        ilt: List that receives one ``[price, title]`` entry per product.
        html: Page source to scan. An empty or missing page adds nothing.
    """
    import json  # local import: used only to unescape title strings

    if not html:
        return

    # Capture groups return just the value between the quotes, so titles
    # that contain ':' stay intact and no eval() of page content is needed.
    prices = re.findall(r'"view_price":"([\d.]*)"', html)
    titles = re.findall(r'"raw_title":"(.*?)"', html)

    for price, raw_title in zip(prices, titles):
        try:
            # Decode escape sequences such as \uXXXX the way a JSON string
            # literal would be decoded.
            title = json.loads('"' + raw_title + '"')
        except ValueError:
            # Not a valid JSON string body; keep the text as captured.
            title = raw_title
        ilt.append([price, title])


def print_good_list(ilt):
    """Print the collected goods as a tab-separated table with a header.

    Args:
        ilt: Sequence of entries whose first item is the price and whose
            second item is the product title.
    """
    row_format = "{:4}\t{:8}\t{:16}"
    print(row_format.format("序号", "价格", "商品名称"))
    # Row numbers start at 1.
    for index, item in enumerate(ilt, start=1):
        price, title = item[0], item[1]
        print(row_format.format(index, price, title))


def _parse_cookie_string(raw):
    """Turn a ``name=value; name=value`` cookie string into a dict."""
    cookies = {}
    for fragment in raw.split(';'):
        name, sep, value = fragment.strip().partition('=')
        # Skip empty fragments and fragments without '='. An unset cookie
        # string therefore yields an empty dict instead of raising.
        if sep and name:
            cookies[name] = value
    return cookies


def main():
    """Crawl the first result pages of a Taobao search and print the goods.

    The search keyword and the number of pages are fixed below. The cookie
    string has to be filled in by the user; when it is left empty the
    requests are sent without cookies.
    """
    goods = '铅笔'
    depth = 3
    start_url = 'https://s.taobao.com/search?&q=' + goods
    infoList = []
    coo = ''   # paste the cookie string of your own Taobao session here
    cookies = _parse_cookie_string(coo)
    kv = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0'}
    for i in range(depth):
        # The 's' parameter advances by 44 for each successive page.
        url = start_url + '&s=' + str(44 * i)
        try:
            html = find_html(url, kv, cookies)
            if i == 0:
                # Dump the first page so the raw response can be inspected.
                print(html)
            parse_page(infoList, html)
        except Exception as exc:
            # Best effort per page: report the failure and go on with the
            # next page instead of silently dropping it.
            print('page {} skipped: {!r}'.format(i + 1, exc))
            continue
    print_good_list(infoList)


# Start the crawler only when the file is executed as a script, so that
# importing it does not trigger network requests.
if __name__ == '__main__':
    main()