解析 HTML、XML、HTML5 【beautifulsoup4】
pip install beautifulsoup4
pip install lxml
from bs4 import BeautifulSoup
解析数据
def parse_result(soup):
    """Parse a Douban Top250 list page and yield one dict per movie.

    Args:
        soup: BeautifulSoup document of a Top250 page; entries live in
            ``ol.grid_view > li``.

    Yields:
        dict with keys ranking / name / img / score / author / desc.
    """
    entries = soup.find('ol', class_='grid_view').find_all('li')
    for entry in entries:
        rank = entry.find('em').get_text()
        title = entry.find(class_='title').string
        cover_url = entry.find('a').find('img').get('src')
        rating = entry.find(class_='rating_num').string
        # The credits paragraph has an empty class attribute on this page.
        credits_text = entry.find('p', class_='').get_text(strip=True)
        # Not every movie has a one-line quote; fall back to a placeholder.
        quote_tag = entry.find(class_='inq')
        quote = quote_tag.string if quote_tag is not None else '暂无'
        print(rank, title, cover_url, rating, credits_text, quote)
        yield {
            'ranking': rank,
            'name': title,
            'img': cover_url,
            'score': rating,
            'author': credits_text,
            'desc': quote,
        }
http请求
requests
response = requests.get(url)
# 请求获取网页数据
def request_data(url):
    """Fetch *url* and return the response body text.

    Returns None when the request raises a ``requests.RequestException``
    or the server answers with a non-200 status code.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36',
        'cookie': 'bid=dT8Z3OE5_cY; _pk_id.100001.4cf6=97307cec25d927ab.1727414571.; __utmz=30149280.1727414571.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmz=223695111.1727414571.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __yadk_uid=kz8b5hlKFkxH8Y9DITjOMuxWgBYikz0h; ll="108296"; _vwo_uuid_v2=D5055D26948C52E0832B26F1769798A7F|836a6dfe85637bb8c39462e0dadf8747; __utmc=30149280; __utmc=223695111; _pk_ses.100001.4cf6=1; ap_v=0,6.0; __utma=30149280.1074059804.1727414571.1728178377.1728193731.4; __utmb=30149280.0.10.1728193731; __utma=223695111.688962959.1727414571.1728178377.1728193731.4; __utmb=223695111.0.10.1728193731'
    }
    # Keep the try body to just the call that can raise.
    try:
        response = requests.get(url=url, headers=headers)
    except requests.RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None
json处理
import json
对象转化为 json:json.dumps()
json转化为对象:json.loads()
文件写入
with open('dangdang_top_500_book.txt', 'a', encoding='UTF-8') as f:
    f.write(json.dumps(item, ensure_ascii=False) + '\n')
# 注意：with 语句结束时会自动关闭文件，无需再调用 f.close()
Excel写入
import xlwt
# 数据写入Excel
def save_to_excel(result_list):
    """Write the parsed movie records to 豆瓣电影Top250.xls.

    Args:
        result_list: iterable of dicts as produced by ``parse_result``;
            a value of None is silently ignored.
    """
    if result_list is None:
        return
    workbook = xlwt.Workbook(encoding='utf-8')
    sheet = workbook.add_sheet('豆瓣电影Top250', cell_overwrite_ok=True)
    # Header row (row 0), one caption per column.
    captions = ('排名', '电影名称', '图片', '评分', '作者', '简介')
    for col, caption in enumerate(captions):
        sheet.write(0, col, caption)
    # Data rows start at row 1; columns mirror the caption order.
    fields = ('ranking', 'name', 'img', 'score', 'author', 'desc')
    for offset, record in enumerate(result_list):
        for col, field in enumerate(fields):
            sheet.write(offset + 1, col, record[field])
    workbook.save('豆瓣电影Top250.xls')
Redis
pip install redis