# 【作业】2022.5.10 把数据整理成csv表格
# (Homework, 2022-05-10: scrape the Douban Top-250 and organize the data into a CSV table.)

import ast
import csv
import os
from json import loads
from re import findall

import requests
from tqdm import tqdm


def fake_ip(ip_num):
    """Fetch up to *ip_num* proxies from the rental API.

    Returns a list of requests-style proxy dicts, e.g.
    ``{'http': 'http://1.2.3.4:8080', 'https': 'http://1.2.3.4:8080'}``.

    Side effect: sets the module-level ``Headers`` used by the other
    request helpers in this script.
    """
    global Headers
    Headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36'
    }
    URL = 'http://d.jghttp.alicloudecs.com/getip?num=20&type=2&pro=&city=0&yys=0&port=1&time=4&ts=0&ys=0&cs=0&lb=1&sb=0&pb=45&mr=1&regions='
    resp = requests.get(url=URL, headers=Headers)
    ip_data = loads(resp.text)["data"]
    ip_list = [f'{x["ip"]}:{x["port"]}' for x in ip_data]

    # BUG FIX: the original did ``proxy += {...}``, which extends the list
    # with the dict's *keys* ('http', 'https') rather than appending the
    # proxy mapping itself.  Also clamp to the number of IPs actually
    # returned so ip_num > len(ip_list) no longer raises IndexError.
    proxy = []
    for addr in ip_list[:ip_num]:
        proxy.append({
            'http': f'http://{addr}',
            'https': f'http://{addr}',
        })
    return proxy


def get_main_url():
    """Return the 10 Douban Top-250 listing-page URLs (25 movies per page)."""
    return [
        f'https://movie.douban.com/top250?start={offset}&filter='
        for offset in range(0, 226, 25)
    ]


def detect_status_code(resp):
    """Return ``resp.text`` when the response is HTTP 200.

    Any other status code is printed for debugging and ``None`` is
    returned, so callers must be prepared for a missing body.
    """
    if resp.status_code != 200:
        print(resp.status_code)
        return None
    return resp.text


def get_movies_url(main_url=None):
    """Scrape the detail-page URL of every movie listed on *main_url* pages.

    Args:
        main_url: list of listing-page URLs; defaults to page 1 only.
            (Replaces the original mutable-list default with the
            None-sentinel idiom.)

    Returns the collected URLs and caches them (as a Python list literal)
    in the file ``all_movies_url`` so later runs can skip the scrape.

    NOTE(review): relies on the module-level ``Headers`` having been set
    by ``fake_ip``/``origin_movie_info`` beforehand.
    """
    if main_url is None:
        main_url = ['https://movie.douban.com/top250?start=0&filter=']

    all_movies_url = []
    for page_url in main_url:
        resp = requests.get(url=page_url, headers=Headers)
        text1 = detect_status_code(resp)
        re_url = '(?s)\n                    <a href="(.+?)">\n'
        all_movies_url += findall(re_url, text1)

    # BUG FIX: the original leaked the file handle via open(...).write(...);
    # a context manager guarantees the cache file is flushed and closed.
    with open('all_movies_url', 'w', encoding='utf-8') as cache:
        cache.write(f'{all_movies_url}')
    return all_movies_url


def origin_movie_info(single_url):
    """Download one movie's detail page and return its HTML (None on error).

    Also (re)sets the module-level ``Headers`` as a side effect, matching
    the original script's behaviour.
    """
    global Headers
    user_agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36')
    Headers = {'User-Agent': user_agent}
    response = requests.get(url=single_url, headers=Headers)
    return detect_status_code(response)


def get_target_info(text):
    """Extract [name, director, actors, release-dates, language, rating]
    from one Douban movie page's HTML.

    ``actors`` and the release-date field are lists; the other entries are
    strings.  A page without a cast section yields ``['无主演']``.
    """
    detail_pattern = (r'(?s)\n  "name": "(.+?)",.*'
                      'rel="v:directedBy">(.+?)</a></span></span><br/>.*'
                      '语言:</span> (.+?)<br/>.*'
                      '<span class="pl">上映日期:</span> (.+?)<br/>.*'
                      '<strong class="ll rating_num" property="v:average">(.+?)</strong>')
    name, director, language, date_blob, rating = findall(detail_pattern, text)[0]

    # The cast block is optional on some pages (e.g. documentaries).
    cast_section = findall(r'主演</span>(.+?)</span></span><br/>', text)
    actors = (findall(r'rel="v:starring">(.+?)</a>', cast_section[0])
              if cast_section else ['无主演'])

    # Release dates live in content="..." attributes inside the date span.
    release_dates = findall('content="(.+?)">', date_blob)

    return [name, director, actors, release_dates, language, rating]


if __name__ == '__main__':
    # Load the cached URL list if a previous run saved one, else scrape it.
    if os.path.exists('all_movies_url'):
        with open('all_movies_url', 'r', encoding='utf-8') as cache:
            # SECURITY FIX: ast.literal_eval parses the repr()-written list
            # safely; the original used eval(), which executes arbitrary
            # code from the cache file.
            all_movies_url_list = ast.literal_eval(cache.read())
    else:
        all_movies_url_list = get_movies_url()

    # Resume support: count rows already in the CSV (minus the header row)
    # and skip that many URLs.
    resume = os.path.exists('top250_2.csv')
    if resume:
        with open('top250_2.csv', 'r', encoding='utf-8', newline='') as done:
            num = len(list(csv.reader(done))) - 1
        all_movies_url_list = all_movies_url_list[num:]

    # 'a' keeps existing rows when resuming; 'w' starts fresh with a header.
    # The context manager fixes the original's unclosed handles on error.
    with open('top250_2.csv', 'a' if resume else 'w', encoding='utf-8', newline='') as f2:
        writer = csv.writer(f2)
        if not resume:
            writer.writerow(['电影名', '导演', '演员', '上映时间', '语言', '评分'])

        for x in tqdm(all_movies_url_list, desc='当前进度'):
            origin_info = origin_movie_info(x)
            writer.writerow(get_target_info(origin_info))
评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

Sprite.Nym

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值