# 作业:把数据整理成csv表格 (Assignment: organize the scraped data into a CSV table)
import ast
import csv
import os
from json import loads
from re import findall

import requests
from tqdm import tqdm
def fake_ip(ip_num):
    """Fetch proxies from the free-proxy API and return `ip_num` of them as
    requests-style proxy dicts: [{'http': 'http://ip:port', 'https': ...}, ...].

    Side effect: (re)binds the module-global `Headers` used by other helpers.
    Raises IndexError if the API returns fewer than `ip_num` addresses.
    """
    global Headers
    Headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36'
    }
    # Bug fix: the original URL ended in '®ions=' — an HTML-entity-corrupted
    # form of '&regions=' ('&reg' was decoded to the ® character).
    URL = ('http://d.jghttp.alicloudecs.com/getip?num=20&type=2&pro=&city=0'
           '&yys=0&port=1&time=4&ts=0&ys=0&cs=0&lb=1&sb=0&pb=45&mr=1&regions=')
    resp = requests.get(url=URL, headers=Headers)
    ip_data = loads(resp.text)["data"]
    ip_list = [f'{x["ip"]}:{x["port"]}' for x in ip_data]
    # Bug fix: `proxy += {...}` extended the list with the dict's *keys*
    # ('http', 'https' strings), not the proxy mapping itself. Build a list
    # of complete proxy dicts instead.
    return [
        {
            'http': f'http://{ip_list[i]}',
            'https': f'http://{ip_list[i]}',
        }
        for i in range(ip_num)
    ]
def get_main_url():
    """Build the 10 douban Top-250 list-page URLs (start=0,25,...,225)."""
    return [
        f'https://movie.douban.com/top250?start={start}&filter='
        for start in range(0, 226, 25)
    ]
def detect_status_code(resp):
    """Return the response body when the request succeeded (HTTP 200).

    On any other status code, print it and return None.
    """
    if resp.status_code != 200:
        print(resp.status_code)
        return None
    return resp.text
def get_movies_url(main_url=None):
    """Collect movie detail-page URLs from the given Top-250 list pages.

    main_url: iterable of list-page URLs; defaults to the first page only
    (None sentinel replaces the original mutable-list default argument).
    Returns the collected URL list and caches its repr in 'all_movies_url'.
    """
    if main_url is None:
        main_url = ['https://movie.douban.com/top250?start=0&filter=']
    # Bug fix: the original read the module-global `Headers`, which is only
    # bound after fake_ip()/origin_movie_info() runs — the __main__ path calls
    # this function first and raised NameError. Use a local header dict.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36'
    }
    all_movies_url = []
    for page_url in main_url:
        resp = requests.get(url=page_url, headers=headers)
        text1 = detect_status_code(resp)
        re_url = '(?s)\n <a href="(.+?)">\n'
        all_movies_url += findall(re_url, text1)
    # Cache the result; `with` guarantees the file handle is closed
    # (the original leaked an open file object).
    with open('all_movies_url', 'w', encoding='utf-8') as f:
        f.write(f'{all_movies_url}')
    return all_movies_url
def origin_movie_info(single_url):
    """Fetch one movie's detail page and return its HTML (None on non-200).

    Side effect: (re)binds the module-global `Headers` so other helpers can
    reuse the same User-Agent.
    """
    global Headers
    Headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36'
    }
    response = requests.get(url=single_url, headers=Headers)
    return detect_status_code(response)
def get_target_info(text):
    """Extract one movie's fields from its detail-page HTML.

    Returns [name, director, actors(list), release_dates(list), language, rating].
    """
    # One multi-part pattern pulls name / director / language / raw date HTML /
    # rating in document order; the (?s) flag lets '.*' span newlines.
    name, director, language, raw_dates, rating = findall(
        r'(?s)\n "name": "(.+?)",.*'
        'rel="v:directedBy">(.+?)</a></span></span><br/>.*'
        '语言:</span> (.+?)<br/>.*'
        '<span class="pl">上映日期:</span> (.+?)<br/>.*'
        '<strong class="ll rating_num" property="v:average">(.+?)</strong>',
        text)[0]
    # The cast section is optional on some pages; fall back to a placeholder.
    cast_block = findall(r'主演</span>(.+?)</span></span><br/>', text)
    if cast_block:
        actors = findall(r'rel="v:starring">(.+?)</a>', cast_block[0])
    else:
        actors = ['无主演']
    # The raw date span carries the machine-readable date in content="...".
    release_dates = findall('content="(.+?)">', raw_dates)
    return [name, director, actors, release_dates, language, rating]
if __name__ == '__main__':
    # Load the cached URL list if present, otherwise scrape it.
    if os.path.exists('all_movies_url'):
        with open('all_movies_url', 'r', encoding='utf-8') as f:
            # Safety fix: literal_eval parses the cached list repr without
            # executing arbitrary code, unlike the original eval().
            all_movies_url_list = ast.literal_eval(f.read())
    else:
        all_movies_url_list = get_movies_url()
    # Resume support: count rows already written (minus the header row) and
    # skip that many URLs, then append; otherwise start fresh with a header.
    num = 0
    if os.path.exists('top250_2.csv'):
        with open('top250_2.csv', 'r', encoding='utf-8', newline='') as f:
            num = len(list(csv.reader(f))) - 1
        all_movies_url_list = all_movies_url_list[num:]
        f2 = open('top250_2.csv', 'a', encoding='utf-8', newline='')
    else:
        f2 = open('top250_2.csv', 'w', encoding='utf-8', newline='')
        csv.writer(f2).writerow(['电影名', '导演', '演员', '上映时间', '语言', '评分'])
    try:
        writer = csv.writer(f2)  # create the writer once, not per row
        for x in tqdm(all_movies_url_list, desc='当前进度'):
            origin_info = origin_movie_info(x)
            single_movie_info = get_target_info(origin_info)
            writer.writerow(single_movie_info)
    finally:
        # Guarantee the CSV is flushed/closed even if a request fails mid-run,
        # so the resume logic above stays consistent.
        f2.close()
# --- Blog-page scrape residue (not code), preserved for reference: ---
# "1176" (view count), "被折叠的 条评论" (folded comments), "为什么被折叠?" (why folded?)