步骤
- 拿到页面源代码
# Step: use `re` to extract the needed information from the page source.
import re

CHART_URL = "https://movie.douban.com/chart"

# Browser-like User-Agent sent with every request.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 "
        "Core/1.70.3766.400 QQBrowser/10.6.4163.400"
    )
}

# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10

# For every `<a class="nbg"` anchor, capture as `name` the text that sits
# between the following `<a href=".../subject/<id>/" class="">` tag and the
# `/ <span style="font-size:13px;">` marker. re.S lets `.` span newlines.
CHART_TITLE_PATTERN = re.compile(
    r'<a class="nbg".*?<a href="https://movie.douban.com/subject/.*?/" class="">'
    r'(?P<name>.*?)/ <span style="font-size:13px;">.*?',
    re.S,
)


def fetch_chart_page(url=CHART_URL):
    """Download *url* and return the response body as text.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    # Imported here so the parsing helper stays importable (and testable)
    # without the third-party dependency or network access.
    import requests

    resp = requests.get(url=url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an error page instead of silently matching nothing.
    resp.raise_for_status()
    return resp.text


def extract_chart_titles(page_content):
    """Return the titles matched by CHART_TITLE_PATTERN, whitespace-stripped."""
    return [
        match.group("name").strip()
        for match in CHART_TITLE_PATTERN.finditer(page_content)
    ]


def print_chart_titles():
    """Fetch the chart page and print one extracted title per line."""
    for title in extract_chart_titles(fetch_chart_page()):
        print(title)


if __name__ == "__main__":
    print_chart_titles()
# Step: save the extracted results as a CSV file.
import csv
import re

CHART_URL = "https://movie.douban.com/chart"

# Browser-like User-Agent sent with every request.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 "
        "Core/1.70.3766.400 QQBrowser/10.6.4163.400"
    )
}

# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10

# For every `<a class="nbg"` anchor, capture as `name` the text that sits
# between the following `<a href=".../subject/<id>/" class="">` tag and the
# `/ <span style="font-size:13px;">` marker. re.S lets `.` span newlines.
CHART_TITLE_PATTERN = re.compile(
    r'<a class="nbg".*?<a href="https://movie.douban.com/subject/.*?/" class="">'
    r'(?P<name>.*?)/ <span style="font-size:13px;">.*?',
    re.S,
)


def fetch_chart_page(url=CHART_URL):
    """Download *url* and return the response body as text.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    # Imported here so the parsing/CSV helpers stay importable (and testable)
    # without the third-party dependency or network access.
    import requests

    resp = requests.get(url=url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an error page instead of writing an empty CSV.
    resp.raise_for_status()
    return resp.text


def extract_chart_titles(page_content):
    """Return the titles matched by CHART_TITLE_PATTERN, whitespace-stripped."""
    return [
        match.group("name").strip()
        for match in CHART_TITLE_PATTERN.finditer(page_content)
    ]


def write_titles_csv(titles, csv_path):
    """Overwrite *csv_path* with one single-column row per title."""
    # newline="" is what the csv module requires of its file object; without
    # it, platforms that translate "\n" emit a blank line after every row.
    # The `with` block closes the file even if a write raises.
    with open(csv_path, mode="w", encoding="Utf-8", newline="") as f:
        csv.writer(f).writerows([title] for title in titles)


def save_chart_titles(csv_path="data.csv"):
    """Fetch the chart page, write its titles to *csv_path*, print "over"."""
    titles = extract_chart_titles(fetch_chart_page())
    write_titles_csv(titles, csv_path)
    print("over")


if __name__ == "__main__":
    save_chart_titles()
# Step: use a loop to fetch the data of a paginated query.
import csv
import re

TOP250_URL_TEMPLATE = "https://movie.douban.com/top250?start={start}&filter="

# Browser-like User-Agent sent with every request.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 "
        "Core/1.70.3766.400 QQBrowser/10.6.4163.400"
    )
}

# NOTE(review): these extra query parameters are sent exactly as before; their
# purpose is not evident from this file — confirm they are actually needed.
TOP250_PARAMS = {
    "sv": "200",
    "tid": "gda",
    "tv": "r20210630",
    "st": "env",
}

# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10

# Compiled once rather than on every page. Each entry is first isolated as one
# `<li>...</li>` span so that a title can never be paired with the quote of a
# *later* entry when its own `<span class="inq">` is missing.
_ITEM_PATTERN = re.compile(r"<li>(.*?)</li>", re.S)
_TITLE_PATTERN = re.compile(r'<span class="title">(.*?)</span>', re.S)
_QUOTE_PATTERN = re.compile(r'<span class="inq">(.*?)</span>', re.S)


def fetch_top250_page(url):
    """Download *url* and return the response body as text.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    # Imported here so the parsing helper stays importable (and testable)
    # without the third-party dependency or network access.
    import requests

    resp = requests.get(
        url=url, params=TOP250_PARAMS, headers=HEADERS, timeout=REQUEST_TIMEOUT
    )
    # Fail loudly on an error page instead of silently writing no rows.
    resp.raise_for_status()
    return resp.text


def parse_top250_page(page_content):
    """Return ``(name, evaluate)`` tuples, one per `<li>` that has a title.

    ``name`` is the text of the first ``<span class="title">`` in the item,
    unmodified. ``evaluate`` is the stripped text of the item's
    ``<span class="inq">``, or ``""`` when the item has none.
    """
    rows = []
    for item in _ITEM_PATTERN.finditer(page_content):
        item_html = item.group(1)
        title = _TITLE_PATTERN.search(item_html)
        if title is None:
            # A `<li>` without a title span is not a movie entry.
            continue
        quote = _QUOTE_PATTERN.search(item_html)
        evaluate = quote.group(1).strip() if quote else ""
        rows.append((title.group(1), evaluate))
    return rows


def scrape_top250(csv_path="data.csv", page_count=10, page_size=25):
    """Append name/quote rows for *page_count* result pages to *csv_path*.

    After each page the start offset and the requested URL are printed, and
    "over" is printed once every page has been processed.
    """
    # One handle for the whole run, closed by `with` even on error.
    # newline="" is what the csv module requires of its file object.
    with open(csv_path, mode="a", encoding="Utf-8", newline="") as f:
        csvwriter = csv.writer(f)
        # The offset is derived from the range itself: no separate counter and
        # no inner loop variable shadowing the page loop variable.
        for start in range(0, page_count * page_size, page_size):
            url = TOP250_URL_TEMPLATE.format(start=start)
            csvwriter.writerows(parse_top250_page(fetch_top250_page(url)))
            print(start)
            print(url)
    print("over")


if __name__ == "__main__":
    scrape_top250()