"""
实例:爬取豆瓣内容
并且将“专属内容”保存至“专属文件”当中
"""
import json
import os

import requests
class DouBanSpider:
    """Crawl Douban TV-series listings and save each country's data to its own file.

    For every configured listing (US series, UK series) the spider requests
    successive result pages, parses the JSON response and appends it,
    pretty-printed, to a per-country text file inside ``save_dir``.
    """

    # Directory the script has always written to; kept as the default so that
    # ``DouBanSpider()`` behaves as before.
    DEFAULT_SAVE_DIR = "G:/untitled/Spider_爬虫/爬虫数据收集_测试"

    def __init__(self, save_dir=DEFAULT_SAVE_DIR, timeout=10):
        """Set up the target URLs, request headers, proxy and output location.

        Args:
            save_dir: Directory that receives the per-country output files.
            timeout: Seconds to wait for the server before a request is
                abandoned; passed straight to ``requests.get``.
        """
        self.save_dir = save_dir
        self.timeout = timeout
        # One entry per listing. "{}" in the address is filled with the
        # page_start offset; "country" becomes part of the output file name.
        self.url = [
            # US TV series
            {
                "url_address": "https://movie.douban.com/j/search_subjects?type=tv&tag=%E7%BE%8E%E5%89%A7&sort=recommend&page_limit=20&page_start={}",
                "country": "USA",
            },
            # UK TV series
            {
                "url_address": "https://movie.douban.com/j/search_subjects?type=tv&tag=%E8%8B%B1%E5%89%A7&sort=recommend&page_limit=20&page_start={}",
                "country": "UK",
            },
        ]
        # Browser-like User-Agent sent with every request.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
                          " AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/54.0.2840.99 Safari/537.36"
        }
        # Proxy mapping handed to requests (more schemes may be added).
        # NOTE(review): requests picks the proxy by URL scheme and every
        # address above is https, so this http-only entry is presumably never
        # used -- confirm whether an "https" entry was intended.
        self.proxies = {
            "http": "http://123.232.236.235:8118"
        }

    def paser_url(self, url):
        """Fetch ``url`` and return the response body decoded as text.

        Args:
            url: Fully formatted address to request.

        Returns:
            The response body as a ``str`` (decoded with the default UTF-8).

        Raises:
            requests.RequestException: On connection failure, timeout, or a
                4xx/5xx status code.
        """
        response = requests.get(
            url,
            headers=self.headers,
            proxies=self.proxies,
            timeout=self.timeout,  # without a timeout a stalled server hangs the crawl
        )
        # Reject error pages here instead of saving them as if they were data.
        response.raise_for_status()
        body = response.content.decode()  # decode once and reuse the result
        print("paser_url(self)运行完毕")
        print("resoponse.content.decode()的类型为:%s" % type(body))
        return body

    def get_content_save(self, json_str, country):
        """Parse ``json_str`` and append it, pretty-printed, to the country's file.

        Args:
            json_str: JSON document as text.
            country: Label inserted into the output file name.

        Raises:
            ValueError: If ``json_str`` is not valid JSON
                (``json.JSONDecodeError`` is a ``ValueError`` subclass).
            OSError: If the output directory or file cannot be written.
        """
        # Parse before touching the file system so invalid input leaves no
        # empty file behind.
        json_dict = json.loads(json_str)
        if self.save_dir:
            os.makedirs(self.save_dir, exist_ok=True)
        path = os.path.join(self.save_dir, "豆瓣数据" + country + ".txt")
        # "a" appends, so every fetched page accumulates in the same file.
        with open(path, "a", encoding="utf-8") as f:
            f.write(json.dumps(json_dict, ensure_ascii=False, indent=4))
            # Separate consecutive documents; without this the closing brace
            # of one page runs straight into the opening brace of the next.
            f.write("\n")

    def run(self, max_start=40, page_size=20):
        """Crawl every configured listing page by page and save the results.

        A page that cannot be fetched or parsed is reported and skipped so
        that one bad response does not abort the remaining pages.

        Args:
            max_start: Exclusive upper bound for the ``page_start`` offset.
            page_size: Step between successive ``page_start`` offsets.
        """
        for num in range(0, max_start, page_size):
            # Visit every listing at the current page offset.
            for k in self.url:
                start_url = k["url_address"].format(num)
                print("start_url为:%s" % start_url)
                print("k = %s" % k["country"])
                # Request the page and obtain the raw JSON text.
                try:
                    josn_str = self.paser_url(start_url)
                except (requests.RequestException, UnicodeDecodeError) as exc:
                    print("request failed for %s: %s" % (start_url, exc))
                    continue
                print("josn_str为:%s" % josn_str)
                # Store the data in the file that belongs to this country.
                try:
                    self.get_content_save(json_str=josn_str, country=k["country"])
                except ValueError as exc:
                    print("invalid JSON from %s: %s" % (start_url, exc))
                    continue
                print("*" * 30)
                print()
# Script entry point: crawl everything, then report completion.
if __name__ == "__main__":
    spider = DouBanSpider()
    spider.run()
    print("--运行完毕--")