- After working through the preceding material, you can try building a Tieba spider as a case study. The task: crawl all of the list pages of a given Tieba (Baidu forum) and download the images and videos from every thread's detail page.
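Before reading the whole listing, it helps to see the core title extraction in isolation. On the mobile list page each thread title sits in one or two `<span>` elements under a `ti_title` div; when there are two, the first is a category label that gets wrapped in brackets. A minimal sketch of that branching, run against made-up markup (the HTML snippet below is an assumption for illustration, not a real Tieba response):

```python
from lxml import etree

# Made-up markup imitating one list item with a category label and a title
snippet = """
<li class="tl_shadow tl_shadow_new ">
  <a href="/p/123456">
    <div class="ti_title"><span>图</span><span>周末随手拍</span></div>
  </a>
</li>
"""

li = etree.HTML(snippet).xpath("//li")[0]
spans = li.xpath("./a/div[contains(@class, 'ti_title')]/span/text()")
if len(spans) == 2:
    title = "[" + spans[0] + "]" + spans[1]   # category label + title
else:
    title = spans[0] if spans else None
print(title)  # [图]周末随手拍
```

With that extraction pattern in mind, the complete spider follows.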
```python
import json
import os
import re
from urllib import parse

import requests
from lxml import etree


class TiebaSpider:
    def __init__(self, tieba_name):
        self.tieba_name = tieba_name
        self.start_url = "https://tieba.baidu.com/f?kw={}".format(tieba_name)
        # iPhone User-Agent so Tieba serves its mobile pages
        self.headers = {"User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1"}
        self.page_num = 0
        self.total_page = 1
        os.makedirs("img", exist_ok=True)  # save_img() writes into ./img/
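
    # Overall flow: run() walks the list pages; get_content_list() pulls each
    # thread's title/link plus the page count; get_image_list() visits each
    # thread; save_img() downloads everything that was found.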

    def parse_url(self, url):
        """Send a request and return the decoded response body."""
        response = requests.get(url, headers=self.headers)
        return response.content.decode()

    def get_content_list(self, html_str):
        """Extract the title and link of every thread on a list page."""
        html = etree.HTML(html_str)
        div_list = html.xpath("//li[@class='tl_shadow tl_shadow_new ']")
        content_list = []
        for div in div_list:
            item = {}
            a = div.xpath("./a/div[contains(@class, 'ti_title')]/span/text()")
            if len(a) == 1:
                item['title'] = a[0]
            elif len(a) == 2:
                # Two spans: a category label plus the title, joined as "[label]title"
                item['title'] = "[" + a[0] + "]" + a[1]
            else:
                item['title'] = None
            href = div.xpath("./a/@href")
            item['href'] = href[0] if len(href) > 0 else None
            content_list.append(item)
        # The mobile list page embeds the page count as JSON, e.g. "total_page":12
        p = re.compile(r'"total_page":(\d+)')
        total_page = p.findall(html_str)
        return content_list, (int(total_page[0]) if total_page else self.total_page)
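
    # Detail pages store the real image URL percent-encoded after "src=" in a
    # data-url attribute; thread-starter videos sit in a data-vhsrc attribute.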

    def get_image_list(self, detail_url):
        """Collect the URLs of every image and video in one thread."""
        img_list = []
        detail_html_str = self.parse_url("https://tieba.baidu.com" + detail_url)
        detail_html = etree.HTML(detail_html_str)
        subject_img_url = detail_html.xpath("//div[@class='pb_img_item']/@data-url")
        for subject_img in subject_img_url:
            ret0 = re.findall("src=(.+)", subject_img)
            if ret0:
                img_list.append(parse.unquote(ret0[0]))
        # Videos: data-vhsrc already holds a direct URL, no decoding needed
        subject_mp4_url = detail_html.xpath("//div[@lz='1']/a/@data-vhsrc")
        img_list.extend(subject_mp4_url)
        # Full-size versions of inline images
        img_original_list = detail_html.xpath("//div[@data-class='BDE_Image']/@data-url")
        for item in img_original_list:
            ret = re.findall("src=(.+)", item)
            if ret:
                img_list.append(parse.unquote(ret[0]))
        return img_list

    def save_content_list(self, content_list):
        """Append each thread's title and link to <tieba_name>.txt as JSON."""
        file_path = self.tieba_name + ".txt"
        with open(file_path, "a", encoding="utf-8") as f:
            for content in content_list:
                f.write(json.dumps(content, ensure_ascii=False, indent=2))
                f.write("\n")

    def save_img(self, total_image_list):
        """Download every collected .jpg image and .mp4 video into ./img/."""
        for image_list in total_image_list:
            for img_url in image_list:
                file_name = re.findall(r"(\w+\.(?:jpg|mp4))", img_url)
                if not file_name:
                    continue
                response = requests.get(img_url, headers=self.headers)
                with open("./img/" + file_name[0], "wb") as f:
                    f.write(response.content)

    def run(self):
        print(self.start_url)
        next_url = self.start_url
        # The while condition stops the crawl once page_num reaches the last page
        while self.page_num < self.total_page * 30:
            html_str = self.parse_url(next_url)
            content_list, self.total_page = self.get_content_list(html_str)
            print(content_list)
            self.save_content_list(content_list)
            total_image_list = []
            for content in content_list:
                if content["href"]:
                    image_list = self.get_image_list(content["href"])
                    total_image_list.append(image_list)
            print(total_image_list)
            self.save_img(total_image_list)
            # Each mobile list page holds 30 threads, so pn advances by 30
            self.page_num += 30
            next_url = "https://tieba.baidu.com/f?kw={}&pn={}".format(self.tieba_name, self.page_num)
            print(next_url)


if __name__ == '__main__':
    spider = TiebaSpider('中国好学姐')
    spider.run()
```
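
The least obvious step above is decoding the `data-url` attribute: the real image address sits percent-encoded behind a `src=` parameter, which is why the spider pairs `re.findall("src=(.+)", ...)` with `parse.unquote`. Here is that step in isolation; the attribute value is a made-up example imitating the scheme, not one captured from a live page:

```python
import re
from urllib import parse

# Hypothetical data-url value; only the "src=" part matters to the spider
data_url = ("width=560&height=420&size=64000"
            "&src=https%3A%2F%2Fimgsa.baidu.com%2Fforum%2Fpic%2Fitem%2Fabc123.jpg")

match = re.findall("src=(.+)", data_url)
if match:
    print(parse.unquote(match[0]))
    # -> https://imgsa.baidu.com/forum/pic/item/abc123.jpg
```

Because `.+` is greedy, this pattern assumes `src` is the last parameter in the attribute; if other parameters followed it, they would be swallowed into the URL, so a stricter pattern such as `src=([^&]+)` would be safer.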