Practice: using XPath and Selenium
Goal: scrape each streamer's category, ID, title, and popularity.
#!/usr/bin/env python
# _*_ coding:utf-8 _*_
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time


class DouyuSpider:
    def __init__(self):
        self.start_url = "https://www.douyu.com/directory/all"
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        # Selenium 4 takes options=; the old chrome_options= keyword has been removed
        self.driver = webdriver.Chrome(options=chrome_options)

    def get_content_list(self):
        # Group first: find_elements (plural) returns every room card, then each
        # field is extracted with an XPath relative to its own li (the "." prefix)
        li_list = self.driver.find_elements(By.XPATH, "//ul[@class='layout-Cover-list']/li")
        content_list = []
        for li in li_list:
            item = {}
            item["room_classification"] = li.find_element(By.XPATH, ".//div[@class='DyListCover-info']/span[@class='DyListCover-zone']").text
            item["room_Anchor"] = li.find_element(By.XPATH, ".//div[@class='DyListCover-info']/h2").text
            item["room_title"] = li.find_element(By.XPATH, ".//div[@class='DyListCover-info']/h3").get_attribute("title")
            item["room_popularity"] = li.find_element(By.XPATH, ".//div[@class='DyListCover-info']/span[@class='DyListCover-hot']").text
            print(item)
            content_list.append(item)
        # Locate the "next page" element; find_elements returns an empty list
        # instead of raising when the element is missing (i.e. on the last page)
        next_url = self.driver.find_elements(By.XPATH, "//span[@class='dy-Pagination-item-custom']")
        next_url = next_url[0] if len(next_url) > 0 else None
        return content_list, next_url

    def save_content_list(self, content_list):
        pass  # storage left open; see the JSON sketch after this listing

    def run(self):  # main logic
        # 1. start_url
        # 2. send the request, get the response
        self.driver.get(self.start_url)
        # 3. extract the data and the "next page" element
        content_list, next_url = self.get_content_list()
        # 4. save the data
        self.save_content_list(content_list)
        # 5. click the "next page" element and loop
        while next_url is not None:
            next_url.click()
            time.sleep(3)  # crude wait for the new page to render; see the explicit-wait sketch below
            content_list, next_url = self.get_content_list()
            self.save_content_list(content_list)
        self.driver.quit()  # quit() ends the whole session; close() only closes the current window


if __name__ == '__main__':
    douyu = DouyuSpider()
    douyu.run()
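
save_content_list is left as a stub above. A minimal sketch of one way to fill it in, assuming the scraped rooms should go to a local JSON-lines file; the file name douyu_rooms.jsonl is an assumption, not part of the original code:

import json

def save_content_list(self, content_list):
    # Append one JSON object per room so later pages don't overwrite earlier ones;
    # ensure_ascii=False keeps the Chinese category/title text readable in the file
    with open("douyu_rooms.jsonl", "a", encoding="utf-8") as f:
        for item in content_list:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")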
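
The fixed time.sleep(3) after the click either wastes time or is still too short on a slow network. A sketch of the explicit-wait alternative built into Selenium; the helper name wait_for_room_list and the 10-second timeout are assumptions:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_room_list(driver, timeout=10):
    # Block until at least one room card is present (up to `timeout` seconds),
    # instead of always sleeping a fixed interval
    WebDriverWait(driver, timeout).until(
        EC.presence_of_all_elements_located((By.XPATH, "//ul[@class='layout-Cover-list']/li"))
    )

Inside run(), a call to wait_for_room_list(self.driver) would replace time.sleep(3).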
Result screenshot: