# douban
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Selenium-driven crawler for movie.douban.com.

Logs in through the accounts page (the captcha is solved by a human at the
console), opens the movie picker, and prints the name plus the parsed
``#info`` fields of every movie linked from the list page.
"""
import json
import re
import time
from io import BytesIO
from multiprocessing import Pool

import requests
from PIL import Image
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait


class DouBan(object):
    """Crawler that uses one browser for the list page and one for details."""

    # options = webdriver.ChromeOptions()
    # options.headless = True

    # Compiled once: extracts eight fields from the text of the #info block.
    # Groups, in order: director, writer, cast, genre, country/region,
    # language, release date, runtime.
    _INFO_PATTERN = re.compile(
        r'导演: (.*?)\n编剧: (.*?)\n主演: (.*?)\n类型: (.*?)\n.*?制片国家/地区: (.*?)\n语言: (.*?)\n上映日期: (.*?)\n片长: (.*?)\n',
        re.S,
    )

    # Zero-based index of the <label> clicked on the movie picker page.
    # NOTE(review): presumably the "豆瓣高分" tag, judging by the JSON URLs
    # noted at the bottom of this file -- confirm against the live page.
    _TAG_LABEL_INDEX = 5

    _EXPLICIT_WAIT_SECONDS = 15
    _IMPLICIT_WAIT_SECONDS = 20
    _CAPTCHA_TIMEOUT_SECONDS = 15

    def __init__(self):
        """Start the two Firefox sessions used by the crawler."""
        self.start_url = "https://movie.douban.com/"
        self.list_driver = webdriver.Firefox()
        try:
            self.detail_driver = webdriver.Firefox()
        except Exception:
            # Do not leave the first browser running if the second one
            # cannot be started.
            self.list_driver.quit()
            raise
        # The implicit wait is a per-session setting, so configure it once
        # here instead of re-sending it for every detail page.
        self.detail_driver.implicitly_wait(self._IMPLICIT_WAIT_SECONDS)

    def close(self):
        """Quit both browser sessions; the second quits even if the first fails."""
        try:
            self.list_driver.quit()
        finally:
            self.detail_driver.quit()

    def _tag_label(self, driver):
        """Wait condition: return the target <label>, or False to keep polling.

        Indexing the element list directly would raise IndexError while the
        labels are still loading; WebDriverWait does not swallow IndexError,
        so the wait would abort instead of retrying.
        """
        labels = driver.find_elements_by_tag_name('label')
        if len(labels) > self._TAG_LABEL_INDEX:
            return labels[self._TAG_LABEL_INDEX]
        return False

    def start(self, url):
        """Open *url*, enter the movie picker, select a tag and crawl the list.

        Raises:
            selenium.common.exceptions.TimeoutException: if the picker link
                or the tag label does not appear within the wait window.
        """
        self.list_driver.get(url)
        wait = WebDriverWait(self.list_driver, self._EXPLICIT_WAIT_SECONDS)
        picker_link = wait.until(
            lambda driver: driver.find_element_by_link_text('选电影'))
        picker_link.click()
        tag_label = wait.until(self._tag_label)
        tag_label.click()
        self.get_list_url()

    def get_list_url(self):
        """Visit the detail page of every ``.item`` element on the list page."""
        self.list_driver.implicitly_wait(self._IMPLICIT_WAIT_SECONDS)
        items = self.list_driver.find_elements_by_class_name('item')
        # Read every href up front so later page activity cannot invalidate
        # the element handles halfway through the crawl.
        hrefs = [item.get_attribute('href') for item in items]
        for href in hrefs:
            # get_attribute returns None when the element has no href.
            if href:
                self.get_detail(href)

    def get_detail(self, url):
        """Load a movie detail page and print its name and parsed info fields.

        Prints ``None`` in place of the field tuple when the info block does
        not contain every expected field, instead of raising.
        """
        self.detail_driver.get(url)
        m_name = self.detail_driver.find_element_by_css_selector(
            'h1 span:first-child').text
        info_text = self.detail_driver.find_element_by_xpath(
            '//div[@id="info"]').text
        match = self._INFO_PATTERN.search(info_text)
        m_tuple = match.groups() if match else None
        print(m_name, m_tuple)

    # SECURITY NOTE(review): the default credentials below are the values
    # that were previously hard-coded in the method body. They are kept only
    # so existing callers behave the same; pass real credentials explicitly
    # and remove these defaults from version control.
    def login(self, url, username='13526080969', password='bas429xxx'):
        """Fill in and submit the login form at *url*.

        Args:
            url: address of the login page.
            username: value typed into the ``email`` field.
            password: value typed into the ``password`` field.
        """
        self.list_driver.get(url)
        self.list_driver.find_element_by_id('email').send_keys(username)
        self.list_driver.find_element_by_id('password').send_keys(password)

        # Plural finders return an empty list instead of raising, so a login
        # page that renders no captcha can still be submitted.
        captcha_inputs = self.list_driver.find_elements_by_id('captcha_field')
        captcha_images = self.list_driver.find_elements_by_id('captcha_image')
        if captcha_inputs and captcha_images:
            c_url = captcha_images[0].get_attribute('src')
            captcha_inputs[0].send_keys(self.get_captcha(c_url))

        self.list_driver.find_element_by_class_name('btn-submit').click()

    def get_captcha(self, captcha_url):
        """Download the captcha image, display it, and return the typed answer.

        Raises:
            requests.RequestException: if the download fails, times out, or
                returns an HTTP error status.
        """
        # A timeout keeps a stalled server from hanging the crawler forever.
        response = requests.get(
            captcha_url, timeout=self._CAPTCHA_TIMEOUT_SECONDS)
        response.raise_for_status()
        image = Image.open(BytesIO(response.content))
        image.show()
        return input('请输入验证码:')


if __name__ == '__main__':
    douban = DouBan()
    try:
        douban.login('https://accounts.douban.com/login?source=movie')
        douban.start(douban.start_url)
    finally:
        # Always shut the browsers down, even when the crawl fails.
        douban.close()

# Example search URLs kept for reference (they differ only in page_start):
# 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E8%B1%86%E7%93%A3%E9%AB%98%E5%88%86&sort=recommend&page_limit=20&page_start=0'
# 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E8%B1%86%E7%93%A3%E9%AB%98%E5%88%86&sort=recommend&page_limit=20&page_start=20'