#coding=utf-8
from selenium import webdriver
import time
import xlwt
from bs4 import BeautifulSoup

# Workbook that collects the scraped table. cell_overwrite_ok lets the
# scraper rewrite a cell (page() writes column 5 repeatedly) without
# xlwt raising "Attempt to overwrite cell".
book = xlwt.Workbook(encoding='utf-8', style_compression=0)
sheet = book.add_sheet('mysheet', cell_overwrite_ok=True)
from openpyxl import Workbook  # NOTE(review): imported but never used below
# browser=webdriver.Chrome()
# browser.get('https://www.jiandaoyun.com/app/5df9ec636c793000064c6868/entry/5df9ed16073bbd00067f1d54')
#
# browser.close()
from lxml import etree  # NOTE(review): imported but never used below

chromeOptions = webdriver.ChromeOptions()
# Reuse a persistent Chrome profile so a logged-in session survives restarts.
chromeOptions.add_argument(r"user-data-dir=C:\Users\Administrator\AppData\Local\Google\Chrome\User Data1")
# Drop the "controlled by automated software" switch so the site is less
# likely to detect that the browser is driven by automation.
chromeOptions.add_experimental_option('excludeSwitches', ['enable-automation'])
# Raw string for the driver path: the original relied on "\Y" and "\p" not
# being recognized escape sequences, which is deprecated in modern Python.
browser = webdriver.Chrome(options=chromeOptions,
                           executable_path=r"E:\YANG_WORK\package_some\chromedriver.exe")
browser.get('https://www.jiandaoyun.com/app/5df9ec636c793000064c6868/entry/5df9ed16073bbd00067f1d54')
def page(i):
    """Scrape the table page currently rendered in the browser into the sheet.

    Parameters:
        i: worksheet row index at which this page's first record is written
           (callers pass 1, 100, 200, ... — one stride per pagination page).

    Side effects: writes cells into the module-level `sheet` via the
    module-level `browser`, then saves `book` to 'test5.xls'.
    """
    # Crude wait for the page's JavaScript to render the table.
    # NOTE(review): an explicit WebDriverWait on the tbody would be more
    # robust than a fixed 10-second sleep — confirm acceptable.
    time.sleep(10)
    soup = BeautifulSoup(browser.page_source, "html.parser")
    tb = soup.tbody  # first <tbody> of the rendered document
    y = i
    for tr in tb.find_all('tr'):
        # Columns 0..N-1: the row's <td> texts.
        for x, td in enumerate(tr.find_all('td')):
            sheet.write(y, x, td.string)
        # Columns 5..7: up to three image URLs. The original indexed
        # img[0] unguarded (IndexError on an image-less row) and wrapped
        # img[1]/img[2] in bare excepts; slicing handles 0-3 images safely.
        for col, img in enumerate(tr.find_all('img')[:3], start=5):
            sheet.write(y, col, img.get('src'))
        y += 1
    # Save after every page so a crash mid-run keeps earlier pages.
    book.save('test5.xls')
# Scrape six consecutive pagination pages. Each page's records are written
# starting at a fixed worksheet row: page 1 at row 1 (row 0 free for a
# header), later pages at 100-row strides.
# NOTE(review): this assumes every page holds fewer than 100 rows — confirm,
# otherwise a long page would be overwritten by the next one.
NEXT_BUTTON_XPATH = '//button[@class="page-btn next"]'
for page_no, start_row in enumerate((1, 100, 200, 300, 400, 500)):
    if page_no:  # click "next" before every page except the first
        browser.find_element_by_xpath(NEXT_BUTTON_XPATH).click()
    page(start_row)