# 使用 selenium 效率很低，而且不通用，爬取其他课程时需要修改代码，建议参考下方文章：
# 爬取的网站不是某通原网，可能是内部网站，是通过分析网站源码并抓包发现的；此网站目前没有字体加密。
# 下方代码：
import time
from lxml import etree
from bs4 import BeautifulSoup
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def xpath_click(xpath):
    """Wait until the element located by *xpath* is clickable, then click it.

    Uses the module-level ``driver`` and waits up to 30 seconds.

    Args:
        xpath: XPath expression locating the element to click.

    Raises:
        TimeoutException: If the element does not become clickable in time.
    """
    # Imported locally so the function does not depend on changes to the
    # file's import block.
    from selenium.common.exceptions import TimeoutException

    try:
        # until() returns the condition's truthy result (the element) or
        # raises TimeoutException; it never returns False, so a comparison
        # against False could not detect the failure.
        element = WebDriverWait(driver, 30).until(
            EC.element_to_be_clickable((By.XPATH, xpath))
        )
    except TimeoutException:
        print("不可点击")
        raise
    element.click()
def close():
    """Force-kill the ChromeDriver and Chrome processes, then report completion.

    The processes are terminated with ``taskkill`` by image name, so every
    process named ``chromedriver.exe`` or ``chrome.exe`` is targeted, not
    only the ones started by this script.
    """
    kill_commands = (
        'taskkill /im chromedriver.exe /F',
        'taskkill /im chrome.exe /F',
    )
    for command in kill_commands:
        os.system(command)
    print("The program has finished running and the process has been completely closed!")
# Scrape the quiz questions of each chapter of one course page and append
# their text to '工程力学.txt'.
# NOTE(review): the original paste had lost its indentation; the nesting below
# is reconstructed from the statement order - confirm against the author's copy.
driver = webdriver.Chrome()
try:
    driver.get('https://mooc1.chaoxing.com/course/224275956.html')
    xpath_click('//*[@id="ext-gen1044"]/div[4]/div/div[1]/div[1]/table/tbody/tr/td[2]/a')

    # Read the page once so both parsers work on the same snapshot.
    page_html = driver.page_source
    # NOTE(review): `tree` is not read anywhere in the visible script; it is
    # kept in case code outside this view depends on it.
    tree = etree.HTML(page_html)
    beautiful = BeautifulSoup(page_html, "lxml")
    chapter = beautiful.find_all('div', class_='courselistCon')
    m = len(chapter)

    for k, chapter_div in enumerate(chapter):
        n = len(chapter_div.find_all('ul'))
        if n == 0:
            # XPath positions are 1-based, so ul[0] can never match; skip the
            # chapter instead of waiting for the click to time out.
            continue
        # Click the link inside the last <ul> of the (k+1)-th chapter block.
        xpath_click(f'//*[@id="ext-gen1044"]/div[4]/div/div[3]/div[1]/div/div[{k+1}]/div/ul[{n}]/li/a')

        # Switch into an iframe twice in a row.
        # NOTE(review): presumably the quiz is nested two frames deep - confirm.
        for _ in range(2):
            WebDriverWait(driver, 5).until(
                EC.frame_to_be_available_and_switch_to_it((By.TAG_NAME, 'iframe'))
            )

        soup = BeautifulSoup(driver.page_source, "lxml")
        # Single-choice question stems, their option lists, and the
        # fill-in-the-blank / true-false questions.
        单选题 = soup.find_all('div', style='line-height: 35px; font-size: 14px;padding-right:15px;')
        选项 = soup.find_all('ul', class_='Zy_ulTop')
        填空判断题 = soup.find_all('div', class_='Zy_TItle_p')

        with open('工程力学.txt', 'a', encoding='utf8') as f:
            for i, question in enumerate(单选题):
                question_text = question.get_text(separator=" ", strip=True)
                # A question without a matching option list is still written,
                # instead of aborting the whole run with an IndexError.
                if i < len(选项):
                    options_text = 选项[i].get_text(separator=" ", strip=True)
                else:
                    options_text = ""
                f.write(question_text + options_text + '\n')
            for j in 填空判断题:
                f.write(j.get_text(separator=" ", strip=True) + "\n")

        # Return to the top-level document before the next chapter's lookup.
        driver.switch_to.default_content()
finally:
    # Always shut the browser down, even when a wait times out midway.
    close()