某班课群网页爬取题库,自动答题(3)

最新推荐文章于 2025-06-10 20:24:41 发布

原创

最新推荐文章于 2025-06-10 20:24:41 发布 · 1.7k 阅读

15 ·

CC 4.0 BY-SA版权

文章标签：

#selenium #python #xpath

本文介绍如何利用Python的Selenium库爬取某班级课群的题库，并通过XPath解析题目，关键在于实现了避免重复抓取相同题目的策略。

此次更新的内容：爬取题库时按题目 id 去重，不再重复抓取相同的题目。

from selenium import webdriver
from lxml import etree
import requests
import time
import random
import json

from selenium.webdriver.remote.webelement import WebElement

# Login setup: start Edge, open the yooc.me login page, fill in the account
# and password fields, then parse the rendered page with lxml.

# Browser-like User-Agent header dict; not referenced by any line visible in
# this excerpt.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/90.0.4430.93 Safari/537.36 Edg/90.0.818.56 "
}
# Start Edge through the WebDriver executable at this hard-coded local path;
# the path must be adapted to the machine running the script.
browser = webdriver.Edge(executable_path = r"C:\Users\baibe\PycharmProjects\reptile\new_reptile\MicrosoftWebDriver.exe")
# Have the browser issue a request to the given URL
browser.get("https://www.yooc.me/login")
# Locate the form inputs by absolute XPath
account_input = browser.find_element_by_xpath('/html/body/div[2]/div/div/div[2]/div/div[1]/form/div[2]/input')
# NOTE(review): '账号' ("account") is presumably a placeholder to be replaced
# with the real account name — confirm before running.
account_input.send_keys('账号')
password_input = browser.find_element_by_xpath('/html/body/div[2]/div/div/div[2]/div/div[1]/form/div[3]/input')
# NOTE(review): '密码' ("password") is presumably a placeholder as well.
password_input.send_keys('密码')
# Use page_source to get the source of the page currently loaded in the browser
response = browser.page_source
# Parse the HTML into an lxml element tree so it can be queried with XPath
tree = etree.HTML(response)
code_url = tree.xpath('/html/body/div[2]/

最低0.47元/天解锁文章

2 条评论

小鞠583 2021.12.08
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from lxml import etree
import requests
import time
import random
import json

from selenium.webdriver.remote.webelement import WebElement

# Scrape a question bank from yooc.me: log in (captcha typed by the user),
# open the exam, then repeatedly take it with random answers, read the
# answer-review page and collect every question not seen before (keyed by the
# question element's id). Collected questions are appended to a text file.
#
# NOTE(review): this script was recovered from a paste whose line breaks and
# indentation had been lost. Block nesting below is reconstructed from the
# statement order and should be confirmed against the author's original.

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/90.0.4430.93 Safari/537.36 Edg/90.0.818.56 "
}
browser = webdriver.Edge(executable_path=r"C:\Users\baibe\PycharmProjects\reptile\new_reptile\MicrosoftWebDriver.exe")
try:
    # Have the browser issue a request to the given URL
    browser.get("https://www.yooc.me/login")
    # Locate the form inputs
    account_input = browser.find_element_by_xpath('/html/body/div[2]/div/div/div[2]/div/div[1]/form/div[2]/input')
    account_input.send_keys('账号')
    password_input = browser.find_element_by_xpath('/html/body/div[2]/div/div/div[2]/div/div[1]/form/div[3]/input')
    password_input.send_keys('密码')
    # Use page_source to get the source of the current page
    response = browser.page_source
    tree = etree.HTML(response)
    # Download the captcha image so the user can read it and type it in
    code_url = tree.xpath('/html/body/div[2]/div/div/div[2]/div/div[1]/form/div[4]/img/@src')[0]
    text_response = requests.get(url=code_url, headers=headers).content
    with open("../code_text.jpg", "wb") as fp:
        fp.write(text_response)
    code_text = input("请查看验证码,并在30秒内输入:")
    code_text_input = browser.find_element_by_xpath('/html/body/div[2]/div/div/div[2]/div/div[1]/form/div[4]/input[1]')
    # Interact with the element: type the captcha text
    code_text_input.send_keys(code_text)
    login = browser.find_element_by_id('submit')
    login.click()
    time.sleep(5)
    topic_url = browser.find_element_by_xpath('/html/body/div[2]/div[2]/table/tbody/tr/td/div[1]/div[3]/div[1]/a[2]')
    topic_url.click()
    time.sleep(2)
    # The click above opens a second window; continue in that one
    handles = browser.window_handles
    browser.switch_to.window(handles[1])
    exam_url = browser.find_element_by_xpath('/html/body/section/section/div[1]/div[4]/a')
    exam_url.click()
    time.sleep(2)

    questions = {}
    for _ in range(50):  # take the exam 50 times
        exam_detail = browser.find_element_by_xpath('/html/body/section/section/div[2]/div[3]/ul/li[1]/div[2]/a[2]')
        exam_detail.click()
        time.sleep(2)
        confirm_btn = browser.find_element_by_xpath('/html/body/div[12]/div[3]/div/div[1]')
        confirm_btn.click()
        time.sleep(5)
        while True:
            try:
                # When a network-problem popup appears, close it and retry
                # until it no longer shows up
                network_anomaly = browser.find_element_by_xpath('/html/body/div[12]/div[3]/div/div/button')
                network_anomaly.click()
            except WebDriverException:
                # No popup to dismiss (or it cannot be clicked): carry on.
                # Only Selenium errors are swallowed so that Ctrl+C still
                # interrupts the script.
                break
            else:
                confirm_btn = browser.find_element_by_xpath('/html/body/div[12]/div[3]/div/div[1]')
                confirm_btn.click()

        bodylist = browser.find_elements_by_xpath('/html/body/section/section/div[5]/div[@class="question-board"]')
        print(bodylist)
        browser.maximize_window()
        for each in bodylist:
            print(each.text)
            templist = each.find_elements_by_tag_name('label')
            # Pick an option at random; a small change here would make every
            # answer correct
            islist = random.choice(templist)
            while True:
                try:
                    # When a network-problem popup appears, close it and
                    # retry until it no longer shows up
                    network_anomaly = browser.find_element_by_xpath('/html/body/div[7]/div[3]/div/div/button')
                    network_anomaly.click()
                except WebDriverException:
                    break
                else:
                    # NOTE(review): the extent of this else-body is ambiguous
                    # in the flattened paste — confirm which of the two
                    # click/sleep pairs belongs inside the retry loop.
                    islist = random.choice(templist)
                    islist.click()
                    time.sleep(0.5)
            islist.click()
            time.sleep(0.5)
            print("选项已勾选!")

        print("正在交卷!")
        submit = browser.find_element_by_xpath('/html/body/section/aside/div[2]/div[2]/div[3]/a[2]')
        submit.click()
        time.sleep(2)
        confirm_again = browser.find_element_by_xpath('/html/body/div[7]/div[3]/div/div[1]')
        confirm_again.click()
        time.sleep(2)
        check_detail = browser.find_element_by_xpath('/html/body/div[7]/div[3]/div/div[1]')
        check_detail.click()
        time.sleep(2)

        print("正在读取题目!")
        exam_answers = browser.page_source
        exam_answers_tree = etree.HTML(exam_answers)
        answers_list = exam_answers_tree.xpath('/html/body/section/section/div[3]/div[@class="question-board"]')
        for answers in answers_list:
            question_id = answers.xpath('./@id')[0]
            if question_id not in questions:  # skip questions whose id was already scraped
                questions[question_id] = answers.xpath('.//text()')
                print(question_id, "读取成功!")
        again_btn = browser.find_element_by_xpath('/html/body/section/div/a')
        again_btn.click()

    # Remember to change this path to one that exists on your machine.
    # UTF-8 is set explicitly so Chinese question text is written the same way
    # regardless of the platform's default encoding.
    with open(r'C:\Users\baibe\PycharmProjects\reptile\new_reptile\yiban_questionbank\bank.txt', "a+", encoding="utf-8") as fb:
        for each in questions.values():
            for i in each:
                fb.write(i)
            print(each, "存储成功!")
    print("总共抓到%s道题!" % len(questions))
finally:
    # Always close the browser, even when a step above fails
    browser.quit()