Python——selenium(XESPC)

本文详细介绍了使用Selenium进行网页自动化测试的方法,包括初始化浏览器驱动、定位页面元素、获取元素属性及文本信息等核心操作。通过实例展示了如何遍历页面元素、抓取课程详情,并将数据整理为CSV文件。适用于自动化测试、Selenium、ChromeDriver等领域初学者。
import re
import urllib.request

import numpy
import pandas as pds
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

browser = webdriver.Chrome()  #驱动谷歌浏览器

def enter(url, element):
    """Navigate the shared browser to *url* and wait for *element*.

    *element* is an XPath expression; presence is awaited for up to 2
    seconds.  On timeout a diagnostic message is printed instead of
    letting the exception propagate, so the crawl keeps going.
    """
    waiter = WebDriverWait(browser, 2)
    try:
        browser.get(url)
        waiter.until(EC.presence_of_element_located((By.XPATH, element)))
    except TimeoutException:
        # Same message the original built into `result` before printing.
        print("在" + url + '\n' + '未定位到' + element)


def get_detail(element):
    """Return the text of the first element matching XPath *element*.

    Returns the sentinel string "None" when no such element exists, so
    callers can always append the result to a CSV row.

    Bug fix: the original caught TimeoutException, which
    find_element_by_xpath never raises (it raises NoSuchElementException),
    and its handler then assigned to an unbound local (`elements.text`),
    so any miss crashed with a NameError.
    """
    try:
        return browser.find_element_by_xpath(element).text
    except NoSuchElementException:
        return "None"

def get_element_attribute(element, attribute):
    """Return attribute *attribute* of the first element matching XPath *element*."""
    node = browser.find_element_by_xpath(element)
    value = node.get_attribute(attribute)
    return value


# NOTE(review): the triple-quoted string below is commented-out legacy code
# (element counting and per-row title/link scraping). It is an inert
# module-level string literal — never executed — kept verbatim here.
'''
def get_ele_num(element):
     num_list = []
     elements = browser.find_elements_by_xpath(element)
     for eachone in elements:
          num_list.append(eachone.text)
     return len(num_list)
     

def get_one_url(urls,titles,num):
     for i in range(1,num):
          element = "/html/body/table/tbody/tr/td/table[3]/tbody/tr/td[1]/table[3]/tbody/tr["+str(i)+"]/td[2]/a"
          href  = get_element_attribute(element, "href")
          urls.append(href)
          title = get_detail(element)
          titles.append(title)
     return urls,titles


'''

def xes_detail(message):
    """Scrape every course card on the current listing page into *message*.

    Each course contributes exactly 15 fields (id, course name, subject,
    grade, state, price, teacher, tutor, address, begin date, lesson time,
    teacher link, plus three course-detail fields), so main() can reshape
    the flat list with numpy .reshape(-1, 15).

    The function navigates into each course's detail page and calls
    browser.back(), leaving the browser on the listing page it started on.

    Returns *message* (also mutated in place).
    """

    def _first(matches):
        # findall() returns [] when the page layout does not match; degrade
        # to the sentinel "None" instead of crashing the whole scrape with
        # an IndexError (the original indexed [0] unconditionally).
        return matches[0] if matches else "None"

    cards = browser.find_elements_by_xpath('//div[@class="s-r-list"]')

    for i in range(1, len(cards) + 1):  # XPath positions are 1-based
        base = '//div[@class="s-r-list"][' + str(i) + ']'

        get_id = get_element_attribute(base, 'id')
        get_course = get_detail(
            base + '/div[@class="s-r-list-detail"]/div[@class="s-r-list-info"]/h3/a')
        get_teacher_link = get_element_attribute(
            base + '/div[@class="s-r-list-photo"]/a', 'href')
        get_teacher = get_detail(base + '/div[@class="s-r-list-photo"]/p/a')
        get_state = get_detail(base + '//p[@class="mtop20"]')
        get_price = get_detail(
            base + '/div[@class="s-r-list-detail"]/div[@class="s-r-list-info"]/div[@class="price"]')

        # Parse the labelled fields out of the info block's full text.
        total = get_detail(
            base + '/div[@class="s-r-list-detail"]/div[@class="s-r-list-info"]')
        subject = re.compile("学科:(.*?)年级").findall(total)
        grade = re.compile("年级:(.*?)\n").findall(total)
        begin_date = re.compile("开课日期:(.*?)上课时间").findall(total)
        lesson_date = re.compile("上课时间:(.*?)\n").findall(total)
        address = re.compile("上课地点:(.*?)\n").findall(total)
        tutor = re.compile("辅导老师:(.*?)\n").findall(total)

        message.append(get_id)
        message.append(get_course)
        message.append(_first(subject))
        message.append(_first(grade))
        message.append(get_state)
        message.append(get_price)
        message.append(get_teacher)
        message.append(_first(tutor))
        # Bug fix: the original appended the raw match *list* here (writing
        # "['…']" into the CSV); append the matched string like every other
        # field.
        message.append(_first(address))
        message.append(_first(begin_date))
        message.append(_first(lesson_date))
        message.append(get_teacher_link)

        # Open the course's own page for the three detail fields.
        get_course_url = get_element_attribute(base + "//h3/a", 'href')
        enter(get_course_url, '//p[@class="T_list_tion"]')
        print(get_course)

        if re.compile("在线课堂").findall(get_course):
            # Online-classroom pages: suitable crowds, training objectives
            # and the course table live in separate detail-content blocks.
            get_suit_crowds = get_detail('//div[@class="detail-content"][1]//li')
            get_train_obj = get_detail('//div[@class="detail-content"][2]//li')
            get_course_table = get_detail(
                '//div[@class="course-list"]/div[@class="detail-content"]//p')
            print(get_suit_crowds, get_train_obj, get_course_table)
            message.append(get_suit_crowds)
            message.append(get_train_obj)
            message.append(get_course_table)
        else:
            # Offline pages only list the lesson table; pad the other two
            # columns with "None" so each course still yields 15 fields.
            rows = browser.find_elements_by_xpath('//li[@class="t_ligreen"]')
            course_content = [row.text for row in rows]
            print(course_content)
            message.append(course_content)
            message.append("None")
            message.append("None")
        browser.back()

    return message


    
def main():
    """Crawl every grade's course listings and append the rows to a CSV.

    For each grade code the first listing page is opened to read the total
    page count from the pager, then every page is visited and scraped by
    xes_detail().  The accumulated flat list is reshaped to 15 columns per
    course and appended (no header) to the output CSV.
    """
    ele = '//*[@id="search-bar"]/ul/li[1]/span'

    # Grade codes used by the site: -8..-6, 1..12 and 15.
    # (The original built this list with two loops and an append.)
    grade_codes = list(range(-8, -5)) + list(range(1, 13)) + [15]

    message = []
    for k in grade_codes:  # every grade
        url = ("http://sxa.speiyou.com/search/index/subject:/grade:" + str(k)
               + "/level:bx/lesson:/term:/gtype:time")
        enter(url, ele)

        # Read "当前第1/N页" from the pager to learn this grade's page count.
        tot = get_detail('//div[@class="pagination mtop40"]')
        total_page = re.compile("当前第1/(.*?)页").findall(tot)
        # Robustness: fall back to a single page if the pager is missing
        # (the original indexed total_page[0] unconditionally).
        page_count = int(total_page[0]) if total_page else 1

        for i in range(1, page_count + 1):  # every listing page of the grade
            url = ("http://sxa.speiyou.com/search/index/gtype:time/grade:" + str(k)
                   + "/subject:/level:bx/lesson:/term:/period:/teaid:/m:/d:/time:"
                   + "/bg:n/nu:/service:/curpage:" + str(i))
            enter(url, ele)
            print(url)
            message = xes_detail(message)

    # xes_detail appends exactly 15 fields per course; append rows, no header.
    message = numpy.array(message).reshape(-1, 15)
    df = pds.DataFrame(message)
    df.to_csv('C:/Users/Administrator/Desktop/xes_test.csv', sep=',', mode='a',
              index=False, header=False)

    browser.close()  # close the browser window
    

# Script entry point: run the crawl only when executed directly.
if __name__ ==  "__main__":
     main()


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值