import re
import urllib.request

import numpy
import pandas as pds
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
# Module-level WebDriver instance referenced by the helper functions below;
# it is created as soon as this module is executed or imported.
browser = webdriver.Chrome() # Drive the Google Chrome browser
def enter(url, element):
    """Navigate the shared browser to *url* and wait for *element* to appear.

    Args:
        url: Address to load in the module-level ``browser``.
        element: XPath expression that must be present in the DOM for the
            page to count as loaded.

    The wait lasts at most 2 seconds. A ``TimeoutException`` (from the page
    load or from the wait) is not propagated; a message naming the URL and
    the XPath is printed instead. Nothing is returned either way.
    """
    waiter = WebDriverWait(browser, 2)
    locator = (By.XPATH, element)
    try:
        browser.get(url)
        waiter.until(EC.presence_of_element_located(locator))
    except TimeoutException:
        # Best effort: report the miss and let the caller carry on.
        print("".join(("在", url, '\n', '未定位到', element)))
def get_detail(element):
    """Return the text of the first element matching an XPath expression.

    Args:
        element: XPath expression evaluated against the current page of the
            module-level ``browser``.

    Returns:
        The matched element's ``text``, or the string ``"None"`` when no
        element matches.
    """
    try:
        # find_element(By.XPATH, ...) is available in both Selenium 3 and 4;
        # the find_element_by_xpath shortcut was removed in Selenium 4.3.
        found = browser.find_element(By.XPATH, element)
    except (NoSuchElementException, TimeoutException):
        # A missing element raises NoSuchElementException, not
        # TimeoutException, and the previous handler assigned to a name that
        # was never bound. Return the intended "None" placeholder directly.
        return "None"
    return found.text
def get_element_attribute(element, attribute):
    """Return one attribute of the first element matching an XPath expression.

    Args:
        element: XPath expression evaluated against the current page of the
            module-level ``browser``.
        attribute: Name of the attribute to read (for example ``"href"``).

    Returns:
        Whatever ``WebElement.get_attribute`` returns for *attribute*.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If nothing matches
            *element* (propagated from the driver, as before).
    """
    # find_element(By.XPATH, ...) works on Selenium 3 and 4; the
    # find_element_by_xpath shortcut was removed in Selenium 4.3.
    return browser.find_element(By.XPATH, element).get_attribute(attribute)
# NOTE(review): the triple-quoted string below is disabled legacy code
# (get_ele_num / get_one_url) kept as a bare string expression, so it is never
# executed. Nothing in the visible part of this file calls either function;
# consider deleting it once that is confirmed for the whole project.
'''
def get_ele_num(element):
num_list = []
elements = browser.find_elements_by_xpath(element)
for eachone in elements:
num_list.append(eachone.text)
return len(num_list)
def get_one_url(urls,titles,num):
for i in range(1,num):
element = "/html/body/table/tbody/tr/td/table[3]/tbody/tr/td[1]/table[3]/tbody/tr["+str(i)+"]/td[2]/a"
href = get_element_attribute(element, "href")
urls.append(href)
title = get_detail(element)
titles.append(title)
return urls,titles
'''
# Patterns for the labelled fields inside a course card's info text.
# Compiled once here instead of on every card.
_SUBJECT_RE = re.compile("学科:(.*?)年级")
_GRADE_RE = re.compile("年级:(.*?)\n")
_BEGIN_DATE_RE = re.compile("开课日期:(.*?)上课时间")
_LESSON_DATE_RE = re.compile("上课时间:(.*?)\n")
_ADDRESS_RE = re.compile("上课地点:(.*?)\n")
_TUTOR_RE = re.compile("辅导老师:(.*?)\n")


def _first_match(pattern, text):
    """Return the first captured group of *pattern* in *text*, or "None"."""
    found = pattern.search(text)
    return found.group(1) if found else "None"


def xes_detail(message):
    """Scrape every course card on the current result page into *message*.

    For each ``div.s-r-list`` card on the page this appends 15 values to
    *message*, in this order: card id, course title, subject, grade, state
    text, price text, teacher name, tutor, address, start date, lesson time,
    teacher link, followed by three values read from the course detail page.
    The detail page is opened with ``enter`` and left with ``browser.back()``
    so the next card can be located on the result page again.

    Args:
        message: Flat list that the scraped values are appended to.

    Returns:
        The same *message* list, extended in place.

    NOTE(review): the three detail values are not aligned between branches.
    When the title contains "在线课堂" they are (suit_crowds, train_obj,
    course_table) texts; otherwise they are (list of course-table item texts,
    "None", "None"). This ordering is kept unchanged; confirm it is intended.
    """
    card_count = len(browser.find_elements(By.XPATH, '//div[@class="s-r-list"]'))
    for i in range(1, card_count + 1):
        # XPath positions are 1-based; every lookup is re-resolved by XPath
        # because visiting the detail page invalidates earlier elements.
        card = '//div[@class="s-r-list"][' + str(i) + ']'
        info = card + '/div[@class="s-r-list-detail"]/div[@class="s-r-list-info"]'
        photo = card + '/div[@class="s-r-list-photo"]'

        get_id = get_element_attribute(card, 'id')
        get_course = get_detail(info + '/h3/a')
        get_teacher_link = get_element_attribute(photo + '/a', 'href')
        get_teacher = get_detail(photo + '/p/a')
        get_state = get_detail(card + '//p[@class="mtop20"]')
        get_price = get_detail(info + '/div[@class="price"]')
        total = get_detail(info)

        # A missing label yields "None" instead of raising IndexError. The
        # address is stored as its first match like every other field; it was
        # previously appended as the whole findall() list.
        message.extend([
            get_id,
            get_course,
            _first_match(_SUBJECT_RE, total),
            _first_match(_GRADE_RE, total),
            get_state,
            get_price,
            get_teacher,
            _first_match(_TUTOR_RE, total),
            _first_match(_ADDRESS_RE, total),
            _first_match(_BEGIN_DATE_RE, total),
            _first_match(_LESSON_DATE_RE, total),
            get_teacher_link,
        ])

        get_course_url = get_element_attribute(card + "//h3/a", 'href')
        enter(get_course_url, '//p[@class="T_list_tion"]')
        print(get_course)
        if "在线课堂" in get_course:
            get_suit_crowds = get_detail('//div[@class="detail-content"][1]//li')
            get_train_obj = get_detail('//div[@class="detail-content"][2]//li')
            get_course_table = get_detail(
                '//div[@class="course-list"]/div[@class="detail-content"]//p'
            )
            print(get_suit_crowds, get_train_obj, get_course_table)
            message.extend([get_suit_crowds, get_train_obj, get_course_table])
        else:
            course_content = [
                item.text
                for item in browser.find_elements(By.XPATH, '//li[@class="t_ligreen"]')
            ]
            print(course_content)
            message.extend([course_content, "None", "None"])
        # Return to the result page before handling the next card.
        browser.back()
    return message
def main(output_path='C:/Users/Administrator/Desktop/xes_test.csv'):
    """Crawl every grade's course listing and append the rows to a CSV file.

    For each grade code the first search page is opened to read the page
    count from the pagination text, then every result page is visited and
    scraped with ``xes_detail``. The collected values are split into rows of
    15 fields and appended to *output_path* without index or header.

    Args:
        output_path: CSV file to append to. Defaults to the path this script
            has always written to.

    Raises:
        ValueError: If the number of collected values is not a multiple of 15
            (the same condition under which the former ``reshape`` failed).

    The browser window is closed even when the crawl raises.
    """
    fields_per_course = 15
    ele = '//*[@id="search-bar"]/ul/li[1]/span'
    ele_totalpage = '//div[@class="pagination mtop40"]'
    page_count_re = re.compile("当前第1/(.*?)页")
    # Grade codes passed in the search URL: -8..-6, 1..12 and 15.
    grades = list(range(-8, -5)) + list(range(1, 13)) + [15]
    message = []
    try:
        for k in grades:  # all grades
            url = "http://sxa.speiyou.com/search/index/subject:/grade:"+str(k)+"/level:bx/lesson:/term:/gtype:time"
            enter(url, ele)
            tot = get_detail(ele_totalpage)
            total_page = page_count_re.findall(tot)
            # Without a readable page count, still visit page 1 instead of
            # failing with IndexError; an empty page simply adds nothing.
            page_count = int(total_page[0]) if total_page else 1
            for i in range(1, page_count + 1):  # every result page of this grade
                url = "http://sxa.speiyou.com/search/index/gtype:time/grade:"+str(k)+"/subject:/level:bx/lesson:/term:/period:/teaid:/m:/d:/time:/bg:n/nu:/service:/curpage:"+str(i)
                enter(url, ele)
                print(url)
                message = xes_detail(message)

        if len(message) % fields_per_course:
            raise ValueError(
                "collected %d values, which is not a multiple of %d"
                % (len(message), fields_per_course)
            )
        # Slice into rows directly: some cells hold lists, and numpy.array()
        # rejects such ragged input on NumPy >= 1.24.
        rows = [
            message[start:start + fields_per_course]
            for start in range(0, len(message), fields_per_course)
        ]
        df = pds.DataFrame(rows)
        df.to_csv(output_path, sep=',', mode='a', index=False, header=False)
    finally:
        browser.close()  # close the browser
# Start the crawl only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()