基于 Python,使用 Selenium 驱动 PhantomJS,使用会员账号和密码模拟登录某课程网站,根据课程类别检索课程,找到课程文档,再使用 PhantomJS 进行截图或者保存为 PDF。
关于 cookie:http://blog.youkuaiyun.com/falseen/article/details/46962011
import requests
import time
import os
from lxml import etree
from selenium import webdriver
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
# Log in through a headless PhantomJS browser driven by selenium.
driver = webdriver.PhantomJS()
driver.get('http://www._thetargetnet_.com/login')
account = driver.find_element_by_name('login')
account.send_keys('your_account')
password = driver.find_element_by_name('password')
password.send_keys('your_password')
# Send Enter in the password field (presumably this submits the login form).
password.send_keys(Keys.RETURN)

# The requests session below is fed from a cookie string pasted by hand:
# log in with a normal browser, copy the Cookie header value and put it here.
raw_cookie = 'copy the cookies here after you login in'

# Convert the "k1=v1; k2=v2" header string into a dict.  Each fragment is
# stripped first because browsers separate cookies with "; ", which would
# otherwise leave a leading space in every key after the first one.
cookies = {}
for pair in raw_cookie.split(';'):
    key, sep, value = pair.strip().partition('=')
    if not sep or not key:
        # Fragment without "name=value" shape (e.g. empty tail after a
        # trailing ";"): skip it instead of failing on tuple unpacking.
        continue
    cookies[key] = value
if not cookies:
    print('warning: no cookies could be parsed from raw_cookie')

# requests needs a cookiejar (not a plain dict) on the session object.
cookies_new = requests.utils.cookiejar_from_dict(cookies, cookiejar=None, overwrite=True)
# One shared session so that every later request carries the same cookies.
s = requests.Session()
s.cookies = cookies_new
def download(driver, target_path):
    """Render the page currently loaded in a PhantomJS driver to a file.

    The function registers the ``executePhantomScript`` command on the
    driver's command executor and then runs two PhantomJS scripts: one that
    sets the paper size to A4 portrait and one that calls ``this.render`` with
    ``target_path`` (callers in this file pass a ``.pdf`` path).

    Args:
        driver: A selenium PhantomJS webdriver with the page already loaded.
        target_path: Output file path handed to PhantomJS ``render``.
    """
    # Local import keeps this block self-contained.
    import json

    def execute(script, args):
        driver.execute('executePhantomScript', {'script': script, 'args': args})

    # Teach the remote connection the PhantomJS-specific endpoint.
    driver.command_executor._commands['executePhantomScript'] = (
        'POST', '/session/$sessionId/phantom/execute')

    page_format = 'this.paperSize={format:"A4",orientation:"portrait"};'
    execute(page_format, [])

    # json.dumps yields a properly quoted and escaped JavaScript string
    # literal, so quotes or backslashes in the path cannot break the script.
    render = 'this.render({})'.format(json.dumps(target_path))
    execute(render, [])
def parse_class_page(tag, course_num):
    """Save every lab document of one course as a PNG screenshot and a PDF.

    The course page is fetched with the shared requests session ``s``; each
    document page is then opened in the PhantomJS ``driver`` and captured.
    Files are written to ``./<tag>/free_<course name>/<i>of<total>.png`` and
    ``.pdf``.  Courses whose category label is "训练营" are skipped.

    Args:
        tag: Course category; also the top-level output directory name.
        course_num: Course link path taken from the listing page; it is
            appended to the site root to form the course URL.
    """
    class_url = 'https://www.targetnet.com' + course_num
    class_page = etree.HTML(s.get(class_url).text)

    course_cat = class_page.xpath('/html/body/div[3]/div/div[1]/div[2]/div[1]/h4/span[2]/text()')
    if course_cat and course_cat[0] == "训练营":
        print('this is 训练营,需要专门购买')
        return

    names = class_page.xpath('/html/body/div[3]/div/div[1]/div[2]/div[1]/h4/span[1]/text()')
    if not names:
        # Page layout differs from what the xpath expects (or the request was
        # not authenticated); skip this course instead of raising IndexError.
        print('cannot find the course name on {}, skipped'.format(class_url))
        return
    course_name = names[0]
    print('the course dealing with:{}'.format(course_name))

    # Path separators inside the course name would create nested directories.
    safe_name = course_name.replace('/', '_').replace('\\', '_')
    path = './{}/{}'.format(tag, 'free_' + safe_name)
    os.makedirs(path, exist_ok=True)

    document_num = class_page.xpath('//*[@id="reports"]/span[@class="lab-id"]/@data-lab-id')
    length = len(document_num)
    print('there are {} document in this course'.format(length))

    for count, lab_id in enumerate(document_num, start=1):
        document_url = class_url + '/labs/' + lab_id + '/document'
        try:
            driver.get(document_url)
            # Fixed pause before capturing (presumably to let the page render).
            time.sleep(3)
            driver.save_screenshot('{}/{}of{}.png'.format(path, count, length))  # image copy
            download(driver, '{}/{}of{}.pdf'.format(path, count, length))  # PDF copy
        except Exception as exc:
            # Best effort: report the failure and carry on with the next
            # document instead of silently abandoning the rest of the course.
            print('failed to save page {} of this class: {}'.format(count, exc))
        else:
            print('{} page of this class is saved'.format(count))
        time.sleep(1)
def class_per_page(tag, page):
    """Process every course linked from one listing page of a category.

    Args:
        tag: Course category used in the listing query string.
        page: Page number of the listing (inserted into the URL as given).
    """
    # Listing of member (paid) courses; the original author kept a variant
    # with fee=free in the query string for listing free courses instead.
    listing_template = 'https://www.targetnet.com/courses/?category=all&course_type=all&fee=member&tag={}&page={}'
    response = s.get(listing_template.format(tag, page))
    listing = etree.HTML(response.text)
    course_links = listing.xpath('/html/body/div[3]/div/div/div[2]/div[3]/div/a/@href')
    print('dealing with page {} of {}:'.format(page, tag))
    for href in course_links:
        parse_class_page(tag, href)
        # Short pause between consecutive courses.
        time.sleep(1)
def all_class(tag):
    """Walk through all listing pages of a course category.

    The first listing page is fetched to read the page count from the
    pagination links; ``class_per_page`` is then called for each page number
    from 1 up to that count.

    Args:
        tag: Course category used in the listing query string.
    """
    # Member (paid) listing; the original author kept a fee=free variant of
    # this URL for free courses.
    tag_url = 'https://www.targetnet.com/courses/?category=all&course_type=all&fee=member&tag={}&page=1'.format(tag)
    print('page url of {} is {}'.format(tag, tag_url))
    tag_page = etree.HTML(s.get(tag_url).text)

    page_links = tag_page.xpath('//html/body/div[3]/div/div/div[2]/nav/ul/li/a/text()')
    try:
        # The third pagination entry from the end is read as the last page number.
        page_num = int(page_links[-3])
    except (IndexError, ValueError):
        # Too few pagination links or a non-numeric label: fall back to a
        # single page.  Only these two errors are caught so that Ctrl-C and
        # genuine bugs are no longer swallowed.
        page_num = 1
    print('there are {} pages in {}'.format(page_num, tag))

    for page in range(1, page_num + 1):
        class_per_page(tag, str(page))
# Interactive entry point: collect category names from the user, then download
# each one.  Everything runs inside try/finally so the PhantomJS process is
# always shut down, even on Ctrl-C or an unexpected error.
tag_list = []
try:
    while True:
        tag = input('需要下载的课程类别(输入‘q’表示结束):')
        if tag == 'q':
            break
        tag_list.append(tag)

    for tag in tag_list:
        if not os.path.exists(tag):
            os.makedirs(tag)
            print('directory created')
        try:
            print('now downloading{}'.format(tag))
            all_class(tag)
        except Exception as exc:
            # Best effort per category: report the failure and move on to the
            # next one instead of hiding it.
            print('failed to download {}: {}'.format(tag, exc))
finally:
    # PhantomJS keeps running as a separate process unless it is quit explicitly.
    driver.quit()