模拟登录，抓取会员课程

最新推荐文章于 2023-03-05 15:33:58 发布

原创最新推荐文章于 2023-03-05 15:33:58 发布 · 466 阅读

CC 4.0 BY-SA版权

基于 Python，使用 selenium 驱动 PhantomJS，用会员账号和密码模拟登录某课程网站，根据课程类别检索课程，找到课程文档，再用 PhantomJS 将文档截图或者保存为 PDF。

关于cookie： http://blog.youkuaiyun.com/falseen/article/details/46962011

import requests
import time
import os
from lxml import etree
from selenium import webdriver
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# Log in through a headless browser: open the login page, fill in the
# account and password fields, then press RETURN in the password field.
# NOTE(review): this placeholder host ('www._thetargetnet_.com') differs from
# the 'www.targetnet.com' host used by the fetch functions below - confirm.
driver=webdriver.PhantomJS()#use selenium to drive the headless PhantomJS browser
driver.get('http://www._thetargetnet_.com/login')
account=driver.find_element_by_name('login')#form field whose name attribute is 'login'
account.send_keys('your_account')
password=driver.find_element_by_name('password')#form field whose name attribute is 'password'
password.send_keys('your_password')
password.send_keys(Keys.RETURN)#presumably submits the login form - no wait follows

def cookie_string_to_dict(raw):
	"""Convert a raw ``Cookie`` header string into a ``{name: value}`` dict.

	Segments are separated by ``;``. Whitespace around each segment, name
	and value is stripped, so the usual ``"a=1; b=2"`` form does not yield a
	name with a leading space. Only the first ``=`` splits name from value,
	so values may themselves contain ``=``. Empty segments (for example
	after a trailing ``;``) and segments without ``=`` are skipped instead
	of raising an unpacking error.

	Args:
		raw: cookie string as copied from the browser after logging in.

	Returns:
		dict mapping cookie names to cookie values (both ``str``).
	"""
	parsed={}
	for segment in raw.split(';'):
		name,separator,value=segment.strip().partition('=')
		name=name.strip()
		if not separator or not name:
			continue
		parsed[name]=value.strip()
	return parsed

raw_cookie='copy the cookies here after you login in'#the cookie string has to be saved by hand after logging in
cookies=cookie_string_to_dict(raw_cookie)#dict form of the cookie string
if not cookies:
	# An unedited placeholder (or any string without name=value pairs) ends up
	# here; say so instead of failing with an opaque unpacking error.
	print('warning: no cookies parsed from raw_cookie; requests will be sent without login cookies')

# One shared session lets all later requests reuse the same cookies.
s=requests.Session()
# The session stores cookies as a cookiejar, so the plain dict is converted
# first (a new jar is created; existing entries would be overwritten).
cookies_new=requests.utils.cookiejar_from_dict(cookies)
s.cookies=cookies_new

def download(driver,target_path):
	"""Render the page currently loaded in ``driver`` to ``target_path``.

	Registers the ``executePhantomScript`` command on the driver's command
	executor, sets the paper size to A4 portrait and then asks the page to
	render itself to ``target_path`` (approach taken from Stack Overflow).

	Args:
		driver: PhantomJS webdriver that already has the page loaded.
		target_path: output file path handed to the page's ``render`` call.

	Returns:
		None.
	"""
	import json

	def execute(script,args):
		driver.execute('executePhantomScript',{'script':script,'args':args})

	driver.command_executor._commands['executePhantomScript']=('POST','/session/$sessionId/phantom/execute')
	execute('this.paperSize={format:"A4",orientation:"portrait"};',[])

	# json.dumps emits a quoted, fully escaped string literal, so a path that
	# contains quotes or backslashes (it includes a scraped course name)
	# cannot break out of the script. Plain paths produce the same script text
	# as simple "..." quoting would.
	execute('this.render({})'.format(json.dumps(target_path)),[])

def parse_class_page(tag,course_num):
	"""Save every lab document of one course as a PNG screenshot and a PDF.

	The course page is fetched with the shared requests session ``s``.
	Courses whose category label is "训练营" are skipped. For any other
	course the directory ``./<tag>/free_<course name>`` is created and each
	lab document is opened in the shared PhantomJS ``driver`` to be captured.

	Args:
		tag: course category; used as the top-level output directory name.
		course_num: course path taken from a listing page ``href``; it is
			appended to the site root to form the course URL.

	Returns:
		None. Progress and failures are reported with ``print``.
	"""
	class_url='https://www.targetnet.com'+course_num
	class_page=etree.HTML(s.get(class_url).text)
	if class_page is None:
		# Nothing parsable came back; skip this course rather than let an
		# AttributeError abort the whole category.
		print('skip {}: page could not be parsed'.format(class_url))
		return

	course_cat=class_page.xpath('/html/body/div[3]/div/div[1]/div[2]/div[1]/h4/span[2]/text()')
	if course_cat and course_cat[0]=="训练营":
		print('this is 训练营，需要专门购买')
		return

	course_name=class_page.xpath('/html/body/div[3]/div/div[1]/div[2]/div[1]/h4/span[1]/text()')
	if not course_name:
		# Without a title there is no directory name to save under.
		print('skip {}: course name not found'.format(class_url))
		return
	course_name=course_name[0]
	print('the course dealing with:{}'.format(course_name))

	path='./{}/{}'.format(tag,'free_'+course_name)
	os.makedirs(path,exist_ok=True)

	document_num=class_page.xpath('//*[@id="reports"]/span[@class="lab-id"]/@data-lab-id')
	length=len(document_num)
	print('there are {} document in this course'.format(length))
	for count,lab_id in enumerate(document_num,1):
		document_url=class_url+'/labs/'+lab_id+'/document'
		try:
			driver.get(document_url)
			time.sleep(3)#fixed pause before capturing - presumably lets the page finish rendering
			driver.save_screenshot('{}/{}of{}.png'.format(path,count,length))#image version
			download(driver,'{}/{}of{}.pdf'.format(path,count,length))#pdf version
		except Exception as exc:
			# Best effort: report the failure and carry on with the next
			# document instead of silently dropping the rest of the course.
			print('failed to save page {} of this class: {!r}'.format(count,exc))
			continue
		print('{} page of this class is saved'.format(count))
	time.sleep(1)
def class_per_page(tag,page):
	"""Process every course linked from one listing page of a category.

	Args:
		tag: course category inserted into the listing URL.
		page: page number of the listing, inserted into the URL as given.

	Returns:
		None.
	"""
	# Member-only listing; the free listing uses fee=free in the query string.
	listing_url='https://www.targetnet.com/courses/?category=all&course_type=all&fee=member&tag={}&page={}'.format(tag,page)
	listing=etree.HTML(s.get(listing_url).text)
	course_links=listing.xpath('/html/body/div[3]/div/div/div[2]/div[3]/div/a/@href')
	print('dealing with page {} of {}:'.format(page,tag))
	for course_href in course_links:
		parse_class_page(tag,course_href)
		time.sleep(1)#pause between courses
	
def all_class(tag):
	"""Walk every listing page of one course category.

	Fetches page 1 of the member-only listing for ``tag``, reads the page
	count from the pagination links and calls ``class_per_page`` for each
	page from 1 to that count.

	Args:
		tag: course category to download.

	Returns:
		None.
	"""
	# Member-only listing; the free listing uses fee=free in the query string.
	tag_url='https://www.targetnet.com/courses/?category=all&course_type=all&fee=member&tag={}&page=1'.format(tag)
	print('page url of {} is {}'.format(tag,tag_url))
	tag_page=etree.HTML(s.get(tag_url).text)
	try:
		# The third-from-last pagination link text is read as the page count -
		# presumably the links after it are navigation arrows; verify.
		page_num=int(tag_page.xpath('//html/body/div[3]/div/div/div[2]/nav/ul/li/a/text()')[-3])
	except (IndexError,ValueError,AttributeError):
		# No usable pagination (too few links, non-numeric text, or an
		# unparsable page): treat the category as a single page. Only these
		# expected failures are caught so Ctrl-C still interrupts.
		page_num=1
	print('there are {} pages in {}'.format(page_num,tag))
	for page in range(1,page_num+1):
		class_per_page(tag,str(page))

# Ask for the categories to download, then process them one by one. The
# try/finally guarantees that the PhantomJS process is shut down even when the
# run is interrupted (for example Ctrl-C at the prompt).
tag_list=[]
try:
	while True:
		tag=input('需要下载的课程类别（输入‘q’表示结束）:').strip()
		if tag=='q':
			break
		if not tag:
			# An empty answer would make os.makedirs('') fail later on.
			continue
		tag_list.append(tag)
	for tag in tag_list:
		if not os.path.exists(tag):
			os.makedirs(tag)
			print('directory created')
		try:
			print('now downloading{}'.format(tag))
			all_class(tag)
		except Exception as exc:
			# Best effort per category: report the failure and move on to the
			# next one instead of hiding it.
			print('failed to download {}: {!r}'.format(tag,exc))
finally:
	driver.quit()#always shut the PhantomJS browser down at the end