预备知识:
HTTP报文格式
cookie作用
熟悉 IE、Chrome 或 Firefox 的 F12 开发者工具
参考:
http://fc-lamp.blog.163.com/blog/static/174566687201110284220980?suggestedreading
http://www.crifan.com/files/doc/docbook/web_scrape_emulate_login/release/html/web_scrape_emulate_login.html
http://www.blogjava.net/hongqiang/archive/2012/08/01/384552.html
http://www.2cto.com/kf/201401/275152.html
____________________________________________________________________
模拟登录 codeforces
先手动登录,抓下发送 POST 的数据包,看看里面的首部(标头)和内容是什么
我们要做的就是模仿浏览器发包
首部和参数有些是可选的,不填也没关系,有一些则是必要的
首部只需要填Referer和User-Agent
不过代码里面我把第一次访问的代码也注释掉了,这样也能成功,说明登录 codeforces 并不需要事先获取 cookie。
而 postdata 参数只要求 action、handle 和 password。不过 csrf_token 还是很好找的,就在前端代码里面。
在调用 urllib2.urlopen 发包之前,不要忘了用 urllib.urlencode 将数据编码。
import urlparse
import HTMLParser
import urllib
import urllib2
import cookielib
import string
import re
# Login credentials -- placeholders, replace with a real handle and password.
myhandle = 'yourhandle'
mypassword = 'yourpassword'
# Target site: bare host name, site root, and the login form endpoint.
# `host` and `hosturl` are not referenced by the code visible in this file.
host = 'codeforces.com'
hosturl = 'http://codeforces.com/'
# NOTE(review): these are plain-http URLs. If the site redirects to https,
# urllib2 follows a redirected POST as a GET without the form body --
# confirm, and switch to https if that is the case.
posturl = 'http://codeforces.com/enter'
# Set by extract_csrf_token(); stays empty when that step is skipped.
csrf_token = ''
# Cookie processor: cookies received in responses are kept in the jar and
# sent back with later requests. The jar only lives in memory unless
# cj.save() is called.
cj = cookielib.LWPCookieJar()
cookie_support = urllib2.HTTPCookieProcessor(cj)
opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
# Install globally so that plain urllib2.urlopen() calls use this opener.
urllib2.install_opener(opener)
# helper function
def save2file(text, filename):
    """Write ``text`` to ``filename``, replacing any existing content.

    text: the string to write.
    filename: path of the file to create or truncate.

    Raises IOError (OSError on Python 3) if the file cannot be opened
    or written.
    """
    # ``with`` closes the handle even when write() raises.
    with open(filename, 'w') as out:
        out.write(text)
# 先访问 'http://codeforces.com/enter' 获取带有csrf_token的前端代码
def first_visit():
h = urllib2.urlopen(posturl)
print u"第一次访问,获取前端代码"
print h.geturl(), h.getcode()
print h.info()
text = h.read()
myfile = open('body', 'w')
myfile.write(text)
myfile.close()
# Extract the csrf_token from the page source saved in the file 'body'.
# The pattern matches the hidden form field  name='csrf_token' value='...'/>
# and captures the text between the quotes.
_CSRF_TOKEN_RE = re.compile(r"name='csrf_token' value='(.*?)'/>")


def extract_csrf_token():
    """Read 'body' and store its CSRF token in the global ``csrf_token``.

    Scans the file line by line and uses the first line that contains a
    complete ``name='csrf_token' value='<token>'/>`` field. A line where the
    field is truncated (no closing ``'/>``) is skipped instead of yielding a
    mis-sliced token.

    Returns the token string, or None when no line contains the field; in
    that case the global ``csrf_token`` is left untouched.

    Raises IOError (OSError on Python 3) if 'body' cannot be opened.
    """
    global csrf_token
    # ``with`` closes the file even if reading fails part-way.
    with open('body', 'r') as page:
        for line in page:
            found = _CSRF_TOKEN_RE.search(line)
            if found:
                csrf_token = found.group(1)
                return csrf_token
    return None
# Request headers and POST form fields for the login request.
# Only Referer and User-Agent are set; the User-Agent string imitates a
# desktop browser.
headers = {'Referer' : 'http://codeforces.com/enter',
'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'
}
# Form fields sent to the login endpoint. csrf_token is not included here.
postdata = {'action' : 'enter',
'handle' : myhandle,
'password' : mypassword
}
# URL-encode the form fields; from here on postdata is a string, not a dict.
postdata = urllib.urlencode(postdata)
if __name__ == '__main__':
#first_visit()
#extract_csrf_token()
req = urllib2.Request(posturl, postdata, headers)
response = urllib2.urlopen(req)
print u'返回信息'
print response.geturl(), response.getcode()
print response.info()
text = response.read()
save2file(text, 'response')
#cj.save('response_cookie')