#!/usr/bin/python
# -*- coding: utf-8 -*-
import re
import cookielib
import urllib
import urllib2
import optparse
#------------------------------------------------------------------------------
# check whether every cookie named in cookieNameList exists in cookieJar
def checkAllCookiesExist(cookieNameList, cookieJar):
    cookiesDict = {}
    for eachCookieName in cookieNameList:
        cookiesDict[eachCookieName] = False
    allCookieFound = True
    for cookie in cookieJar:
        if cookie.name in cookiesDict:
            cookiesDict[cookie.name] = True
    for eachCookie in cookiesDict.keys():
        if not cookiesDict[eachCookie]:
            allCookieFound = False
            break
    return allCookieFound
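#------------------------------------------------------------------------------
# A minimal usage sketch of checkAllCookiesExist (the demo function below is
# illustrative only and is not called anywhere): fetch a page through a
# cookie-aware opener, then ask the jar whether a given set of cookie names
# was set by the server.
def demoCheckCookies(cookieNames):
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    opener.open("http://58921.com/")
    # True only if every name in cookieNames is now present in the jar
    print checkAllCookiesExist(cookieNames, cj)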
#------------------------------------------------------------------------------
# just print a delimiter line
def printDelimiter():
    print '-' * 80
#------------------------------------------------------------------------------
# main function: emulate logging into 58921 (adapted from a Baidu login demo)
def emulateLogin():
    print "Function: demonstrate how to use Python code to emulate logging into the site http://58921.com/"
    print "Usage: emulate_login_baidu_python.py -u yourUsername -p yourPassword"
    printDelimiter()
    # parse input parameters
    parser = optparse.OptionParser()
    parser.add_option("-u", "--username", action="store", type="string", default='', dest="username", help="Your 58921 username")
    parser.add_option("-p", "--password", action="store", type="string", default='', dest="password", help="Your 58921 password")
    (options, args) = parser.parse_args()
    # bind the options we use to local names; the original exec-over-dir(options)
    # trick is fragile, since dir() also yields the object's method names
    username = options.username
    password = options.password
    printDelimiter()
    print "[preparation] use CookieJar & HTTPCookieProcessor to handle cookies automatically"
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)
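    # note: install_opener makes this cookie-carrying opener the process-wide
    # default, so every plain urllib2.urlopen() below (and in any script that
    # imports this module) reuses the same session cookies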
    printDelimiter()
    print "[step1] visit the 58921 main page to get the initial cookies"
    mainUrl = "http://58921.com/"
    resp = urllib2.urlopen(mainUrl)
    for index, cookie in enumerate(cj):
        print '[', index, ']', cookie
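    # at this point the jar holds whatever cookies the site set on the
    # anonymous first visit; they are printed above for inspection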
    printDelimiter()
    print "[step2] get the form_token value from the login page"
    gettokenUrl = "http://58921.com/user/login"
    gettokenResp = urllib2.urlopen(gettokenUrl)
    gettokenRespHtml = gettokenResp.read()
    #print gettokenRespHtml
    foundTokenVal = re.search(
        "<input type=\"hidden\" name=\"form_id\" value=\"user_login_form\"/><input type=\"hidden\" name=\"form_token\" value=\"(?P<tokenVal>\w+)\"",
        gettokenRespHtml)
#print "getapiRespHtml=",getapiRespHtml;
#bdPass.api.params.login_token='5ab690978812b0e7fbbe1bfc267b90b3';
#foundTokenVal = re.search("bdPass\.api\.params\.login_token='(?P<tokenVal>\w+)';", getapiRespHtml);
    if foundTokenVal:
        tokenVal = foundTokenVal.group("tokenVal")
        print "form_token =", tokenVal
        printDelimiter()
        print "[step3] emulate login to 58921"
        loginUrl = "http://58921.com/user/login/ajax?ajax=submit&__q=user/login"
        postDict = {
            'form_id': "user_login_form",
            # pass the raw UTF-8 text; urlencode below percent-encodes it to
            # %E7%99%BB%E5%BD%95 (passing the pre-encoded value would get it
            # encoded a second time)
            'submit': "登录",
            'form_token': tokenVal,  # e.g. de3dbf1e8596642fa2ddf2921cd6257f
            'mail': username,
            'pass': password,
        }
        # urlencode percent-encodes each parameter value, e.g.
        # http://www.baidu.com/cache/user/html/jump.html becomes
        # http%3A%2F%2Fwww.baidu.com%2Fcache%2Fuser%2Fhtml%2Fjump.html
        postData = urllib.urlencode(postDict)
        #print "postData=", postData
        req = urllib2.Request(loginUrl, postData)
        # in most cases a POST request carries the content type
        # application/x-www-form-urlencoded
        req.add_header('Content-Type', "application/x-www-form-urlencoded;charset=UTF-8")
        resp = urllib2.urlopen(req)
        #for index, cookie in enumerate(cj):
        #    print '[', index, ']', cookie
        cookiesToCheck = ['remember', 'time']
        loginOK = checkAllCookiesExist(cookiesToCheck, cj)
        if loginOK:
            print "+++ Emulate login to 58921 is OK, ^_^"
        else:
            print "--- Failed to emulate login to 58921 !"
    else:
        print "Failed to extract the token value from html =", gettokenRespHtml
if __name__ == "__main__":
    emulateLogin()
The above emulates logging into 58921, adapted from cfi's code for emulating login to the Baidu home page. Once logged in, the next script crawls the movie box office data from the site.
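Because emulateLogin() ends with urllib2.install_opener(), any later urllib2.urlopen() in the same process automatically carries the session cookies. A minimal sketch of that hand-off, assuming the script above is saved as login.py (which is what the crawler's import below expects; film id 1 is just an example):

import urllib2
import login  # the login script above, saved as login.py

login.emulateLogin()  # parses -u/-p from the command line, installs the opener
resp = urllib2.urlopen("http://58921.com/content/film/1/boxoffice")
print resp.getcode()  # 200 if the page was fetched with the logged-in session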
#!/usr/bin/python
# -*- coding: utf-8 -*-
#---------------------------------import---------------------------------------
import urllib2
import re
import login
from BeautifulSoup import BeautifulSoup
#from HTMLParser import HTMLParser
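# note: "from BeautifulSoup import BeautifulSoup" is the BeautifulSoup 3 API
# (version 4 would be "from bs4 import BeautifulSoup"), and "import login"
# expects the login script above to be saved as login.py next to this file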
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
def main():
    outFile = open("C:/test/testfilm.txt", 'w')
    for j in range(1100):
        userMainUrl = "http://58921.com/content/film/" + str(j) + "/boxoffice"
        print j
        try:
            req = urllib2.Request(userMainUrl)
            resp = urllib2.urlopen(req)
        except urllib2.HTTPError, e:
            print "the page was not found!"
            print "error code:", e.code
            print "returned content:", e.read()
            continue
        except urllib2.URLError, e:
            print "failed to reach the server"
            print "the reason:", e.reason
            continue
        else:
            respHtml = resp.read()
            #print "respHtml=", respHtml
            # use the third-party lib BeautifulSoup to extract info from the html
            htmlEncoding = "GB2312"  # encoding assumed when decoding the page
            soup = BeautifulSoup(respHtml, fromEncoding=htmlEncoding)
            foundFilmTable = soup.find(attrs={"class": "movie_chart_header_title"})
            #print "foundFilmTable=", foundFilmTable
            if foundFilmTable:
                filmTableSoup = foundFilmTable
                foundAllh2 = filmTableSoup.findAll("h2")
                if foundAllh2:
                    curFilm = foundAllh2[0]
                    print curFilm
                    if curFilm.string:
                        # encode unicode text before writing to the file
                        outFile.write(curFilm.string.encode('utf-8'))
                        outFile.write("\n")
                foundAlltr = filmTableSoup.findAll("tr")
                for curTr in foundAlltr:
                    foundAlltd = curTr.findAll("td")
                    for curTd in foundAlltd:
                        # descend into nested tags until curTd is the innermost
                        # tag of this cell; catch IndexError too so that empty
                        # cells do not crash the descent
                        while curTd:
                            try:
                                curTd.contents[0].contents[0]
                            except (AttributeError, IndexError), e:
                                break
                            else:
                                curTd = curTd.contents[0]
                        if curTd.string:
                            outFile.write(curTd.string.encode('utf-8'))
                        outFile.write("\t")
                    outFile.write("\n")
    outFile.close()
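# resulting file layout: one line per film with its title (from the <h2>),
# followed by the table rows, one per line, with cell texts separated by tabs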
###############################################################################
if __name__ == "__main__":
    login.emulateLogin()
    main()
This article shows how to use Python to emulate logging into a website and then scrape specific information from it. The sample code demonstrates how the urllib2, re, and BeautifulSoup libraries implement the login step and the subsequent data scraping; the key points are parsing the pages and extracting the desired data.