#!/usr/bin/python
# -*- coding: utf-8 -*-
import re
import cookielib
import urllib
import urllib2
import optparse
#------------------------------------------------------------------------------
# check whether every cookie named in cookieNameList exists in cookieJar
def checkAllCookiesExist(cookieNameList, cookieJar):
    cookiesDict = {}
    for eachCookieName in cookieNameList:
        cookiesDict[eachCookieName] = False
    allCookieFound = True
    for cookie in cookieJar:
        if cookie.name in cookiesDict:
            cookiesDict[cookie.name] = True
    for eachCookie in cookiesDict.keys():
        if not cookiesDict[eachCookie]:
            allCookieFound = False
            break
    return allCookieFound
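#------------------------------------------------------------------------------
# A minimal usage sketch of checkAllCookiesExist (the demo function below is
# illustrative only and is not called anywhere): fetch a page through a
# cookie-aware opener, then ask the jar whether a given set of cookie names
# was set by the server.
def demoCheckCookies(cookieNames):
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    opener.open("http://58921.com/")
    # True only if every name in cookieNames is now present in the jar
    print checkAllCookiesExist(cookieNames, cj)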
#------------------------------------------------------------------------------
# just print a delimiter line
def printDelimiter():
    print '-' * 80
#------------------------------------------------------------------------------
# main function: emulate logging into 58921 (adapted from a Baidu login demo)
def emulateLogin():
    print "Function: demonstrate how to use Python code to emulate logging into the site http://58921.com/"
    print "Usage: emulate_login_baidu_python.py -u yourUsername -p yourPassword"
    printDelimiter()
    # parse input parameters
    parser = optparse.OptionParser()
    parser.add_option("-u", "--username", action="store", type="string", default='', dest="username", help="Your 58921 username")
    parser.add_option("-p", "--password", action="store", type="string", default='', dest="password", help="Your 58921 password")
    (options, args) = parser.parse_args()
    # bind the options we use to local names; the original exec-over-dir(options)
    # trick is fragile, since dir() also yields the object's method names
    username = options.username
    password = options.password
    printDelimiter()
    print "[preparation] use CookieJar & HTTPCookieProcessor to handle cookies automatically"
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)
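    # note: install_opener makes this cookie-carrying opener the process-wide
    # default, so every plain urllib2.urlopen() below (and in any script that
    # imports this module) reuses the same session cookies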
    printDelimiter()
    print "[step1] visit the 58921 main page to get the initial cookies"
    mainUrl = "http://58921.com/"
    resp = urllib2.urlopen(mainUrl)
    for index, cookie in enumerate(cj):
        print '[', index, ']', cookie
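    # at this point the jar holds whatever cookies the site set on the
    # anonymous first visit; they are printed above for inspection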
    printDelimiter()
    print "[step2] get the form_token value from the login page"
    gettokenUrl = "http://58921.com/user/login"
    gettokenResp = urllib2.urlopen(gettokenUrl)
    gettokenRespHtml = gettokenResp.read()
    #print gettokenRespHtml
    foundTokenVal = re.search(
        "<input type=\"hidden\" name=\"form_id\" value=\"user_login_form\"/><input type=\"hidden\" name=\"form_token\" value=\"(?P<tokenVal>\w+)\"",
        gettokenRespHtml)
#print "getapiRespHtml=",getapiRespHtml;
#bdPass.api.params.login_token='5ab690978812b0e7fbbe1bfc267b90b3';
#foundTokenVal = re.search("bdPass\.api\.params\.login_token='(?P<tokenVal>\w+)';", getapiRespHtml);
    if foundTokenVal:
        tokenVal = foundTokenVal.group("tokenVal")
        print "form_token =", tokenVal
        printDelimiter()
        print "[step3] emulate login to 58921"
        loginUrl = "http://58921.com/user/login/ajax?ajax=submit&__q=user/login"
        postDict = {
            'form_id': "user_login_form",
            # pass the raw UTF-8 text; urlencode below percent-encodes it to
            # %E7%99%BB%E5%BD%95 (passing the pre-encoded value would get it
            # encoded a second time)
            'submit': "登录",
            'form_token': tokenVal,  # e.g. de3dbf1e8596642fa2ddf2921cd6257f
            'mail': username,
            'pass': password,
        }
        # urlencode percent-encodes each parameter value, e.g.
        # http://www.baidu.com/cache/user/html/jump.html becomes
        # http%3A%2F%2Fwww.baidu.com%2Fcache%2Fuser%2Fhtml%2Fjump.html
        postData = urllib.urlencode(postDict)
        #print "postData=", postData
        req = urllib2.Request(loginUrl, postData)
        # in most cases a POST request carries the content type
        # application/x-www-form-urlencoded
        req.add_header('Content-Type', "application/x-www-form-urlencoded;charset=UTF-8")
        resp = urllib2.urlopen(req)
        #for index, cookie in enumerate(cj):
        #    print '[', index, ']', cookie
        cookiesToCheck = ['remember', 'time']
        loginOK = checkAllCookiesExist(cookiesToCheck, cj)
        if loginOK:
            print "+++ Emulate login to 58921 is OK, ^_^"
        else:
            print "--- Failed to emulate login to 58921 !"
    else:
        print "Failed to extract the token value from html =", gettokenRespHtml
if __name__ == "__main__":
    emulateLogin()
The above emulates logging into 58921, adapted from cfi's code for emulating login to the Baidu home page. Once logged in, the next script crawls the movie box office data from the site.
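Because emulateLogin() ends with urllib2.install_opener(), any later urllib2.urlopen() in the same process automatically carries the session cookies. A minimal sketch of that hand-off, assuming the script above is saved as login.py (which is what the crawler's import below expects; film id 1 is just an example):

import urllib2
import login  # the login script above, saved as login.py

login.emulateLogin()  # parses -u/-p from the command line, installs the opener
resp = urllib2.urlopen("http://58921.com/content/film/1/boxoffice")
print resp.getcode()  # 200 if the page was fetched with the logged-in session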
#!/usr/bin/python
# -*- coding: utf-8 -*-
#---------------------------------import---------------------------------------
import urllib2
import re
import login
from BeautifulSoup import BeautifulSoup
#from HTMLParser import HTMLParser
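# note: "from BeautifulSoup import BeautifulSoup" is the BeautifulSoup 3 API
# (version 4 would be "from bs4 import BeautifulSoup"), and "import login"
# expects the login script above to be saved as login.py next to this file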
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
def main():
    outFile = open("C:/test/testfilm.txt", 'w')
    for j in range(1100):
        userMainUrl = "http://58921.com/content/film/" + str(j) + "/boxoffice"
        print j
        try:
            req = urllib2.Request(userMainUrl)
            resp = urllib2.urlopen(req)
        except urllib2.HTTPError, e:
            print "the page was not found!"
            print "error code:", e.code
            print "returned content:", e.read()
            continue
        except urllib2.URLError, e:
            print "failed to reach the server"
            print "the reason:", e.reason
            continue
        else:
            respHtml = resp.read()
            #print "respHtml=", respHtml
            # use the third-party lib BeautifulSoup to extract info from the html
            htmlEncoding = "GB2312"  # encoding assumed when decoding the page
            soup = BeautifulSoup(respHtml, fromEncoding=htmlEncoding)
            foundFilmTable = soup.find(attrs={"class": "movie_chart_header_title"})
            #print "foundFilmTable=", foundFilmTable
            if foundFilmTable:
                filmTableSoup = foundFilmTable
                foundAllh2 = filmTableSoup.findAll("h2")
                if foundAllh2:
                    curFilm = foundAllh2[0]
                    print curFilm
                    if curFilm.string:
                        # encode unicode text before writing to the file
                        outFile.write(curFilm.string.encode('utf-8'))
                        outFile.write("\n")
                foundAlltr = filmTableSoup.findAll("tr")
                for curTr in foundAlltr:
                    foundAlltd = curTr.findAll("td")
                    for curTd in foundAlltd:
                        # descend into nested tags until curTd is the innermost
                        # tag of this cell; catch IndexError too so that empty
                        # cells do not crash the descent
                        while curTd:
                            try:
                                curTd.contents[0].contents[0]
                            except (AttributeError, IndexError), e:
                                break
                            else:
                                curTd = curTd.contents[0]
                        if curTd.string:
                            outFile.write(curTd.string.encode('utf-8'))
                        outFile.write("\t")
                    outFile.write("\n")
    outFile.close()
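# resulting file layout: one line per film with its title (from the <h2>),
# followed by the table rows, one per line, with cell texts separated by tabs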
###############################################################################
if __name__ == "__main__":
    login.emulateLogin()
    main()
This article shows how to use Python to emulate logging into a website and then scrape specific information from it. The sample code demonstrates how the urllib2, re, and BeautifulSoup libraries implement the login step and the subsequent data scraping; the key points are parsing the pages and extracting the desired data.