python网络爬虫实战--重点整理

最新推荐文章于 2025-07-21 16:41:03 发布

原创最新推荐文章于 2025-07-21 16:41:03 发布 · 1k 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#网络爬虫

python 专栏收录该内容

9 篇文章

订阅专栏

本文介绍了Python爬虫的基础知识，包括使用urllib2模块发起HTTP请求并处理响应的方法，如何设置超时时间，如何使用代理服务器访问目标网址，以及如何修改请求头以欺骗网站识别。

第四章--python爬虫常用模块

urllib2.urlopen(url, timeout) 向 url 发起请求并返回响应对象；timeout 用来设置超时时间（单位：秒）。

#! python2.7
#-*- coding:utf-8 -*-


import urllib2

def linkBaidu():
    '''Fetch the Baidu home page, save its body and print response metadata.

    The body is written to ``baiduResponse.txt`` in the current working
    directory. The final URL, the HTTP status code and the response headers
    are then printed.

    Raises:
        SystemExit: with status 1 when the request fails with
            ``urllib2.URLError``.
    '''
    url='http://www.baidu.com'
    try:
        response=urllib2.urlopen(url,timeout=4)   # timeout is given in seconds
    except urllib2.URLError:
        print("网络地址错误")
        # Exit with a non-zero status so the failure is visible to the shell.
        raise SystemExit(1)
    try:
        # Binary mode: read() returns the raw payload, which must not go
        # through newline translation on platforms that have a text mode.
        with open('baiduResponse.txt','wb') as fp:
            fp.write(response.read())
        print(response.geturl())    # URL of the resource that was retrieved
        print(response.getcode())   # HTTP status code
        print(response.info())      # response headers
    finally:
        # Release the connection even if writing or printing fails.
        response.close()


# Script entry point: run the demo only when this file is executed directly.
if __name__ == "__main__":
    linkBaidu()

使用代理服务器来访问url
网上有免费的代理服务器可用，但是通过这些代理调用urlopen()时很容易出现<urlopen error timed out>，所以需要循环调用urlopen()进行重试。

#-*-coding:utf-8 -*-
'''
测试代理proxy是否有效
'''
import urllib2,re

class TestProxy(object):
    '''Check whether an HTTP(S) proxy can be used to fetch a test page.

    Creating an instance validates the proxy address and then immediately
    tries to download ``self.url`` through it, printing whether the expected
    keyword was found in the response body.
    '''

    def __init__(self,proxy):
        '''Validate *proxy* and run the availability test.

        Args:
            proxy: proxy address written as ``http://a.b.c.d:port`` or
                ``https://a.b.c.d:port``.

        Raises:
            SystemExit: if the address fails validation.
        '''
        self.proxy=proxy
        self.checkProxyFormat(self.proxy)
        self.url='http://www.baidu.com'
        self.timeout=4
        self.keyword='百度'    # word searched for in the returned page data
        self.useProxy(proxy)

    @staticmethod
    def _isCanonicalInt(text,low,high):
        '''Return True if *text* is an integer in [low, high] written without leading zeros.'''
        try:
            value=int(text)
        except ValueError:
            return False
        # str(value)==text rejects forms such as '01' or '+1'.
        return str(value)==text and low<=value<=high

    def checkProxyFormat(self,proxy):
        '''Validate the textual form and numeric ranges of a proxy address.

        Accepted form: ``http[s]://A.B.C.D:PORT`` where A is 1-255, B and C
        are 0-255, D is 1-254 and PORT is 1-65535, all without leading zeros.

        Args:
            proxy: the proxy address string to validate.

        Returns:
            None. A confirmation message is printed on success.

        Raises:
            SystemExit: with status 1 when the address is not acceptable.
        '''
        # Dots are escaped so they match only a literal '.', and \Z (rather
        # than $) is used so a trailing newline is not silently tolerated.
        pattern=re.compile(r'^(https?)://(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3}):(\d{1,5})\Z')
        match=pattern.search(proxy)
        if match is None:
            print("你输入的代理地址格式不正确")
            raise SystemExit(1)
        octets=match.groups()[1:5]
        port=match.group(6)
        octetRanges=((1,255),(0,255),(0,255),(1,254))
        valid=all(self._isCanonicalInt(octet,low,high)
                  for octet,(low,high) in zip(octets,octetRanges))
        # 65535 is the highest valid port number and must be accepted.
        valid=valid and self._isCanonicalInt(port,1,65535)
        if not valid:
            # Report the problem instead of exiting without any message.
            print("你输入的代理地址格式不正确")
            raise SystemExit(1)
        print('输入的http代理服务器符合标准')

    def useProxy(self,proxy):
        '''Fetch ``self.url`` through *proxy* and print whether the proxy works.

        The request is attempted up to 10 times, each with ``self.timeout``
        seconds, because free proxies time out frequently. The proxy is
        reported as usable when ``self.keyword`` is found in the body, and
        as unusable when it is not found or when every attempt fails.

        Side effect: installs a process-wide default urllib2 opener that
        routes requests through the proxy.

        Args:
            proxy: validated proxy address, e.g. ``http://163.125.68.237:8888``.
        '''
        protocol=proxy.split('//')[0].replace(':','')
        ip=proxy.split('//')[1]
        print(protocol,ip)
        # build_opener() returns an object whose open() method works like
        # urlopen(); install_opener() makes it the global default opener, so
        # the urllib2.urlopen() calls below use it.
        # NOTE(review): the handler is keyed by the proxy's own scheme, so an
        # https:// proxy is presumably not applied to the http:// test URL -
        # confirm against the ProxyHandler documentation.
        opener=urllib2.build_opener(urllib2.ProxyHandler({protocol:ip}))  # e.g. protocol: http, ip: 163.125.68.237:8888
        urllib2.install_opener(opener)
        body=None   # stays None when no attempt succeeds
        for _ in range(10):
            try:
                response=urllib2.urlopen(self.url,timeout=self.timeout)
                try:
                    body=response.read()
                finally:
                    response.close()
                break
            except Exception as e:
                # Deliberately broad: any failure just triggers another attempt.
                print(e)
        if body is not None and re.search(self.keyword,body):
            print("已提取特征词，该代理可用")
        else:
            print('该代理不可用')

# Script entry point: test a sample proxy when this file is executed directly.
if __name__ == "__main__":
    proxy = r'http://163.125.68.237:8888'
    TestProxy(proxy)

修改header
网站通过浏览器发送的User-Agent值来识别浏览器的身份，有些网站不允许程序直接访问，因此发送请求时需要修改User-Agent来伪装成浏览器；利用add_header()可以添加请求头。同一网站会针对不同的浏览器返回不同的内容。
下面程序是用IE和手机版的UC访问有道翻译。

#-*-coding:utf-8 -*-
import userAgents
import urllib2

class ModifyHeader(object):
    '''Request one URL with different User-Agent headers and save each response.

    Creating an instance looks up two header strings in the ``userAgents``
    module and requests ``self.url`` once with each. The responses are
    written to ``1.html`` and ``2.html``.
    '''

    def __init__(self):
        '''Look up the two User-Agent strings and fetch the page with each.'''
        # assumes each entry is a "Header-Name:value" string - verify against userAgents
        piua=userAgents.pcUserAgent.get('IE 9.0')
        muua=userAgents.mobileUserAgent.get('UC standard')
        print('piua: '+piua)
        self.url='http://fanyi.youdao.com'
        self.userAgent(piua,1)
        self.userAgent(muua,2)

    def userAgent(self,agent,name):
        '''Fetch ``self.url`` with the given header and save the result.

        Args:
            agent: header in the form ``Header-Name:value``.
            name: value used to build the output file name ``<name>.html``.

        Raises:
            IndexError: if *agent* contains no ':' separator.
        '''
        request=urllib2.Request(self.url)
        # Split on the first ':' only, so a header value that itself contains
        # a colon is passed through intact instead of being truncated.
        parts=agent.split(':',1)
        request.add_header(parts[0],parts[1])

        response=urllib2.urlopen(request)
        try:
            filename=str(name)+'.html'
            with open(filename,'w') as fp:
                # First line records which header produced this page.
                fp.write('%s\n\n'%agent)
                fp.write(response.read())
        finally:
            # Release the connection even if writing the file fails.
            response.close()

# Script entry point: run the header demo when this file is executed directly.
if __name__ == "__main__":
    ModifyHeader()

getpass.getuser()

返回当前用户名。这个函数会按顺序检查环境变量LOGNAME, USER, LNAME和USERNAME。返回第一个非空的值。如果检查不到非空的值，模块会尝试导入pwd模块，如果系统支持pwd模块，会返回通过pwd模块获取的用户名，否则报错。
re模块
\A：仅匹配字符串开头，如\Aabc
\Z：仅匹配字符串结尾，如abc\Z