python爬虫小实例

使用Python爬取网页数据

最新推荐文章于 2025-03-10 12:19:36 发布

hello小工

最新推荐文章于 2025-03-10 12:19:36 发布

阅读量400

点赞数

CC 4.0 BY-SA版权

本文链接：https://blog.youkuaiyun.com/prom1201/article/details/50814810

本文介绍了一种利用Python中的urllib2库和BeautifulSoup进行网页数据抓取的方法。通过构造请求头和设置代理服务器来获取指定链接的内容，并尝试解析网页以下载图片等资源。

from bs4 import BeautifulSoup
import urllib2
import urllib, os, re, time, sys
#import socket

def build_request(link):
# user_agent = 'Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11'

values = {'name' : 'Michael Foord',

'location' : 'Northampton',

'language' : 'Python' }

headers = { 'User-Agent' : 'Custom User-Agent' }

data=urllib.urlencode(values)

req = urllib2.Request(link, data, headers)

req.add_unredirected_header('User-Agent', 'Custom User-Agent')

return req

def build_urllib2(link):

print time.strftime('%Y-%m-%d_%H:%M:%S',time.localtime(time.time()))

#set timeout

urllib2.socket.setdefaulttimeout(60)

#set proxy

null_proxy_handler = urllib2.ProxyHandler({"http" : 'http://10.159.32.155:8080'})
# null_proxy_handler = urllib2.ProxyHandler({"http" : 'http://10.144.1.10:8080'})

null_proxy_handler = urllib2.ProxyHandler({})

opener = urllib2.build_opener(null_proxy_handler)

urllib2.install_opener(opener)

print 'after install opener'

print time.strftime('%Y-%m-%d_%H:%M:%S',time.localtime(time.time()))

def get_img(link, path):
    """Open *link* and print diagnostics about the response.

    The request is built with ``build_request`` and opened with
    ``urllib2.urlopen``. On success the response object is printed and then
    closed. On an HTTP error status the error's code, message, headers, URL
    and body are printed instead. Nothing is parsed or written to disk here.

    Args:
        link: URL to fetch.
        path: Destination directory for downloads; currently unused.

    Returns:
        None.

    Raises:
        Any exception other than ``urllib2.HTTPError`` raised while building
        or opening the request propagates to the caller.
    """
    print('start to download ' + link)

    try:
        print(link)
        req = build_request(link)
        print('start to build_request')
        content = urllib2.urlopen(req)
    except urllib2.HTTPError as e:
        print('Error happened')
        print(e.code)
        print(e.msg)
        print(e.headers)
        print(e.url)
        print(e.fp.read())
    else:
        # Release the underlying connection once we are done with the
        # response, even if printing it fails.
        try:
            print('start to create soup about ')
            print(content)
        finally:
            content.close()

# Directory handed to get_img() as the download location.
mypath = "D:\\python\\gif\\test\\img"

# Page to fetch. Alternatives left commented out by the author:
#   http://www.163.com
#   http://blog.chinaunix.net/uid-7448695-id-2626493.html
weblink = "http://qt.gtimg.cn/q=s_sh601899"

# Set up urllib2 (timeout and opener) first, then fetch the page.
build_urllib2(weblink)
get_img(weblink, mypath)