# -*- coding:utf-8 -*-
from lxml import etree
import urllib2
import random
import urlparse
# Route plain-HTTP requests made through urllib2 via a local proxy.
proxy_info = dict(host='127.0.0.1', port=8080)
# ProxyHandler takes a mapping of URL scheme -> "host:port" proxy address;
# only the 'http' scheme is configured here.
proxy_support = urllib2.ProxyHandler(
    {'http': '{host}:{port:d}'.format(**proxy_info)}
)
# Installing the opener makes it the default one used by urllib2.urlopen().
openner = urllib2.build_opener(proxy_support)
urllib2.install_opener(openner)
# Pool of browser User-Agent strings to present to the server.
# The strings keep the whitespace that real browsers send: a value with the
# spaces stripped (e.g. "Gecko/20100101Firefox/4.0.1") is not a well-formed
# sequence of product tokens and is trivially recognisable as a fake.
user_agent_list = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) "
    "Gecko/20100101 Firefox/4.0.1",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 "
    "(KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; "
    "SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; "
    "SE 2.X MetaSr 1.0)",
]
# One User-Agent is picked at random when this line runs and is then reused
# for every request that is sent with ``ua_header``.
ua_header = {'User-Agent': random.choice(user_agent_list)}
# Fetch the Baidu home page, follow the tab labelled "音乐" and print the
# ``id`` query parameter of the first search-info link on the page it leads to.
url = 'http://www.baidu.com'


def _fetch_html(page_url, headers, timeout):
    """Download ``page_url`` and return the body parsed by ``etree.HTML``.

    The HTTP status code is printed before the body is read. The response is
    closed in all cases, including when reading or parsing raises.
    """
    response = urllib2.urlopen(urllib2.Request(page_url, headers=headers),
                               timeout=timeout)
    try:
        print(response.getcode())
        return etree.HTML(response.read())
    finally:
        response.close()


def _first(values, what):
    """Return ``values[0]``; raise ``LookupError(what)`` if it is empty/None."""
    if not values:
        raise LookupError(what)
    return values[0]


try:
    home_page = _fetch_html(url, ua_header, timeout=10)
    # Ask for the href of the element whose own text is "音乐" instead of
    # pairing two parallel lists (texts and hrefs) by position: elements
    # that have text but no href shift the positions, and a missing tab
    # would otherwise silently select an unrelated link.
    music_links = home_page.xpath(
        u"//div[@class='s_tab_inner']/*[text()='音乐']/@href")
    url_music = str(_first(music_links, '"音乐"标签链接'))

    music_page = _fetch_html(url_music, ua_header, timeout=60)
    responsive = music_page.xpath(
        "//div[@id='responsive']//div[@class='search-info']//a/@href")
    column = urlparse.urlparse(_first(responsive, 'search-info链接'))
    param = urlparse.parse_qs(column.query)
    # Named ``music_id`` so the builtin ``id`` is not shadowed at module level.
    music_id = _first(param.get('id'), 'id参数')
    print(music_id)
except IOError:
    # In Python 2, urllib2.URLError/HTTPError and socket.error/socket.timeout
    # all derive from IOError, so this covers network failures only; a bare
    # ``except`` would also hide parsing bugs and KeyboardInterrupt.
    print("网络超时,请稍后再试")
except LookupError as missing:
    # The pages were fetched but an expected element/parameter was absent.
    print("页面解析失败,未找到: %s" % missing)
先上代码(Talk is cheap. Show me the code),见上。
本实例使用urllib2请求网页,lxml解析网页结构,urlparse解析请求参数。
- 爬虫伪装的第一步是提供User-Agent。如果始终使用同一个User-Agent,会存在被封IP的风险,所以这里每次运行时从列表中随机选取一个User-Agent。当然也可以引入第三方包来实现,比如fake_useragent。
- 使用lxml包查找网页元素时,需要先了解一些xpath语法。