#coding=utf-8
import urllib2,urllib
import simplejson
# Query the Google AJAX web-search endpoint for `seachstr` and print the URL
# of every hit, one page at a time (6 pages, 8 results requested per page).
seachstr = 'hello'
for x in range(6):
    # Single-argument print(...) behaves the same as the Python 2 print
    # statement, so the output is unchanged.
    print("page:%s" % (x + 1))
    # The value sent as `start` advances by 8 per iteration, matching the
    # rsz=8 results requested per page.
    page = x * 8
    url = ('https://ajax.googleapis.com/ajax/services/search/web'
           '?v=1.0&q=%s&rsz=8&start=%s') % (urllib.quote(seachstr), page)
    try:
        # class urllib2.Request(url[, data][, headers][, origin_req_host][, unverifiable])
        # Here data is None and the headers dict carries only a Referer.
        request = urllib2.Request(
            url, None, {'Referer': 'http://www.sina.com'})
        # urlopen returns a file-like object.
        response = urllib2.urlopen(request)
        try:
            # Process the JSON string.
            results = simplejson.load(response)
        finally:
            # Always release the connection, even if parsing fails.
            response.close()
        response_data = results['responseData']
        if response_data is None:
            # Without a result container, report the status fields of the
            # payload instead of letting a bare TypeError be printed.
            raise ValueError(
                'search failed: responseStatus=%s responseDetails=%s'
                % (results.get('responseStatus'),
                   results.get('responseDetails')))
        infoaaa = response_data['results']
    except Exception as e:
        # Best-effort script: report the failure and move on to the next page.
        print(e)
    else:
        for minfo in infoaaa:
            print(minfo['url'])
原理详解:
1.google api
https://ajax.googleapis.com/ajax/services/search/web?v=1.0&q=hello&rsz=2&start=1
q:搜索内容
rsz:每页显示条数
start:起始结果的偏移量(不是页码;上面的代码中第 x 次循环传入 x*8,即每页 8 条)
2. request = urllib2.Request(url, None, {'Referer':'http://www.sina.com'})
print request:
<urllib2.Request instance at 0x02527670>
3. response = urllib2.urlopen(request)
print response:
<addinfourl at 38972272 whose fp = <socket._fileobject object at 0x025249F0>> 此为类文件对象(可像文件句柄一样读取)
4. results = simplejson.load(response)
得到json的数据格式,利用json在线校验器可以得到清晰的样式。
print results:
{
u'responseData':{
u'cursor': {
...省略...
},
u'results':[
{
u'GsearchResultClass':u'GwebSearch',
u'visibleUrl':u'www.hellodesign.com',
u'titleNoFormatting': u'Hello',
u'title':u'<b>Hello</b>',
u'url': u'http://www.hellodesign.com/',
u'cacheUrl': u'http://www.google.com/search?q=cache:2q-UgYT2lOsJ:www.hellodesign.com',
u'unescapedUrl': u'http://www.hellodesign.com/',
u'content': u'<b>Hello</b> is an interactive design agency. We create intelligent, living systems for people to experience. We believe in building systems that are useful, usable, and <b>...</b>'
},
{
u'GsearchResultClass':u'GwebSearch',
u'visibleUrl':u'www.youtube.com',
u'titleNoFormatting':u'Hello by Lionel Richie - YouTube',
u'title':u'<b>Hello</b> by Lionel Richie - YouTube',
u'url': u'http://www.youtube.com/watch%3Fv%3Db_ILDFp5DGA',
u'cacheUrl': u'http://www.google.com/search?q=cache:yrhDbK-_ZcEJ:www.youtube.com',
u'unescapedUrl': u'http://www.youtube.com/watch?v=b_ILDFp5DGA',
u'content': u'Jul 8, 2009 <b>...</b> The music video for Lionel Richie\'s "<b>Hello</b>" directed by Bob Giraldi, attracts attention as it tells the story of a music teacher (played by Lionel <b>...</b>'
}
]
},
u'responseDetails': None,
u'responseStatus': 200
}
5. infoaaa = results['responseData']['results']
从 results 字典的 'responseData' 中取出 'results' 列表(即搜索结果条目)
6. for minfo in infoaaa:
print minfo['url']
最后从infoaaa中取出url链接
http://www.hellodesign.com/
http://www.youtube.com/watch%3Fv%3Db_ILDFp5DGA