First, analyze the structure of a Baidu Tieba URL.
For example:
http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=1200
http://tieba.baidu.com/f? + kw={keyword to search for} + &ie=utf-8 + &pn=1200
kw is the keyword to search for, ie=utf-8 is the character set, and pn is the result offset, which follows the rule pn = (page - 1) * 50, where page is the page number. So pn=1200 corresponds to page 25.
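To verify the rule, here is a minimal sketch (build_url is a hypothetical helper, not part of the original post) that reproduces the example URL above:

# Python 2; in Python 3 use urllib.parse.urlencode instead
import urllib

def build_url(kw, page):
    pn = (page - 1) * 50  # page 25 -> pn = 1200
    return ("http://tieba.baidu.com/f?" + urllib.urlencode({"kw": kw})
            + "&ie=utf-8&pn=" + str(pn))

print build_url("python", 25)
# http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=1200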
The code is as follows:
(This is Python 2 code; for Python 3, replace urllib2 with urllib.request. A ported sketch follows the code below.)
# coding:utf-8
import urllib
import urllib2


class mySpider:
    def __init__(self, url, beginPage, endPage):
        self.url = url
        self.beginPage = beginPage
        self.endPage = endPage

    def tiebaSpider(self, url, beginPage, endPage):
        # Walk the requested page range, computing the pn offset for each page.
        for page in range(beginPage, endPage + 1):
            pn = (page - 1) * 50
            filename = "page_" + str(page) + ".html"
            fullurl = url + "&pn=" + str(pn)
            print fullurl
            self.loadPage(fullurl, filename)

    def loadPage(self, fullurl, filename):
        # Fetch one result page and save the raw HTML to disk.
        print "downloading " + filename
        headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;)"}
        request = urllib2.Request(fullurl, headers=headers)
        response = urllib2.urlopen(request)
        html = response.read()
        print "saving " + filename
        with open(filename, 'w') as f:
            f.write(html)
        print "-" * 20


if __name__ == "__main__":
    kw = raw_input("Keyword to crawl: ")
    startPage = int(raw_input("Start page: "))
    endPage = int(raw_input("End page: "))
    url = "http://tieba.baidu.com/f?"
    key = urllib.urlencode({"kw": kw})
    url = url + key + "&ie=utf-8"
    a = mySpider(url, startPage, endPage)
    a.tiebaSpider(url, startPage, endPage)
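As noted above, porting to Python 3 mainly means swapping urllib2 for urllib.request. Here is a minimal, untested sketch of that port (flattened into functions for brevity; urllib.urlencode also moves to urllib.parse.urlencode, raw_input becomes input, and the response bytes are written in binary mode):

import urllib.request
import urllib.parse

def load_page(fullurl, filename):
    # Fetch one result page and save the raw HTML to disk.
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;)"}
    request = urllib.request.Request(fullurl, headers=headers)
    html = urllib.request.urlopen(request).read()
    with open(filename, 'wb') as f:  # read() returns bytes, so write in binary mode
        f.write(html)

if __name__ == "__main__":
    kw = input("Keyword to crawl: ")
    start_page = int(input("Start page: "))
    end_page = int(input("End page: "))
    base = ("http://tieba.baidu.com/f?" + urllib.parse.urlencode({"kw": kw})
            + "&ie=utf-8")
    for page in range(start_page, end_page + 1):
        pn = (page - 1) * 50
        load_page(base + "&pn=" + str(pn), "page_" + str(page) + ".html")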