Crawling Baidu Tieba data with Python 3 and Python 2 and saving it to local files. This is simply a record of how the Python 2 urllib2 module differs in usage from its Python 3 counterpart.
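For quick reference, the mapping between the two standard-library APIs used below is small; the request logic itself is unchanged. A minimal side-by-side summary (everything here comes from the two listings that follow):

# Python 2                          ->  Python 3
# import urllib2                    ->  import urllib.request
# import urllib                     ->  import urllib.parse
#
# urllib2.Request(url, headers=h)   ->  urllib.request.Request(url, headers=h)
# urllib2.urlopen(request)          ->  urllib.request.urlopen(request)
# urllib.urlencode(query_dict)      ->  urllib.parse.urlencode(query_dict)
# raw_input("prompt")               ->  input("prompt")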
Python 2 version
#coding:utf-8
# import urllib.request   # Python 3 equivalent
import urllib2
# import urllib.parse     # Python 3 equivalent
import urllib


class BaiduSpider(object):
    def __init__(self):
        self.base_url = "http://tieba.baidu.com/f?"
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
        self.tieba_name = raw_input("Enter the tieba (forum) to crawl: ")
        self.start_page = int(raw_input("Enter the start page: "))
        self.end_page = int(raw_input("Enter the end page: "))

    def send_request(self, url):
        """Take a URL, send the request, and return the response."""
        print("[INFO]: sending request: {}...".format(url))
        # Build and send the request
        request = urllib2.Request(url, headers=self.headers)
        response = urllib2.urlopen(request)
        # Return the response object
        return response

    # def parse_response(self):
    #     pass

    def save_data(self, response, file_name):
        """Save the response body to a file on disk."""
        print("[INFO]: saving data to {}".format(file_name))
        with open(file_name, "w") as f:
            f.write(response.read())

    def main(self):
        for page in range(self.start_page, self.end_page + 1):
            # Convert the page number to the pn offset (50 posts per page)
            pn = (page - 1) * 50
            # Build the query parameter dict (tieba name, page offset)
            query_dict = {"pn": pn, "kw": self.tieba_name}
            # Build the query string
            query_str = urllib.urlencode(query_dict)
            # Join base_url and the query string into the full URL
            full_url = self.base_url + query_str
            # print(full_url)
            # Send the request and get the response
            response = self.send_request(full_url)
            # Build the file name for this response; str() is needed because
            # Python is strongly typed and won't implicitly coerce int to str
            file_name = self.tieba_name + str(page) + ".html"
            # Save the data
            self.save_data(response, file_name)


if __name__ == '__main__':
    spider = BaiduSpider()
    spider.main()
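To see what main() actually sends, here is a quick sanity check of the URL construction (Python 3 syntax shown; the forum name "lol" and page 2 are example values I chose, not from the original post):

# Demonstrate the pn offset math and query-string construction
from urllib.parse import urlencode

base_url = "http://tieba.baidu.com/f?"
page = 2
pn = (page - 1) * 50  # page 2 -> pn=50, since each page holds 50 posts
query_str = urlencode({"pn": pn, "kw": "lol"})
print(base_url + query_str)  # e.g. http://tieba.baidu.com/f?pn=50&kw=lol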
Python 3 version
import urllib.request
import urllib.parse


class BaiduSpider(object):
    def __init__(self):
        self.base_url = "http://tieba.baidu.com/f?"
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
        self.tieba_name = input("Enter the tieba (forum) to crawl: ")
        self.start_page = int(input("Enter the start page: "))
        self.end_page = int(input("Enter the end page: "))

    def send_request(self, url):
        """Take a URL, send the request, and return the response."""
        print("[INFO]: sending request: {}...".format(url))
        # Build and send the request
        request = urllib.request.Request(url, headers=self.headers)
        response = urllib.request.urlopen(request)
        # Return the response object
        return response

    # def parse_response(self):
    #     pass

    def save_data(self, response, file_name):
        """Save the response body to a file on disk."""
        print("[INFO]: saving data to {}".format(file_name))
        # response.read() returns bytes in Python 3, so open in binary mode
        with open(file_name, "wb") as f:
            f.write(response.read())

    def main(self):
        for page in range(self.start_page, self.end_page + 1):
            # Convert the page number to the pn offset (50 posts per page)
            pn = (page - 1) * 50
            # Build the query parameter dict (tieba name, page offset)
            query_dict = {"pn": pn, "kw": self.tieba_name}
            # Build the query string
            query_str = urllib.parse.urlencode(query_dict)
            # Join base_url and the query string into the full URL
            full_url = self.base_url + query_str
            # print(full_url)
            # Send the request and get the response
            response = self.send_request(full_url)
            # Build the file name for this response; str() is needed because
            # Python won't implicitly coerce int to str
            file_name = self.tieba_name + str(page) + ".html"
            # Save the data
            self.save_data(response, file_name)


if __name__ == '__main__':
    spider = BaiduSpider()
    spider.main()
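If you want a single script that runs under both interpreters, a common pattern is to alias the modules at import time. This is only a sketch of that idea, not part of the original code:

# Compatibility shim: bind the py2/py3 modules to common names at import time
try:
    # Python 3
    from urllib.request import Request, urlopen
    from urllib.parse import urlencode
    read_input = input
except ImportError:
    # Python 2 (urllib.request does not exist, so we land here)
    from urllib2 import Request, urlopen
    from urllib import urlencode
    read_input = raw_input

# The spider body can then call Request, urlopen, urlencode, and read_input
# unchanged on either interpreter.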
If you are a beginner, run both versions and compare the differences. Next time I will post the code for scraping tieba images.