#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
#
# CrawlSinaBycookie.py
# Copyright 2016 bitwater <bitwater@bitwater>
# My Gmail is bitwater1997@gmail.com
'''
Development environment:
    Python 2.7 on Ubuntu Server (Linux), edited in Eclipse -- Python 2 was
    chosen because Chinese-language resources for it are more complete.
Libraries:
    lxml for page parsing, reportedly faster than bs4.
    A fully simulated login is too complex for now, so this version is
    cookie based; I will work up to the RSA login later.
    requests for fetching URLs            # pip install requests
    codecs for file output, because default file handling has too many
    encoding problems (codecs is in the standard library, nothing to install)
Target site:
    The mobile version of Weibo.  The cookie comes from
    https://passport.weibo.cn/signin/login :
    open the site in Chrome -> right-click, Inspect -> Network -> reload the
    page -> watch the captured requests -> click log in -> find m.weibo.cn
    -> open it -> under Request Headers, everything after "Cookie:" is the
    cookie.  (A hedged helper for sanity-checking the cookie is sketched
    right after the imports below.)
    The user ID comes from the profile URL, e.g. http://m.weibo.cn/u/2518300370
Output:
    Written to 微博爬出输出信息.txt ("Weibo crawler output") in the current
    directory.
Notes:
    No multithreading is used, so the crawl is slow (a thread-pool sketch is
    appended at the end of the file).
    To keep things fast, nothing is printed while it runs.
    Adapted from http://www.tuicool.com/articles/ja2ayqi with some steps
    optimized; in my tests the original code did not fetch every post.
'''
import requests
from lxml import etree
import codecs
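
# ---------------------------------------------------------------------------
# A minimal sketch (not part of the original script) for verifying that the
# cookie configured below is still valid before running a full crawl.
# Assumption: with an expired cookie weibo.cn serves the login page instead
# of the profile, so the page title contains u'登录' ("log in"); Weibo does
# not document this behaviour, so treat the check as a heuristic.
# ---------------------------------------------------------------------------
def check_cookie(cookie, user_id):
    url = 'http://weibo.cn/%d?filter=1&page=1' % user_id
    html = requests.get(url, cookies=cookie).content
    title = etree.HTML(html).xpath("//head/title/text()")
    return bool(title) and u'登录' not in title[0]
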
class weibo:
    # Replace the value below with your own cookie (see the docstring above).
    cookie = {"Cookie": "_T_WM=10e482205eab95a2636214f62e66b6e7; "
                        "SUB=_2A251T7kXDeRxGeNG6VYV8ivLyziIHXVWs8dfrDV6PUJbkdBeLUXZkW1Nyy70dahwL14LSBt_hnzScyc5bw..; "
                        "SUHB=0U1Js2WagBcf52; "
                        "SCF=AhXb86-PSqrCi8mVQRzXm32YePzMWe4GFaKqUxe9Gj4FEmj2HggLPQ4518hS13OH_zSo590ni7K2yIxZPWngMRA.; "
                        "SSOLoginState=1481361735"}
    def __init__(self, user_id):
        self.user_id = user_id
        self.user_name = ''
        self.information = []
        self.weiboNum2 = 0   # number of posts fetched (kept from the original; unused below)
        self.Cntweibo = 0    # running count of posts stored in all_weibos
        self.all_weibos = {} # post index -> post text
        self.allpages = 0    # total number of profile pages
    def start(self):
        self.getUserName()
        self.getAllweibo()
        self.writeInfo()
    def getUserName(self):
        url = 'http://weibo.cn/%d?filter=1&page=1' % self.user_id
        html = requests.get(url, cookies=weibo.cookie).content
        selector = etree.HTML(html)
        # Name: the page title reads "<name>的微博" ("<name>'s Weibo"),
        # so drop the last three characters.
        userName = selector.xpath("//head/title")
        self.user_name = userName[0].text[:-3]
        # Basic info: the div with class 'tip2' carries the profile counts;
        # string(.) flattens it to one string, split() tokenizes it.
        str_wb = selector.xpath("//div[@class='tip2']")[0].xpath("string(.)")
        self.information = str_wb.split()
        # Page count, read from the pager's hidden input named 'mp'.
        # Assumption: single-page profiles may omit that input, hence the guard.
        tmp = selector.xpath("//input[@name='mp']/@value")
        self.allpages = int(tmp[0]) if tmp else 1
    def getAllweibo(self):
        for page in range(1, self.allpages + 1):
            url = 'http://weibo.cn/%d?filter=1&page=%d' % (self.user_id, page)
            html = requests.get(url, cookies=weibo.cookie).content
            selector = etree.HTML(html)
            str_wb = selector.xpath("//span[@class='ctt']")
            # The first three 'ctt' spans on each page belong to the profile
            # header rather than to posts, so start at index 3.
            for i in range(3, len(str_wb)):
                self.Cntweibo += 1
                self.all_weibos[self.Cntweibo] = str_wb[i].xpath("string(.)") + '\n'
    def writeInfo(self):
        fw = codecs.open("微博爬出输出信息.txt", "w", "utf-8")
        fw.write(self.user_name + '\n')
        fw.write(self.information[0] + '\n')  # post count
        fw.write(self.information[1] + '\n')  # following count
        fw.write(self.information[2] + '\n')  # follower count
        fw.write(u"所有的原创微博\n")  # "all original posts"
        for i in range(1, self.Cntweibo + 1):
            fw.write(u' 第%d条微博 : \n' % i + self.all_weibos[i])  # "post no. %d"
        fw.close()
if __name__ == '__main__':
    # raw_input instead of input: in Python 2, input() eval()s what is typed.
    user_id = int(raw_input("input the user id: "))
    wb = weibo(user_id)
    wb.start()
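
# ---------------------------------------------------------------------------
# The docstring notes the crawl is slow because it is single-threaded.  Below
# is a minimal, commented-out sketch of how the page loop in getAllweibo
# could be spread over a small thread pool.  fetch_page is a hypothetical
# helper, the pool size of 4 is an arbitrary guess, and it is an untested
# assumption that weibo.cn tolerates a few concurrent requests per cookie.
# Pool.map preserves page order, so posts can still be numbered sequentially.
# ---------------------------------------------------------------------------
# from multiprocessing.dummy import Pool   # thread-backed Pool
#
# def fetch_page(args):
#     user_id, page = args
#     url = 'http://weibo.cn/%d?filter=1&page=%d' % (user_id, page)
#     html = requests.get(url, cookies=weibo.cookie).content
#     spans = etree.HTML(html).xpath("//span[@class='ctt']")
#     return [s.xpath("string(.)") for s in spans[3:]]
#
# pages = Pool(4).map(fetch_page, [(wb.user_id, p)
#                                  for p in range(1, wb.allpages + 1)])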