import requests
import os
import re
import csv
import time
import json
# headers makes the request look like a normal browser visit; without it, Weibo throttles/blocks the crawler
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
Cookies = {"Cookie": "_T_WM=ac858e32012c2c4bfcf46782f5928d99; WEIBOCN_FROM=1110006030; ALF=1525487789; SCF=AktDkPHfGtZ_G6P28yFN5QufvOsFbI5pFfURfdnppHMyiRVumWsnFuuqlxsaRkfm-IyfBlTdHqvtLmDZj1Bu2SI.; SUB=_2A253wfyTDeRhGeVO41YZ8ijOwjyIHXVVTYTbrDV6PUJbktANLUTmkW1NTSfFYR33sk1GxQdr6aOyC5D9YpwqQYUy; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFEZ5vWGfwYo6aYTg5NIEeO5JpX5K-hUgL.Foe71hBReoqE1K52dJLoIpeLxKqL1-BLBKnLxKqL1KnL128j; SUHB=04WeilAeo6tedn; SSOLoginState=1522896067; M_WEIBOCN_PARAMS=from%3Dfeed%26oid%3D4225101094628701%26luicode%3D10000011%26lfid%3D1076033084826290%26fid%3D1005053084826290%26uicode%3D10000011"}
# If requests start failing for no obvious reason, try refreshing these Cookies
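# A hedged alternative (sketch): keep the cookie out of the source by reading it from
# an environment variable. WEIBO_COOKIE is a hypothetical name; when it is unset, the
# hard-coded Cookies above remain in effect.
_env_cookie = os.environ.get('WEIBO_COOKIE')
if _env_cookie:
    Cookies = {"Cookie": _env_cookie}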
# Fetch user info; this also yields key parameters such as uid, fid and oid
def get_user_info(usr_id):
    url = 'https://m.weibo.cn/api/container/getIndex?type=uid&value={usr_id}'.format(usr_id=usr_id)
    resp = requests.get(url, headers=headers, cookies=Cookies)
    jsondata = resp.json()
    #print(jsondata)
    userinfo = jsondata.get('data').get('userInfo')
    nickname = userinfo.get('screen_name')
    mblog_num = userinfo.get('statuses_count')
    verified = userinfo.get('verified')
    verified_reason = userinfo.get('verified_reason')
    gender = userinfo.get('gender')
    urank = userinfo.get('urank')  # user level
    mbrank = userinfo.get('mbrank')  # membership level
    followers_count = userinfo.get('followers_count')
    follow_count = userinfo.get('follow_count')
    try:
        toolbar_menus = userinfo.get('toolbar_menus')
        uid = toolbar_menus[0].get('params').get('uid')
        fid = toolbar_menus[1].get('actionlog').get('fid')
        # if oid does not point at the hot-blog list, try swapping index 0 and 1 here
        oid = toolbar_menus[2].get('params').get('menu_list')[0].get('actionlog').get('oid')
        cardid = toolbar_menus[1].get('actionlog').get('cardid')
    except (TypeError, IndexError, AttributeError):
        uid = fid = oid = cardid = ''
    containerid = jsondata.get('data').get('tabsInfo').get('tabs')[0].get('containerid')
Info = {'nickname':nickname,'mblog_num':mblog_num,
'verified':verified,'verified_reason':verified_reason,
'gender':gender,'urank':urank,'mbrank':mbrank,'followers_count':followers_count,
'follow_count':follow_count,'uid':uid,'fid':fid,
'cardid':cardid,'containerid':containerid,'oid':oid
}
print(Info)
return Info
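
# Minimal helper sketch (hypothetical, not used above): the nested .get() chains in
# get_user_info can be collapsed into a single lookup that tolerates missing fields.
def safe_get(obj, *keys, default=''):
    # Walk a chain of dict keys / list indices, returning `default` on any miss.
    for k in keys:
        try:
            obj = obj[k]
        except (KeyError, IndexError, TypeError):
            return default
    return obj
# e.g. nickname = safe_get(jsondata, 'data', 'userInfo', 'screen_name')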
# Fetch all of a blogger's hot weibo posts (text, creation time, post id, repost/comment/attitude counts, ...)
def mblog_list(uid,oid):
ids=[]
base_url = 'https://m.weibo.cn/api/container/getIndex?containerid={oid}'
#base_url='https://m.weibo.cn/api/container/getIndex?containerid={oid}&luicode=10000011&lfid=1005051282005885&featurecode=20000320'
#base_url= 'https://m.weibo.cn/api/container/getIndex?containerid={uid}'
page_url = 'https://m.weibo.cn/api/container/getIndex?containerid={oid}&type=uid&value={uid}&page={page}'
#page_url ='https://m.weibo.cn/api/container/getIndex?containerid={uid}&page={page}'
    url = base_url.format(oid=oid)
    print(url)
    resp = requests.get(url, headers=headers, cookies=Cookies)  # the API returns UTF-8 JSON; no encoding override needed
    response = resp.json()
    #print(response)
    # total number of hot posts
    total = response['data']['cardlistInfo']['total']
    print(total)
    path = os.getcwd()+'/{dirname}/'.format(dirname='博主微博热门信息汇总')
    os.makedirs(path, exist_ok=True)  # os.mkdir would raise if the folder already exists
    path2 = os.getcwd() + '/%s/%s.csv' % ('博主微博热门信息汇总', uid)
    csvfile = open(path2, 'a+', encoding='gb18030', newline='')
    writer = csv.writer(csvfile)
    writer.writerow(('id','reposts_count','comments_count','attitudes_count','date','text'))
    # number of result pages (ten posts per page)
    page_num = int(int(total)/10)+1
    for i in range(1, page_num+1):
        # if i == 2: break  # uncomment to stop after one page when testing
        p_url = page_url.format(oid=oid, uid=uid, page=i)
        #print(p_url)
        page_resp = requests.get(p_url, headers=headers, cookies=Cookies)
        page_data = page_resp.json()
        # filename = '22.json'
        # with open(filename, 'w') as f:
        #     json.dump(page_data, f)  # dump one raw page for offline inspection
        try:
            cards = page_data['data']['cards']
            for card in cards:
                try:
                    mblog = card['mblog']
                    date = mblog['created_at']
                    mblog_id = mblog['id']
                    ids.append(mblog_id)
                    dirty_text = mblog['text']  # raw text is full of embedded link markup
                    cleaned1 = re.sub(r'<span .*?</span>', '', dirty_text)
                    text = re.sub(r'<a .*?</a>', '', cleaned1)
                    reposts_count = mblog['reposts_count']
                    comments_count = mblog['comments_count']
                    attitudes_count = mblog['attitudes_count']
                    writer.writerow((mblog_id, reposts_count, comments_count, attitudes_count, date, text))
                    print('%d pages in total, now on page %d %s' % (page_num, i, mblog_id))
                except KeyError:
                    continue
        except (KeyError, TypeError):
            continue
time.sleep(1)
    csvfile.close()
    return ids
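
# Hedged sketch: a retrying GET (hypothetical helper, not wired into the functions
# above). The m.weibo.cn endpoints rate-limit aggressively, so backing off on non-200
# responses before giving up can help on long crawls.
def get_with_retry(url, retries=3, backoff=2.0):
    resp = None
    for attempt in range(retries):
        resp = requests.get(url, headers=headers, cookies=Cookies)
        if resp.status_code == 200:
            break
        time.sleep(backoff * (attempt + 1))  # linear backoff between attempts
    return resp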
# Fetch the users who liked a given weibo; saved to wb_id.csv inside the usr_id folder
def get_userlike_counts(usr_id, wb_id):
    url = 'https://m.weibo.cn/api/attitudes/show?id={id}'.format(id=wb_id)
    page_url = 'https://m.weibo.cn/api/attitudes/show?id={id}&page={page}'
    Resp = requests.get(url, headers=headers, cookies=Cookies)
    #print(url)
    Resp_data = Resp.json()
    try:
        page_max_num = Resp_data['data']['max']
        print(page_max_num)
        path2 = os.getcwd() + '/%s/%s.csv' % (usr_id, wb_id)
        csvfile = open(path2, 'a+', encoding='gb18030', newline='')
        writer = csv.writer(csvfile)
        writer.writerow(('username','user_id','verified','verified_type','profile_url','review_id','image','source','following','follow_me','date'))
        for i in range(1, page_max_num+1):
            # note: crawling reposts, by contrast, is not capped by page count (can exceed 100 pages)
            p_url = page_url.format(id=wb_id, page=i)
            resp = requests.get(p_url, cookies=Cookies, headers=headers)
            print(resp.status_code)  # should be 200
            resp_data = resp.json()
            try:
                data = resp_data.get('data').get('data')  # data is a list of like records
                for d in data:
                    review_id = d['id']
                    user_id = d['user']['id']
                    source = d['source']
                    username = d['user']['screen_name']
                    image = d['user']['profile_image_url']
                    verified = d['user']['verified']
                    verified_type = d['user']['verified_type']
                    profile_url = d['user']['profile_url']
                    following = d['user']['following']
                    follow_me = d['user']['follow_me']
                    date = d['created_at']
                    writer.writerow((username, user_id, verified, verified_type, profile_url,
                                     review_id, image, source, following, follow_me, date))
                    print('%d pages in total, now on page %d %s' % (page_max_num, i, username))
            except (KeyError, TypeError, AttributeError):
                print(resp_data.get('msg'))
                continue
            time.sleep(1)
        csvfile.close()
    except (KeyError, TypeError):
        print(Resp_data.get('msg'))
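
# Hedged sketch: read one saved like-list back for inspection (assumes the gb18030
# encoding and the usr_id/wb_id.csv layout used by get_userlike_counts above).
def read_likes_csv(usr_id, wb_id):
    path = os.getcwd() + '/%s/%s.csv' % (usr_id, wb_id)
    with open(path, encoding='gb18030', newline='') as f:
        return list(csv.DictReader(f))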
def main():
    #user_id = '1655128924'
    #user_id = '2736225585'
    #user_id = '2386831995'
    user_id = '2378564111'
    wb_id = '4225101094628701'
    user_info = get_user_info(user_id)
    uid = user_info.get('uid')
    oid = user_info.get('oid')
    print(uid, oid)
    r = mblog_list(uid, oid)
    print('............')
    # create the user_id folder first; get_userlike_counts writes its CSVs there
    path = os.getcwd()+'/{dirname}/'.format(dirname=user_id)
    os.makedirs(path, exist_ok=True)
    for i in range(len(r)):  # dump the likers of every hot post from this blogger
        print('Hot post number ' + str(i))
        wb_id = r[i]
        get_userlike_counts(user_id, wb_id)

if __name__ == '__main__':
    main()