import requests
from bs4 import BeautifulSoup as bs4
from urllib.request import urlretrieve
import json
import csv
import re
# Weibo mobile-API URL handed to get_uid() as the list to page through.
# The containerid names a FOLLOWERS container; the digits after the 100505
# prefix are presumably the account whose list is read -- TODO confirm.
url_1='https://m.weibo.cn/api/container/getSecond?containerid=1005053517129745_-_FOLLOWERS'
def cookie():
    """Load request cookies from ``cookie.txt`` in the working directory.

    The file is read as one cookie string: ``name=value`` pairs separated
    by ``;``.  Each pair is split on its first ``=`` only, so a value may
    itself contain ``=``.  Surrounding whitespace of each pair is stripped.

    Returns:
        dict: cookie names mapped to their values (both ``str``).  Empty
        when the file holds no ``name=value`` pair.

    Raises:
        OSError: if ``cookie.txt`` cannot be opened.
    """
    cookies = {}
    with open('cookie.txt', 'r') as f:
        for fragment in f.read().split(';'):
            name, sep, value = fragment.strip().partition('=')
            # A trailing ';', a final newline or a stray token produces a
            # fragment with no '='; skip it instead of failing the unpack.
            if not sep:
                continue
            cookies[name] = value
    return cookies
# HTTP headers sent with every requests.get() call in this file.
headers = {
    # Placeholder value; the two adjacent literals are concatenated into a
    # single User-Agent string.
    'User-Agent': (
        r'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
        r'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
    ),
    'Referer': 'https://m.weibo.cn',
    'Connection': 'keep-alive',
}
# Fetch Weibo users
def get_uid(url_1):
    """Collect the id and screen name of every user listed at *url_1*.

    A first request to *url_1* reads ``data.count``; the pages
    ``1 .. count // 10 + 1`` are then requested by appending
    ``&page=<n>`` to the unmodified *url_1*.  Every card on a page that
    carries a ``user`` entry contributes one result.

    Args:
        url_1: base API URL (without a ``page`` parameter).

    Returns:
        list: ``[uid, screen_name]`` pairs in the order they were returned.

    Raises:
        KeyError: if the first response has no ``data.count``.
        ValueError: if a response body is not valid JSON.
    """
    # Parse the cookie file once rather than re-reading it for every page.
    cookies = cookie()

    first = json.loads(requests.get(url_1, headers=headers, cookies=cookies).text)
    num = first['data']['count']
    # The paging arithmetic assumes 10 users per page; the extra page covers
    # a partially filled last page.
    last_page = num // 10 + 1

    results = []
    for page in range(1, last_page + 1):
        # Build each page URL from the base URL.  Appending to the previous
        # page's URL would pile up parameters ("&page=1&page=2...").
        page_url = url_1 + '&page=' + str(page)
        payload = json.loads(
            requests.get(page_url, headers=headers, cookies=cookies).text
        )
        # A page without data/cards (e.g. past the end of the list) is
        # treated as empty so the users gathered so far are not lost.
        cards = (payload.get('data') or {}).get('cards') or []
        for card in cards:
            try:
                user = card['user']
                results.append([user['id'], user['screen_name']])
            except (KeyError, TypeError):
                # Cards that do not describe a user are skipped.
                continue
    return results
# Text processing
def TextPro(before):
    """Return the part of *before* that precedes its first ``<``.

    get_weibo() applies this to a post's ``text`` field, presumably to cut
    the plain text off where markup begins.

    Args:
        before: the string to truncate.

    Returns:
        str: everything before the first ``<``; the whole string when it
        contains no ``<``; ``''`` when it starts with ``<`` or is empty.
    """
    # partition() scans once; the previous per-character concatenation
    # rebuilt the result string on every step.
    return before.partition('<')[0]
# Print video links and download pictures (reposted weibos are skipped)
def get_weibo(url_3):
    """Process one page of posts fetched from *url_3*.

    For each card in ``data.cards`` of the JSON response:

    * cards without an ``mblog`` entry, and posts that contain
      ``retweeted_status`` (reposts), are ignored;
    * a post with ``obj_ext`` (treated as a video post, per the original
      comment) has its truncated text and the card's ``scheme`` link
      printed;
    * any other post has each entry of ``pics`` downloaded into the working
      directory as ``<pid>.jpg``.

    Args:
        url_3: full API URL of the page to fetch, including ``&page=<n>``.

    Returns:
        None.  Results are printed or written to disk.

    Raises:
        ValueError: if the response body is not valid JSON.
    """
    # Fetch the URL that was passed in.  Reading the module-level url_2 here
    # would return the same first page on every call.
    r = requests.get(url_3, headers=headers, cookies=cookie()).text
    payload = json.loads(r)
    cards = (payload.get('data') or {}).get('cards') or []

    for card in cards:
        mblog = card.get('mblog') if isinstance(card, dict) else None
        if not isinstance(mblog, dict):
            continue
        if 'retweeted_status' in mblog:
            # Repost: ignored.
            continue

        if 'obj_ext' in mblog and 'scheme' in card and 'text' in mblog:
            text = TextPro(mblog['text'])
            link = card['scheme']
            print('-------------------------------------------------------------------\n%s\n\n%s\n\n'%(text,link))
            continue

        for picture in mblog.get('pics') or []:
            try:
                urlretrieve(picture['url'], picture['pid'] + '.jpg')
            except (KeyError, TypeError, ValueError, OSError):
                # Best effort: a malformed entry or a failed download skips
                # that picture only, not the rest of the post.
                continue
# Show every [uid, screen_name] pair that get_uid() returns for url_1 so a
# user id can be picked from the output.
uids = get_uid(url_1)
for uid in uids:
    print(uid)

# The id typed here is inserted three times into the index URL: as `uid`,
# as `value`, and as the tail of `containerid`.
search = input('请输入所要爬取的用户id:')
url_2 = ''.join([
    'https://m.weibo.cn/api/container/getIndex?uid=', search,
    '&luicode=10000012&lfid=1005053517129745_-_FOLLOWERS&featurecode=20000320&type=uid&value=', search,
    '&containerid=107603', search,
])

# Request pages 1 through 9 of that index URL.
for page in range(1, 10):
    url_3 = '%s&page=%d' % (url_2, page)
    get_weibo(url_3)
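# 爬取微博 (Crawl Weibo) -- appears to be the title of the web page this script was copied from.
# 最新推荐文章于 2024-12-25 17:04:45 发布 ("latest recommended article published 2024-12-25 17:04:45") -- web-page footer text, not part of the program.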