# 爬取微博 (Crawl Weibo)

import requests
from bs4 import BeautifulSoup as bs4
from urllib.request import urlretrieve
import json
import csv
import re

# Follower-list endpoint for user 3517129745 on the mobile Weibo API.
url_1 = ('https://m.weibo.cn/api/container/getSecond'
         '?containerid=1005053517129745_-_FOLLOWERS')

def cookie():
    """Load cookies from ``cookie.txt`` into a dict.

    The file is expected to hold a browser cookie string of the form
    ``name1=value1; name2=value2; ...``.

    Returns:
        dict: cookie name -> value. Values may themselves contain '='
        (only the first '=' in each segment splits name from value).

    Raises:
        FileNotFoundError: if cookie.txt does not exist.
        ValueError: if a non-empty segment has no '=' separator.
    """
    cookies = {}
    with open('cookie.txt', 'r', encoding='utf-8') as f:
        for segment in f.read().split(';'):
            segment = segment.strip()
            if not segment:
                # Skip empty segments, e.g. from a trailing ';' or a
                # final newline — the original code crashed on these.
                continue
            name, value = segment.split('=', 1)
            cookies[name] = value
    return cookies

# Shared HTTP headers for every m.weibo.cn request.
# NOTE: the User-Agent value is a redacted placeholder (64 'x' chars);
# replace it with a real browser UA string before running.
headers = {
    'User-Agent': ('xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
                   'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'),
    'Referer': 'https://m.weibo.cn',
    'Connection': 'keep-alive',
}

#抓取微博用户
#抓取微博用户
def get_uid(url_1):
    """Collect [uid, screen_name] pairs for every follower listed at *url_1*.

    The endpoint returns ``data.count`` total followers, paginated
    10 per page; each page's ``data.cards`` entries may or may not
    carry a ``user`` object.

    Args:
        url_1: follower-list API URL (without a ``page`` parameter).

    Returns:
        list[list]: ``[uid, screen_name]`` for each follower found.
    """
    first = json.loads(requests.get(url_1, headers=headers, cookies=cookie()).text)
    num = first['data']['count']
    # 10 followers per page; +2 so range() covers a partial last page.
    max_page = num // 10 + 2
    results = []
    for page in range(1, max_page):
        # BUG FIX: build each page URL from the base URL. The original
        # reassigned url_1 += '&page=N' every iteration, so later requests
        # carried every previous page parameter ('...&page=1&page=2...').
        page_url = url_1 + '&page=' + str(page)
        data = json.loads(requests.get(page_url, headers=headers, cookies=cookie()).text)
        for card in data['data']['cards']:
            try:
                user = card['user']
                results.append([user['id'], user['screen_name']])
            except KeyError:
                # Card without user info (ads, separators) — skip it.
                continue
    return results

#处理文本
def TextPro(before):
    after=''
    for i in before:
        if i=='<':
            break
        else:
            after=after+i
    return after

#打印视频链接,抓取图片(不包含转发的微博)
def get_weibo(url_3):
    r =requests.get(url_2,headers=headers,cookies=cookie()).text
    a=json.loads(r)   
    weibo=a['data']['cards']
    for i in range(13):
        try:
            try:
                retweeted_status=weibo[i]['mblog']['retweeted_status'] #类型
                #print('转发(忽略不计)......')
            except:
                WeiBo=weibo[i]
                try:
                    xx=WeiBo['mblog']['obj_ext']
                    link=WeiBo['scheme']
                    text=TextPro(WeiBo['mblog']['text'])
                    print('-------------------------------------------------------------------\n%s\n\n%s\n\n'%(text,link))
                except:
                    pictures=WeiBo['mblog']['pics']
                    for j in range(9):
                        jpg=pictures[j]['url']
                        name=pictures[j]['pid']
                        string =name+'.jpg'
                        urlretrieve(jpg,string)
        except:
            continue

# --- script driver ---
# List all followers so the user can pick a uid, then crawl the first
# nine pages of that user's feed.
follower_pairs = get_uid(url_1)
for pair in follower_pairs:
    print(pair)

search = input('请输入所要爬取的用户id:')
url_2 = ('https://m.weibo.cn/api/container/getIndex?uid=' + search
         + '&luicode=10000012&lfid=1005053517129745_-_FOLLOWERS'
           '&featurecode=20000320&type=uid&value=' + search
         + '&containerid=107603' + search)
for page in range(1, 10):
    get_weibo(url_2 + '&page=' + str(page))
# NOTE(review): the text that followed here ("评论", red-packet / payment UI
# strings, balance and recharge notices) was CSDN page boilerplate captured
# when this script was scraped from a blog post; it is not part of the
# program and has been removed so the file parses as Python.