爬取微博文章内容

最新推荐文章于 2024-11-21 09:43:57 发布

原创最新推荐文章于 2024-11-21 09:43:57 发布 · 950 阅读

9 ·

CC 4.0 BY-SA版权

文章标签：

#爬虫 #信息可视化 #python #flask

爬虫专栏收录该内容

2 篇文章

订阅专栏

部署运行你感兴趣的模型镜像

项目场景：

爬取微博文章内容：
导入一些模块：
import time
import requests
import csv
import os
from datetime import datetime

问题描述

每篇文章需要爬取以下14个字段：

id
likeNum
commentsLen
reports_count
region
content
contentLen
created_at
type
detailUrl
authorAvatar
authorName
authorDetail
isVip

保存到csv文件：init() 在文件不存在时创建 articleData.csv 并写入表头，writerRow() 每次向文件追加一行数据：

def init():
    """Create ``./articleData.csv`` holding only the header row.

    Nothing happens when the file already exists, so rows written by an
    earlier run are left untouched.
    """
    csv_path = './articleData.csv'
    if os.path.exists(csv_path):
        return

    # Column order must match the row order produced by the crawler.
    header = [
        'id', 'likeNum', 'commentsLen', 'reports_count', 'region',
        'content', 'contentLen', 'created_at', 'type', 'detailUrl',
        'authorAvatar', 'authorName', 'authorDetail', 'isVip',
    ]
    with open(csv_path, 'w', encoding='utf-8', newline='') as out:
        csv.writer(out).writerow(header)


def writerRow(row):
    """Append ``row`` as a single CSV record to ``./articleData.csv``."""
    out = open('./articleData.csv', mode='a', encoding='utf-8', newline='')
    with out:
        csv.writer(out).writerow(row)

完整代码：

import time
import requests
import csv
import os
from datetime import datetime


def init():
    """Ensure ``./articleData.csv`` exists, writing the header row if not.

    An existing file is never rewritten, so repeated runs keep appending to
    the same data set.
    """
    target = './articleData.csv'
    # One name per CSV column, in the order the crawler writes its rows.
    columns = (
        'id',
        'likeNum',
        'commentsLen',
        'reports_count',
        'region',
        'content',
        'contentLen',
        'created_at',
        'type',
        'detailUrl',
        'authorAvatar',
        'authorName',
        'authorDetail',
        'isVip',
    )
    if not os.path.exists(target):
        with open(target, 'w', encoding='utf-8', newline='') as handle:
            csv.writer(handle).writerow(columns)


def writerRow(row):
    """Add one record to the end of ``./articleData.csv``.

    ``row`` is any sequence accepted by ``csv.writer.writerow``.
    """
    destination = './articleData.csv'
    with open(destination, 'a', encoding='utf-8', newline='') as handle:
        record_writer = csv.writer(handle)
        record_writer.writerow(row)


def get_data(url, params, timeout=10):
    """Request ``url`` and return the ``statuses`` entry of its JSON body.

    Args:
        url: Endpoint to request.
        params: Query-string parameters forwarded to ``requests.get``.
        timeout: Seconds to wait for the server; without a timeout a stalled
            connection would block the crawler indefinitely.

    Returns:
        The value stored under ``'statuses'`` in the decoded JSON, or ``None``
        when the request fails, the status code is not 200, the body is not
        a JSON object, or the key is absent.
    """
    # NOTE(review): hard-coded login cookie; presumably tied to one account
    # session -- replace it with your own and keep it out of version control.
    headers = {
        'Cookie': 'XSRF-TOKEN=zASpYIx0oUosfBlB0MsTSRdi; SSOLoginState=1704083302; SUB=_2A25Ilk82DeThGeBI71US9yzKzzuIHXVr6s7-rDV8PUJbkNB-LWXlkW1NRpId-Znw75c-wagHUOjJucjoob6tHv3U; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWZM5jTZLaMAANadOdO6n405NHD95QcSoBNe0MESoBNWs4DqcjPi--Xi-i2iK.4i--NiK.XiKLsS0e4eo-t; WBPSESS=Ii9Wh36g6mj5Z4ggI26vDWjCIui3_Ugbw4SWQGD-3thTaFTWO4WfBvG6bThO4kGKymgzVpGAtZV7ECafvFIdUVzuArqnCejbOvzVVpt49LX2IF7cmIN2gYRZz9Z8CMGcwbkBpKHIXseyKeK-4ee9gw==',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
    except requests.RequestException:
        # Connection errors and timeouts follow the same "None on failure"
        # contract as a non-200 status.
        return None
    if response.status_code != 200:
        return None
    try:
        payload = response.json()
    except ValueError:
        # Body was not valid JSON (for example an HTML page).
        return None
    if not isinstance(payload, dict):
        return None
    # .get() avoids a KeyError when a 200 response carries no 'statuses'.
    return payload.get('statuses')


def getAllTypeList():
    """Read ``./navData.csv`` and return every row after the header.

    Returns:
        A list of rows, each a list of strings as produced by ``csv.reader``.
        The list is empty when the file is empty or contains only a header.
    """
    # newline='' lets the csv module handle line endings and quoted newlines.
    with open('./navData.csv', 'r', encoding='utf-8', newline='') as csvFile:
        readerCsv = csv.reader(csvFile)
        # Skip the header through the csv reader (not the raw file) so a
        # quoted multi-line header is consumed as one record; the default
        # keeps an empty file from raising StopIteration.
        next(readerCsv, None)
        return list(readerCsv)


def parse_json(response, type):
    """Write one CSV row per article in ``response`` via ``writerRow``.

    Args:
        response: Iterable of article dicts, or ``None``/empty when there is
            nothing to write (``get_data`` returns ``None`` on failure).
        type: Category label stored unchanged in the ``type`` column.

    Raises:
        KeyError: If an article lacks one of the required keys (every key
            read below other than ``region_name`` and ``mblogid``).
    """
    # A failed fetch yields None; iterating it would raise TypeError.
    if not response:
        return
    for article in response:
        article_id = article['id']
        likeNum = article['attitudes_count']
        commentsLen = article['comments_count']
        reports_count = article['reposts_count']
        try:
            region = article['region_name'].replace('发布于', '')
        except (KeyError, AttributeError):
            # Key missing, or its value is not a string (e.g. None).
            region = '无'
        content = article['text_raw']
        contentLen = article['textLength']
        created_at = datetime.strptime(
            article['created_at'], '%a %b %d %H:%M:%S %z %Y'
        ).strftime('%Y-%m-%d')
        try:
            # NOTE(review): the path uses the article id as its first
            # segment -- confirm this is the URL shape Weibo expects.
            detailUrl = 'https://www.weibo.com/' + str(article_id) + '/' + str(article['mblogid'])
        except KeyError:
            detailUrl = '无'
        author = article['user']
        authorAvatar = author['avatar_large']
        authorName = author['screen_name']
        authorDetail = 'https://www.weibo.com/u/' + str(author['id'])
        isVip = author['v_plus']
        writerRow([
            article_id,
            likeNum,
            commentsLen,
            reports_count,
            region,
            content,
            contentLen,
            created_at,
            type,
            detailUrl,
            authorAvatar,
            authorName,
            authorDetail,
            isVip,
        ])


def start(typeNum=3, pageNum=2):
    """Crawl the hot-timeline feed and append the articles to the CSV file.

    Args:
        typeNum: Number of categories (rows of ``navData.csv``) to crawl.
        pageNum: Number of pages requested per category.

    Each category row is read as ``[label, group_id, containerid]``. The
    crawler sleeps one second before each category and before each request.
    """
    articleUrl = 'https://weibo.com/ajax/feed/hottimeline'
    init()
    # Slicing limits the crawl to exactly typeNum categories; the former
    # `count > typeNum` check let one extra category through. max() keeps a
    # negative typeNum from turning into a from-the-end slice.
    for nav in getAllTypeList()[:max(typeNum, 0)]:
        time.sleep(1)
        for page in range(pageNum):
            print('正在爬取的类型：%s中的第%s页的文章数据' % (nav[0], page + 1))
            time.sleep(1)
            params = {
                'group_id': nav[1],
                'containerid': nav[2],
                'max_id': page,
                'count': 10,
                'extparam': 'discover|new_feed'
            }
            response = get_data(articleUrl, params)
            if response is None:
                # get_data signals failure with None; skip this page rather
                # than handing None to the parser.
                print('请求失败，已跳过：%s 第%s页' % (nav[0], page + 1))
                continue
            parse_json(response, nav[0])


# Run the crawler with its default arguments when executed as a script.
if __name__ == '__main__':
    start()