Python3爬取新浪微博头条

  有了上一次网易云音乐的实践之后, 这一次轻车熟路, 制作了一个新浪微博头条榜的爬虫, 代码如下:

#!/usr/bin/python 
#-*- coding: utf-8 -*-
import re
import sys
import datetime
from selenium import webdriver
from time import sleep
from selenium.webdriver.common.action_chains import ActionChains

def scrap_sina_toplist():
    """Scrape the top-20 entries of the Sina Weibo hot-search board and
    append them to .\sina\sina_hotlist.txt as CSV lines of the form
    ``name,rank,hotpoint,timestamp``.

    Relies on the module-level ``options`` (a ChromeOptions instance)
    being defined before this function is called.
    """
    # Bind up front so the finally block can never raise NameError when
    # webdriver.Chrome() itself fails (the original referenced unbound names).
    browser = None
    try:
        # Open the page. NOTE: the chrome_options= keyword was removed in
        # Selenium 4; options= is the supported spelling on 3.x and 4.x.
        browser = webdriver.Chrome(options=options)
        browser.get("https://s.weibo.com/top/summary?Refer=top_hot&topnav=1&wvr=topindex")

        # Row tr[1] is a pinned "fastest rising" entry, so the real ranking
        # starts at tr[2].
        xpath_casename_left = "/html/body/div[1]/div[2]/div[2]/table/tbody/tr["
        xpath_casename_right = "]/td[2]/a"
        xpath_hotpoint_left = "/html/body/div[1]/div[2]/div[2]/table/tbody/tr["
        xpath_hotpoint_right = "]/td[2]/span"

        # One timestamp for the whole run. The original called now() five
        # times, which could straddle a minute boundary mid-scrape.
        now_time = datetime.datetime.now().strftime("%Y/%m/%d:%H:%M")

        # toplist accumulates one dict per headline.
        toplist = []
        for i in range(2, 22):  # only the top 20 entries; skip the rest
            each_info = {}

            # Headline name; a missing row means the table ended early.
            # (find_element_by_xpath is deprecated in Selenium 4 — kept for
            # compatibility with the selenium version this file targets.)
            try:
                xpath_casename = xpath_casename_left + str(i) + xpath_casename_right
                name_elem = browser.find_element_by_xpath(xpath_casename)
            except Exception:
                break
            # Strip commas from the headline so it cannot corrupt the
            # comma-separated output line.
            each_info["name"] = re.sub(",", "", name_elem.text)

            # Hot-point score for the same row; grab .text immediately rather
            # than holding a live WebElement until write time.
            try:
                xpath_hotpoint = xpath_hotpoint_left + str(i) + xpath_hotpoint_right
                each_info["hotpoint"] = browser.find_element_by_xpath(xpath_hotpoint).text
            except Exception:
                break

            each_info["index"] = i - 1  # tr[2] is rank 1, tr[3] rank 2, ...
            each_info["time"] = now_time
            toplist.append(each_info)

        # Append the results; the with-statement guarantees the file is
        # closed even if a write raises.
        filename = ".\sina\sina_hotlist.txt"
        with open(filename, "a+", encoding='utf-8') as fileout:
            for entry in toplist:
                fileout.write(
                    entry["name"] + "," + str(entry["index"]) + ","
                    + entry["hotpoint"] + "," + entry["time"] + "\n"
                )
    finally:
        # Close the browser only if it was actually created.
        if browser is not None:
            browser.close()


#-----------------------------------------  Program entry point -------------------------------------------#

# Headless-Chrome configuration, exposed to the scraper through the
# module-level name ``options``.
user_agent = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) " +
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36"
    )
options = webdriver.ChromeOptions()
for browser_arg in (
    '--headless',
    'lang=zh_CN.UTF-8',           # Chinese locale
    'disable-infobars',           # hide the "Chrome is being controlled..." banner
    '--disable-gpu',              # works around a Chromium bug per Google docs
    'user-agent=%s' % user_agent, # spoof a desktop Chrome user agent
):
    options.add_argument(browser_arg)

# Run the scraper:
scrap_sina_toplist()


 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值