# -*- coding: utf-8 -*-
"""实现定量爬取搜狐网站新闻
Author: HIKARI
Version: V 0.2
"""
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
import time
from pyquery import PyQuery as pq
import pymongo
from bs4 import BeautifulSoup
import pandas as pd
# List of news article URLs
url_list = []
# Total number of news URLs
num = 0
# Number of news titles
name_n = 0
MONGO_URL = 'localhost'
# List of news titles
title_list = []
# List of read counts
reading_list = []
# Characters left over after splitting the read-count strings
list_cut = []
list_num = []
# Mapping of title to read count
rank = {}
"打开搜狐新闻网站后,获取所有板块下的url,在新窗口中爬取其中的标题和正文信息以及阅读量,并实现存储"
# Store an article document into MongoDB
def save_mongo(article):
    MONGO_DB = 'souhu_news'
    MONGO_COLLECTION = 'news'
    client = pymongo.MongoClient(MONGO_URL)
    db = client[MONGO_DB]
    try:
        # insert_one returns an InsertOneResult, which is truthy on success
        if db[MONGO_COLLECTION].insert_one(article):
            print("Saved to MongoDB")
    except Exception:
        print("Failed to save to MongoDB")
# Print news titles ranked by read count
def reading_rank(max):
    global rank
    global reading_list
    global title_list
    global list_cut
    # The loop body was truncated in the source; the completion below is a guess at the
    # intended logic: pair each title with its read count, then print the top `max`
    # titles in descending order of reads.
    for i in range(len(title_list)):
        rank[title_list[i]] = reading_list[i]
    for title, reading in sorted(rank.items(), key=lambda item: item[1], reverse=True)[:max]:
        print(title, reading)
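
# Driver sketch for the flow described at the top of this file: open the Sohu news
# site, collect article links, crawl each article's title, body, and read count,
# store each one with save_mongo, and finally print a ranking with reading_rank.
# This is a minimal sketch, not the original author's main routine: the start URL is
# real, but the CSS selectors ('.news a', 'h1', 'article.article') and the read-count
# handling are assumptions and would need to match Sohu's actual markup.
def crawl_sohu_sketch(limit=10):
    driver = webdriver.Chrome()
    try:
        driver.get('https://news.sohu.com/')
        doc = pq(driver.page_source)
        # Collect candidate article URLs (the selector is an assumption)
        for a in doc('.news a').items():
            href = a.attr('href')
            if href and href not in url_list:
                url_list.append(href)
        for url in url_list[:limit]:
            driver.get(url)
            article_doc = pq(driver.page_source)
            title = article_doc('h1').text()
            content = article_doc('article.article').text()
            # Read counts are often rendered by JavaScript; 0 is only a placeholder here
            reading = 0
            title_list.append(title)
            reading_list.append(reading)
            save_mongo({'title': title, 'content': content, 'reading': reading})
        reading_rank(limit)
    finally:
        driver.quit()

# Example: crawl_sohu_sketch(20) would crawl roughly 20 articles and print the ranking.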