import requests#网页请求
import bs4#网页解析
import re#正则表达式
import os#cmd命令
import time
def strcmp(str1, str2):
    """Loosely compare two strings by prefix, ignoring spaces.

    The longer string is first cut to the length of the other one, then
    spaces are removed from both sides before comparing. The check is tried
    in both directions, so either string may be the shorter one.

    Args:
        str1: First string to compare.
        str2: Second string (the crawled text, per the original comment).
            A falsy value (``''`` or ``None``) is never compared.

    Returns:
        1 if one string matches the leading part of the other, else 0.

    Note:
        An empty ``str1`` matches any non-empty ``str2``, because the empty
        prefix of ``str2`` equals the empty string.
    """
    # Crawled data is empty: do not compare.
    if not str2:
        return 0

    # str1 starts with str2 (spaces ignored after truncation).
    if str1[:len(str2)].replace(' ', '') == str2.replace(' ', ''):
        return 1
    # str2 starts with str1 (spaces ignored after truncation).
    if str2[:len(str1)].replace(' ', '') == str1.replace(' ', ''):
        return 1
    return 0
def print_article(soup):
    """Print the text of a parsed news page to standard output.

    Two page layouts are tried in order:

    1. A ``div`` with class ``rm_txt_con cf``: every child whose ``.string``
       is not None is printed.
    2. ``p`` elements with style ``text-indent: 2em;``: each paragraph's
       ``.string`` is printed, falling back to the ``.string`` of its first
       ``span`` when the paragraph itself has none.

    Args:
        soup: Parsed document exposing ``find`` and ``find_all``
            (presumably a ``bs4.BeautifulSoup`` object; verify against
            the caller).

    Returns:
        1 if either layout was found, otherwise None.
    """
    # Layout 1: the whole article sits inside one container div.
    container = soup.find('div', class_='rm_txt_con cf')
    if container:
        for node in container:
            if node and node.string is not None:
                print(node.string)
        return 1

    # Layout 2: indented paragraphs. Query once and reuse the result.
    paragraphs = soup.find_all('p', style='text-indent: 2em;')
    if paragraphs:
        for para in paragraphs:
            if para and para.string is not None:
                print(para.string)
                continue
            # The paragraph has no single string of its own; fall back to
            # its span. The original re-tested the paragraph's string here,
            # which made this fallback unreachable.
            span = para.span
            if span and span.string is not None:
                print(span.string)
        return 1

    # Neither layout matched; nothing was printed.
    return None
def save_news(soup,newsname):#收藏新闻(新闻保存到本地)
save = input("是否收藏该新闻?\n收藏请输入0以外任何字符\n不收藏请输入0")
if save == '0':
return 0
path = 'E:/py爬虫/news/' + time.strftime("%Y%m%d") + newsname.replace('《','').replace('》','').replace('"','') + '.txt'
file = open(path,'w',encoding='utf-8')
content = soup.find('div', class_='rm_txt_con cf')
if content:
print('收藏成功')
for each in content:
if each and (each.string != None):
file.write(each.string)
file.close()
return 1
content = soup.find_all('p', style='text-indent: 2em;')
if content:
prin
爬虫爬取人民网新闻
于 2022-01-25 00:42:55 首次发布