No more preamble, straight to the code.
Today's unlucky target is the Chinese Academy of Agricultural Sciences (CAAS) search engine.
# -*- coding: utf-8 -*-
import requests
import pymysql
from bs4 import BeautifulSoup  # for parsing the article pages
import uuid
import time

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/52.0.2743.116 Safari/537.36',
    'Accept-Language': 'zh-CN,zh;q=0.8'
}
data = {"q": "_text_:能源环保",
"wt": "json",
"rows": "10",
"start": "0",
"enableElevation": "true",
"forceElevation": "true",
"fl": "_uuid_,title,content,indexUrl,pubDateStr,pubDate,size,score,[elevated]",
"qf": "title^100",
"fq": "",
"shards": "127.0.0.1:80/search/7_http,127.0.0.1:80/search/8_http,127.0.0.1:80/search/9_http,127.0.0.1:80/search/67_http,127.0.0.1:80/search/10_http,127.0.0.1:80/search/11_http,127.0.0.1:80/search/12_http,127.0.0.1:80/search/13_http,127.0.0.1:80/search/17_http,127.0.0.1:80/search/14_http,127.0.0.1:80/search/15_http,127.0.0.1:80/search/66_http,127.0.0.1:80/search/40_http,127.0.0.1:80/search/63_http,127.0.0.1:80/search/65_http,127.0.0.1:80/search/56_http,127.0.0.1:80/search/47_http,127.0.0.1:80/search/64_http,127.0.0.1:80/search/39_http,127.0.0.1:80/search/45_http,127.0.0.1:80/search/37_http,127.0.0.1:80/search/36_http,127.0.0.1:80/search/25_http,127.0.0.1:80/search/29_http,127.0.0.1:80/search/18_http,127.0.0.1:80/search/38_http,127.0.0.1:80/search/43_http,127.0.0.1:80/search/30_http,127.0.0.1:80/search/19_http,127.0.0.1:80/search/49_http,127.0.0.1:80/search/22_http,127.0.0.1:80/search/48_http,127.0.0.1:80/search/57_http,127.0.0.1:80/search/51_http,127.0.0.1:80/search/46_http,127.0.0.1:80/search/41_http,127.0.0.1:80/search/27_http,127.0.0.1:80/search/55_http,127.0.0.1:80/search/53_http,127.0.0.1:80/search/54_http,127.0.0.1:80/search/52_http,127.0.0.1:80/search/42_http,127.0.0.1:80/search/59_http,127.0.0.1:80/search/44_http,127.0.0.1:80/search/31_http,127.0.0.1:80/search/50_http,127.0.0.1:80/search/26_http,127.0.0.1:80/search/35_http,127.0.0.1:80/search/28_http,127.0.0.1:80/search/32_http,127.0.0.1:80/search/33_http,127.0.0.1:80/search/20_http,127.0.0.1:80/search/69_http,127.0.0.1:80/search/21_http,127.0.0.1:80/search/68_http,127.0.0.1:80/search/23_http,127.0.0.1:80/search/58_http,127.0.0.1:80/search/60_http,127.0.0.1:80/search/34_http,127.0.0.1:80/search/61_http,127.0.0.1:80/search/62_http,127.0.0.1:80/search/24_http,127.0.0.1:80/search/16_http"
}
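# The payload above is a standard Solr query (the endpoint below is the
# site's /elevate handler):
#   q      -- full-text search on the _text_ field for the keyword
#   rows   -- page size: 10 documents per request
#   start  -- paging offset, advanced in the loop below
#   fl     -- which stored fields to return
#   qf     -- field boosting: title matches weighted 100x
#   shards -- the site fans the query out across its internal Solr cores
# To scrape a different topic, only q (and the p_type value written to
# MySQL below) needs to change, e.g.:
#   data['q'] = '_text_:' + '水稻'  # illustrative keyword, not from the post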
conn = pymysql.connect(host='127.0.0.1', user='root', passwd='123456', db='zhang', charset='utf8')
cur = conn.cursor()
print("Connected to MySQL")
for i in range(0, 1000, 10):  # fetch up to 1000 results, 10 per request
    data['start'] = i
    resp = requests.post("http://search.caas.cn/search/7_http/elevate", data=data, headers=headers).json()
    docs = resp['response']['docs']
    for ss in docs:
        # primary key
        productId = str(uuid.uuid1())
        # title
        title = ss['title']
        # article URL
        indexUrl = ss['indexUrl']
        # publish date
        pubDateStr = ss['pubDateStr']
        # body text
        content = ss['content']
        # skip Word documents; they are not HTML pages
        if indexUrl.endswith('doc'):
            continue
        resp = requests.get(indexUrl)
        page_one = BeautifulSoup(resp.content, "html.parser")
        print(indexUrl)
        # source (publisher); skip pages that lack the author block
        author_div = page_one.find('div', class_='articleAuthor')
        if author_div is None or author_div.find('strong') is None:
            continue
        source = author_div.find('strong').text.strip()
        sql = "insert into knowledge(id,title,source,timet,content,p_type,url) VALUES (%s,%s,%s,%s,%s,%s,%s)"
        cur.execute(sql, (productId, title, source, pubDateStr, content, "能源环保", indexUrl))
    print("Finished the batch starting at offset {}".format(i))
    conn.commit()
    time.sleep(1)  # pause one second so the server doesn't fall over
cur.close()
conn.close()
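The script assumes a knowledge table already exists in the zhang database. For reference, here is a minimal sketch of a schema matching the INSERT statement above; the column types and lengths are my assumptions, not something from the original post:

import pymysql

# One-off setup: create the target table. Column names mirror the INSERT
# statement in the scraper; types and lengths are guesses (assumption).
conn = pymysql.connect(host='127.0.0.1', user='root', passwd='123456', db='zhang', charset='utf8')
with conn.cursor() as cur:
    cur.execute("""
        CREATE TABLE IF NOT EXISTS knowledge (
            id      VARCHAR(36) PRIMARY KEY,  -- uuid1 string
            title   VARCHAR(512),
            source  VARCHAR(255),
            timet   VARCHAR(64),              -- pubDateStr stored as text
            content TEXT,
            p_type  VARCHAR(64),
            url     VARCHAR(512)
        ) DEFAULT CHARSET = utf8
    """)
conn.commit()
conn.close()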
Let's check the result.

Perfect. Time to go home and sleep.


This post described a way to scrape energy-and-environment information from the CAAS search engine with Python. By sending POST requests and parsing the pages with BeautifulSoup, it grabs the title, publish date, source, and other details from the target site and stores them in a MySQL database.
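One caveat: the single time.sleep(1) is the only thing protecting the run from a flaky server. A common hardening pattern, sketched below with the standard requests/urllib3 retry machinery (my addition, not code from the original post), is to bound every request with a timeout and retry transient server errors:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Retry up to 3 times on 5xx responses with exponential backoff,
# and never let a single request hang for more than 10 seconds.
session = requests.Session()
retries = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
session.mount('http://', HTTPAdapter(max_retries=retries))

resp = session.post("http://search.caas.cn/search/7_http/elevate",
                    data=data, headers=headers, timeout=10)
resp.raise_for_status()
docs = resp.json()['response']['docs']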
