1.在Python中可以用来连接PostgreSQL的模块很多,这里比较推荐psycopg2:
2.安装:pip install psycopg2
3.爬取信息 存入postgre 使用Selenium :
# -*- coding:utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from time import sleep
import psycopg2
import sys
reload(sys)
sys.setdefaultencoding('utf8')
# 全国空气质量信息 爬取
broser = webdriver.Chrome()
# 城市List
cityList = []
# 网站更新时间
updateTimeList = []
# H
next24hourList = []
next48hourList = []
next72hourList = []
# 24小时---------------------
# 指标 N-N
aqiNumStart24List =[]
aqiNumEnd24List =[]
# 主要污染物
majorPollutant24sList =[]
# 污染程度
pollutionLever24List =[]
# 48小时---------------------
# 指标 N-N
aqiNumStart48List =[]
aqiNumEnd48List =[]
# 主要污染物
majorPollutant48sList =[]
# 污染程度
pollutionLever48List =[]
# 72小时---------------------
# 指标 N-N
aqiNumStart72List =[]
aqiNumEnd72List =[]
# 主要污染物
majorPollutant72sList =[]
# 污染程度
pollutionLever72List =[]
# 获取数据
def get_data():
try:
broser.get('http://106.37.208.228:8082/')
# 显示等待 (确保指定节点被加载出来)
wait = WebDriverWait(broser, 10)
# 在指定时间内 指定id被加载出来 则打印源码 否则 抛出异常
wait.until(EC.presence_of_element_located((By.ID, 'list')))
# print 'html:', broser.page_source
soup = BeautifulSoup(broser.page_source, 'lxml')
# 24h---------------------------------------------------------------------------------------------------------------
# 24 h city
for i in range(1, 47):
city = soup.select_one(
'#list > div:nth-child(' + i.__str__() + ') > a > div.pad5-t > p > span.sites-name').get_text()
print 'city : ', city
cityList.append(city)
# 24 h 数字指标
for i in range(1, 47):
A = soup.select_one(
'#list > div:nth-child(' + i.__str__() + ') > a > div.pad5-t > p > span.aqi-num1 > span:nth-child(1)').get_text()
B = soup.select_one(
'#list > div:nth-child(' + i.__str__() + ') > a > div.pad5-t > p > span.aqi-num1 > span:nth-child(2)').get_text()
print '24h A :', A
print '24h B :', B
aqiNumStart24List.append(A)
aqiNumEnd24List.append(B)
# 24 h aqi_item 主要污染物
for i in range(1, 47):
aqi_item = soup.select_one(
'#list > div:nth-child(' + i.__str__() + ') > a > div.pad5-t > div > span.aqi-item').get_text()
print '24h aqi_item : ', aqi_item
majorPollutant24sList.append(aqi_item)
# 24 h 污染程度
for i in range(1, 47):
cd = soup.select_one('#list > div:nth-child(' + i.__str__() + ') > a > div.pad5-t > div').get_text()
cdIndex = cd.strip().find('|')
print '24h cd : ', cd[cdIndex + 5:].lstrip()
pollutionLever24List.append(cd[cdIndex + 5:].lstrip())
# 48h---------------------------------------------------------------------------------------------------------------
# 48 h 数字指标
for i in range(1, 47):
A = soup.select_one(
'#list_48hour > div:nth-child(' + i.__str__() + ') > a > div.pad5-t > p > span.aqi-num1 > span:nth-child(1)').get_text()
B = soup.select_one(
'#list_48hour > div:nth-child(' + i.__str__() + ') > a > div.pad5-t > p > span.aqi-num1 > span:nth-child(2)').get_text()
print '48h A :', A
print '48h B :', B
aqiNumStart48List.append(A)
aqiNumEnd48List.append(B)
# 48 h aqi_item
for i in range(1, 47):
aqi_item = soup.select_one(
'#list_48hour > div:nth-child(' + i.__str__() + ') > a > div.pad5-t > div > span.aqi-item').get_text()
print '48h aqi_item : ', aqi_item
majorPollutant48sList.append(aqi_item)
# 48 h 污染程度
for i in range(1, 47):
cd = soup.select_one('#list_48hour > div:nth-child(' + i.__str__() + ') > a > div.pad5-t > div').get_text()
cdIndex = cd.strip().find('|')
print '48h cd : ', cd[cdIndex + 5:].lstrip()
pollutionLever48List.append(cd[cdIndex + 5:].lstrip())
# 72h---------------------------------------------------------------------------------------------------------------
# 72 h 数字指标
for i in range(1, 47):
A = soup.select_one(
'#list_72hour > div:nth-child(' + i.__str__() + ') > a > div.pad5-t > p > span.aqi-num1 > span:nth-child(1)').get_text()
B = soup.select_one(
'#list_72hour > div:nth-child(' + i.__str__() + ') > a > div.pad5-t > p > span.aqi-num1 > span:nth-child(2)').get_text()
print '72h A :', A
print '72h B :', B
aqiNumStart72List.append(A)
aqiNumEnd72List.append(B)
# 72 h aqi_item
for i in range(1, 47):
aqi_item = soup.select_one(
'#list_72hour > div:nth-child(' + i.__str__() + ') > a > div.pad5-t > div > span.aqi-item').get_text()
print '72h aqi_item : ', aqi_item
majorPollutant72sList.append(aqi_item)
# 72 h 污染程度
for i in range(1, 47):
cd = soup.select_one('#list_72hour > div:nth-child(' + i.__str__() + ') > a > div.pad5-t > div').get_text()
cdIndex = cd.strip().find('|')
print '72h cd : ', cd[cdIndex + 5:].lstrip()
pollutionLever72List.append(cd[cdIndex + 5:].lstrip())
# 获取网站更新时间
updateTime = soup.select_one('#contaniner > div.title-bar > span').get_text()
print updateTime[0:4]
print updateTime[5:7]
print updateTime[8:10]
print updateTime[11:13]
riqi = updateTime[0:4] + '-' + updateTime[5:7] + '-' + updateTime[8:10] + '-' + updateTime[11:13]
print '网站更新日期为 ; ' ,riqi
for i in range(46):
updateTimeList.append(riqi)
next24hourList.append(24)
next48hourList.append(48)
next72hourList.append(72)
except Exception:
print '爬取出错...'
finally:
sleep(10)
broser.close()
# 存入 固定信息 city 到 PostgreSQL
def insert_post_greg():
for cityLists in zip(cityList):
dataList0 = [cityLists]
print '城市0 :',dataList0
# database – 数据库名称
# user – 用户名
# password – 密码
# host – 服务器地址 (如果不提供默认连接Unix Socket)
# port – 连接端口 (默认5432)
conn = psycopg2.connect(host="xxx", user="dac", password="dac", database="cmp_dac")
# 使用cursor()方法获取操作游标
cursor = conn.cursor()
try:
print '城市: ', cityList
insertsql = "INSERT into t_basic_air_quality(city) VALUES(%s)"
cursor.execute(insertsql, dataList0)
conn.commit()
print ('okok')
except Exception as e:
print e
conn.rollback()
cursor.close()
# 关闭数据库连接
conn.close()
# 获取id postgre查询
def get_id():
# 获取城市 id
conn = psycopg2.connect(host="xxx", user="dac", password="dac", database="cmp_dac")
cursor = conn.cursor()
sql = 'SELECT id FROM t_basic_air_quality'
cursor.execute(sql)
# 获得数据
data = cursor.fetchall()
return data
# 关闭指针和数据库
cursor.close()
conn.close()
# 存 postgre
def inset():
idList = get_id()
# for idLists,aqiNumStart24Lists,majorPollutant24sLists,pollutionLever24Lists,next24hourLists,updateTimeLists,aqiNumEnd24Lists in zip(idList,aqiNumStart24List,majorPollutant24sList,pollutionLever24List,next24hourList,updateTimeList,aqiNumEnd24List):
# dataList24 = [idLists,aqiNumStart24Lists,majorPollutant24sLists,pollutionLever24Lists,next24hourLists,updateTimeLists,aqiNumEnd24Lists]
# conn = psycopg2.connect(host="xxx", user="dac", password="dac", database="cmp_dac")
# # 使用cursor()方法获取操作游标
# cursor = conn.cursor()
# try:
# print '城市: ', cityList
# insertsql = "INSERT INTO t_basic_air_quality_detail(a_id,aqi_num_start,major_pollutants,pollution_lever,next_hour,air_update_time,aqi_num_end) VALUES(%s,%s,%s,%s,%s,%s,%s)"
# cursor.execute(insertsql, dataList24)
# conn.commit()
# print ('okok')
# except Exception as e:
# print e
# conn.rollback()
# cursor.close()
# # 关闭数据库连接
# conn.close()
# for idLists,aqiNumStart48Lists,majorPollutant48sLists,pollutionLever48Lists,next48hourLists,updateTimeLists,aqiNumEnd48Lists in zip(idList,aqiNumStart48List,majorPollutant48sList,pollutionLever48List,next48hourList,updateTimeList,aqiNumEnd48List):
# dataList48 = [idLists,aqiNumStart48Lists,majorPollutant48sLists,pollutionLever48Lists,next48hourLists,updateTimeLists,aqiNumEnd48Lists]
# conn = psycopg2.connect(host="xxx", user="dac", password="dac", database="cmp_dac")
# # 使用cursor()方法获取操作游标
# cursor = conn.cursor()
# try:
# print '城市: ', cityList
# insertsql = "INSERT INTO t_basic_air_quality_detail(a_id,aqi_num_start,major_pollutants,pollution_lever,next_hour,air_update_time,aqi_num_end) VALUES(%s,%s,%s,%s,%s,%s,%s)"
# cursor.execute(insertsql, dataList48)
# conn.commit()
# print ('okok')
# except Exception as e:
# print e
# conn.rollback()
# cursor.close()
# # 关闭数据库连接
# conn.close()
print '进入 insert 方法'
for idLists, aqiNumStart72Lists, majorPollutant72sLists, pollutionLever72Lists, next72hourLists, updateTimeLists,aqiNumEnd72Lists in zip(idList, aqiNumStart72List, majorPollutant72sList, pollutionLever72List, next72hourList, updateTimeList,aqiNumEnd72List):
dataList72 = [idLists, aqiNumStart72Lists, majorPollutant72sLists, pollutionLever72Lists, next72hourLists, updateTimeLists,aqiNumEnd72Lists]
print dataList72
conn = psycopg2.connect(host="xxx", user="dac", password="dac", database="cmp_dac")
# 使用cursor()方法获取操作游标
cursor = conn.cursor()
try:
print '城市: ', cityList
insertsql = "INSERT INTO t_basic_air_quality_detail(a_id,aqi_num_start,major_pollutants,pollution_lever,next_hour,air_update_time,aqi_num_end) VALUES(%s,%s,%s,%s,%s,%s,%s)"
cursor.execute(insertsql, dataList72)
conn.commit()
print ('okok')
except Exception as e:
print e
conn.rollback()
cursor.close()
# 关闭数据库连接
conn.close()
if __name__ == "__main__":
# 获取所有信息
get_data()
# 存入 city 经纬度到 postgre
# insert_post_greg()
inset()