python 爬虫 存入 PostgreSQL Selenium使用

1.在Python中可以用来连接PostgreSQL的模块很多,这里比较推荐psycopg2:
2.安装:pip install psycopg2
在这里插入图片描述
3.爬取信息 存入postgre 使用Selenium :

# -*- coding:utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as  EC
from bs4 import BeautifulSoup
from time import sleep
import psycopg2
import sys
reload(sys)
sys.setdefaultencoding('utf8')
# 全国空气质量信息 爬取
broser = webdriver.Chrome()

# 城市List
cityList = []
# 网站更新时间
updateTimeList = []
# H
next24hourList = []
next48hourList = []
next72hourList = []

# 24小时---------------------
# 指标 N-N
aqiNumStart24List =[]
aqiNumEnd24List =[]
# 主要污染物
majorPollutant24sList =[]
# 污染程度
pollutionLever24List =[]

# 48小时---------------------
# 指标 N-N
aqiNumStart48List =[]
aqiNumEnd48List =[]
# 主要污染物
majorPollutant48sList =[]
# 污染程度
pollutionLever48List =[]

# 72小时---------------------
# 指标 N-N
aqiNumStart72List =[]
aqiNumEnd72List =[]
# 主要污染物
majorPollutant72sList =[]
# 污染程度
pollutionLever72List =[]


# 获取数据
def get_data():
    try:
        broser.get('http://106.37.208.228:8082/')
        # 显示等待 (确保指定节点被加载出来)
        wait = WebDriverWait(broser, 10)
        # 在指定时间内 指定id被加载出来 则打印源码 否则 抛出异常
        wait.until(EC.presence_of_element_located((By.ID, 'list')))
        # print 'html:', broser.page_source
        soup = BeautifulSoup(broser.page_source, 'lxml')
        # 24h---------------------------------------------------------------------------------------------------------------
        # 24 h city
        for i in range(1, 47):
            city = soup.select_one(
                '#list > div:nth-child(' + i.__str__() + ') > a > div.pad5-t > p > span.sites-name').get_text()
            print 'city : ', city
            cityList.append(city)
        # 24 h 数字指标
        for i in range(1, 47):
            A = soup.select_one(
                '#list > div:nth-child(' + i.__str__() + ') > a > div.pad5-t > p > span.aqi-num1 > span:nth-child(1)').get_text()
            B = soup.select_one(
                '#list > div:nth-child(' + i.__str__() + ') > a > div.pad5-t > p > span.aqi-num1 > span:nth-child(2)').get_text()
            print '24h A :', A
            print '24h B :', B
            aqiNumStart24List.append(A)
            aqiNumEnd24List.append(B)
        # 24 h aqi_item 主要污染物
        for i in range(1, 47):
            aqi_item = soup.select_one(
                '#list > div:nth-child(' + i.__str__() + ') > a > div.pad5-t > div > span.aqi-item').get_text()
            print '24h aqi_item : ', aqi_item
            majorPollutant24sList.append(aqi_item)
        # 24 h 污染程度
        for i in range(1, 47):
            cd = soup.select_one('#list > div:nth-child(' + i.__str__() + ') > a > div.pad5-t > div').get_text()
            cdIndex = cd.strip().find('|')
            print '24h cd : ', cd[cdIndex + 5:].lstrip()
            pollutionLever24List.append(cd[cdIndex + 5:].lstrip())

        # 48h---------------------------------------------------------------------------------------------------------------
        # 48 h 数字指标
        for i in range(1, 47):
            A = soup.select_one(
                '#list_48hour > div:nth-child(' + i.__str__() + ') > a > div.pad5-t > p > span.aqi-num1 > span:nth-child(1)').get_text()
            B = soup.select_one(
                '#list_48hour > div:nth-child(' + i.__str__() + ') > a > div.pad5-t > p > span.aqi-num1 > span:nth-child(2)').get_text()
            print '48h A :', A
            print '48h B :', B
            aqiNumStart48List.append(A)
            aqiNumEnd48List.append(B)
        # 48 h aqi_item
        for i in range(1, 47):
            aqi_item = soup.select_one(
                '#list_48hour > div:nth-child(' + i.__str__() + ') > a > div.pad5-t > div > span.aqi-item').get_text()
            print '48h aqi_item : ', aqi_item
            majorPollutant48sList.append(aqi_item)
        # 48 h 污染程度
        for i in range(1, 47):
            cd = soup.select_one('#list_48hour > div:nth-child(' + i.__str__() + ') > a > div.pad5-t > div').get_text()
            cdIndex = cd.strip().find('|')
            print '48h cd : ', cd[cdIndex + 5:].lstrip()
            pollutionLever48List.append(cd[cdIndex + 5:].lstrip())
        # 72h---------------------------------------------------------------------------------------------------------------
        # 72 h 数字指标
        for i in range(1, 47):
            A = soup.select_one(
                '#list_72hour > div:nth-child(' + i.__str__() + ') > a > div.pad5-t > p > span.aqi-num1 > span:nth-child(1)').get_text()
            B = soup.select_one(
                '#list_72hour > div:nth-child(' + i.__str__() + ') > a > div.pad5-t > p > span.aqi-num1 > span:nth-child(2)').get_text()
            print '72h A :', A
            print '72h B :', B
            aqiNumStart72List.append(A)
            aqiNumEnd72List.append(B)
        # 72 h aqi_item
        for i in range(1, 47):
            aqi_item = soup.select_one(
                '#list_72hour > div:nth-child(' + i.__str__() + ') > a > div.pad5-t > div > span.aqi-item').get_text()
            print '72h aqi_item : ', aqi_item
            majorPollutant72sList.append(aqi_item)
        # 72 h 污染程度
        for i in range(1, 47):
            cd = soup.select_one('#list_72hour > div:nth-child(' + i.__str__() + ') > a > div.pad5-t > div').get_text()
            cdIndex = cd.strip().find('|')
            print '72h cd : ', cd[cdIndex + 5:].lstrip()
            pollutionLever72List.append(cd[cdIndex + 5:].lstrip())

        # 获取网站更新时间
        updateTime = soup.select_one('#contaniner > div.title-bar > span').get_text()
        print updateTime[0:4]
        print updateTime[5:7]
        print updateTime[8:10]
        print updateTime[11:13]

        riqi = updateTime[0:4] + '-' + updateTime[5:7] + '-' + updateTime[8:10] + '-' + updateTime[11:13]
        print '网站更新日期为 ; ' ,riqi
        for i in range(46):
            updateTimeList.append(riqi)
            next24hourList.append(24)
            next48hourList.append(48)
            next72hourList.append(72)

    except Exception:
        print '爬取出错...'
    finally:
        sleep(10)
        broser.close()

# 存入 固定信息 city 到 PostgreSQL
def insert_post_greg():
    for cityLists in zip(cityList):
        dataList0 = [cityLists]
        print '城市0 :',dataList0
        #     database – 数据库名称
        #      user – 用户名
        #      password – 密码
        #      host – 服务器地址 (如果不提供默认连接Unix Socket)
        #      port – 连接端口 (默认5432)
        conn = psycopg2.connect(host="xxx", user="dac", password="dac", database="cmp_dac")
        # 使用cursor()方法获取操作游标
        cursor = conn.cursor()
        try:
            print '城市: ', cityList
            insertsql = "INSERT into t_basic_air_quality(city) VALUES(%s)"
            cursor.execute(insertsql, dataList0)
            conn.commit()
            print ('okok')
        except Exception as e:
            print e
            conn.rollback()
        cursor.close()
        # 关闭数据库连接
        conn.close()



# 获取id  postgre查询
def get_id():
    # 获取城市 id
    conn = psycopg2.connect(host="xxx", user="dac", password="dac", database="cmp_dac")
    cursor = conn.cursor()
    sql = 'SELECT id FROM t_basic_air_quality'
    cursor.execute(sql)
    # 获得数据
    data = cursor.fetchall()
    return data
    # 关闭指针和数据库
    cursor.close()
    conn.close()

# 存 postgre
def inset():
    idList = get_id()
    # for idLists,aqiNumStart24Lists,majorPollutant24sLists,pollutionLever24Lists,next24hourLists,updateTimeLists,aqiNumEnd24Lists in zip(idList,aqiNumStart24List,majorPollutant24sList,pollutionLever24List,next24hourList,updateTimeList,aqiNumEnd24List):
    #     dataList24 = [idLists,aqiNumStart24Lists,majorPollutant24sLists,pollutionLever24Lists,next24hourLists,updateTimeLists,aqiNumEnd24Lists]
    #     conn = psycopg2.connect(host="xxx", user="dac", password="dac", database="cmp_dac")
    #     # 使用cursor()方法获取操作游标
    #     cursor = conn.cursor()
    #     try:
    #         print '城市: ', cityList
    #         insertsql = "INSERT INTO t_basic_air_quality_detail(a_id,aqi_num_start,major_pollutants,pollution_lever,next_hour,air_update_time,aqi_num_end) VALUES(%s,%s,%s,%s,%s,%s,%s)"
    #         cursor.execute(insertsql, dataList24)
    #         conn.commit()
    #         print ('okok')
    #     except Exception as e:
    #         print e
    #         conn.rollback()
    #     cursor.close()
    #     # 关闭数据库连接
    #     conn.close()

    # for idLists,aqiNumStart48Lists,majorPollutant48sLists,pollutionLever48Lists,next48hourLists,updateTimeLists,aqiNumEnd48Lists in zip(idList,aqiNumStart48List,majorPollutant48sList,pollutionLever48List,next48hourList,updateTimeList,aqiNumEnd48List):
    #     dataList48 = [idLists,aqiNumStart48Lists,majorPollutant48sLists,pollutionLever48Lists,next48hourLists,updateTimeLists,aqiNumEnd48Lists]
    #     conn = psycopg2.connect(host="xxx", user="dac", password="dac", database="cmp_dac")
    #     # 使用cursor()方法获取操作游标
    #     cursor = conn.cursor()
    #     try:
    #         print '城市: ', cityList
    #         insertsql = "INSERT INTO t_basic_air_quality_detail(a_id,aqi_num_start,major_pollutants,pollution_lever,next_hour,air_update_time,aqi_num_end) VALUES(%s,%s,%s,%s,%s,%s,%s)"
    #         cursor.execute(insertsql, dataList48)
    #         conn.commit()
    #         print ('okok')
    #     except Exception as e:
    #         print e
    #         conn.rollback()
    #     cursor.close()
    #     # 关闭数据库连接
    #     conn.close()
    print '进入 insert 方法'
    for idLists, aqiNumStart72Lists, majorPollutant72sLists, pollutionLever72Lists, next72hourLists, updateTimeLists,aqiNumEnd72Lists in zip(idList, aqiNumStart72List, majorPollutant72sList, pollutionLever72List, next72hourList, updateTimeList,aqiNumEnd72List):
        dataList72 = [idLists, aqiNumStart72Lists, majorPollutant72sLists, pollutionLever72Lists, next72hourLists, updateTimeLists,aqiNumEnd72Lists]
        print dataList72
        conn = psycopg2.connect(host="xxx", user="dac", password="dac", database="cmp_dac")
        # 使用cursor()方法获取操作游标
        cursor = conn.cursor()
        try:
            print '城市: ', cityList
            insertsql = "INSERT INTO t_basic_air_quality_detail(a_id,aqi_num_start,major_pollutants,pollution_lever,next_hour,air_update_time,aqi_num_end) VALUES(%s,%s,%s,%s,%s,%s,%s)"
            cursor.execute(insertsql, dataList72)
            conn.commit()
            print ('okok')
        except Exception as e:
            print e
            conn.rollback()
        cursor.close()
        # 关闭数据库连接
        conn.close()

if __name__ == "__main__":
    # 获取所有信息
    get_data()
    # 存入 city 经纬度到 postgre
    # insert_post_greg()
    inset()


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值