python
使用python的pandas导入csv,不需要先创建表格。
postgre会自动创建表格和合适的数据类型。
安装psycopg2: pip install psycopg2
from sqlalchemy import create_engine
import pandas as pd
import os
# csv文件所在的文件夹
# 如果只有一个文件
Folder_Path = r'C:\Users'
table1 = pd.read_csv(Folder_Path + '\\' + '1.csv', header=0, encoding='utf-8')
connect = create_engine(
'postgresql+psycopg2://' + 'pg_username' + ':' + 'pg_password' +
'@pg_host' + ':' + str(5432) + '/' + 'Database')
pd.io.sql.to_sql(table1, name='table1', con=connect, index=False,
if_exists='replace', chunksize=1000)
connect.dispose()
# 可以将多个文件放在同一文件夹下做批量处理
os.chdir(Folder_Path)
file_list = os.listdir() # 获取文件夹下所有的文件名
for i in range(0, len(file_list)):
table2 = pd.read_csv(Folder_Path + '\\' + file_list[i], header=0, encoding='utf-8')
...
create_engine(
'postgresql+psycopg2://' + 'pg_username' + ':' + 'pg_password' + '@pg_host' + ':'
+ str(5432) + '/' + 'Database')
pd.io.sql.to_sql(table1, name='table2', con=connect, index=False,
if_exists='replace', chunksize=1000)
connect.dispose()
处理千万行数据快速很多的方法,将pd.io.sql.to_sql换成下面的函数:
def write_to_table(df, table_name, if_exists='fail'):
import io
import pandas as pd
from sqlalchemy import create_engine
db_engine = create_engine('postgresql+psycopg2://'+'pg_username'+':'+'pg_password'+'@pg_host'+':'+str(5432) + '/' + 'Database')# 初始化引擎
string_data_io = io.StringIO()
# index = True时,dataframe index 成为表格的一列
df.to_csv(string_data_io, sep='|', index=False)
pd_sql_engine = pd.io.sql.pandasSQL_builder(db_engine)
table = pd.io.sql.SQLTable(table_name, pd_sql_engine, frame=df,
index=False, if_exists=if_exists,schema = 'public')
table.create()
string_data_io.seek(0)
# string_data_io.readline() # remove header
with db_engine.connect() as connection:
with connection.connection.cursor() as cursor:
copy_cmd = "COPY public.%s FROM STDIN HEADER DELIMITER '|' CSV" %table_name
cursor.copy_expert(copy_cmd, string_data_io)
connection.connection.commit()
write_to_table(save_file, table_name, if_exists='replace')
如果想规定每一列的数据类型
import psycopg2
import csv
database_postgres = psycopg2.connect(database="postgres", user="postgres", password="postgres",
host="localhost", port="5432")
in_data = csv.reader(open('C:/Users/K2/Ivy/1.csv', 'r', encoding='UTF-8'))
cursor = database_postgres.cursor()
print('Inserting data...')
cursor.execute("DROP TABLE IF EXISTS temp;")
# 自定义数据类型
cursor.execute("create table temp ("
"col1 text,"
"col2 integer,"
"col3 text);")
for row in in_data:
for i in range(len(row)):
if row[i] == '':
row[i] = None
cursor.execute("INSERT INTO temp VALUES (%s,%s,%s)",
[row[0], row[1], row[2]])
print('Done inserting data...')
database_postgres.commit()
database_postgres.close()
另一种规定数据类型的方法:
from sqlalchemy.types import VARCHAR,DECIMAL,Integer
def setdtypedict(df):
dtypedict = {}
for i, j in zip(df.columns, df.dtypes):
if "object" in str(j):
dtypedict.update({i: VARCHAR(128)})
if "float" in str(j):
dtypedict.update({i: DECIMAL(19, 2)})
# 将所有float类型只保留小数点后两位
if "int" in str(j):
dtypedict.update({i: Integer})
return dtypedict
dtypedict = setdtypedict(df)
pd.io.sql.to_sql(df, name='table1', con=connect, index=True, index_label='time',
if_exists='replace', dtype=dtypedict, chunksize=1000)
psql
- 创建新表:
postgres=# create table temp(col1 text,col2 text,col3 text);
- 从文件导入数据
postgres=# \copy temp (col1, col2, col3) FROM 'C:/Users/1.csv' DELIMITER ',' CSV ENCODING 'UTF8';
pgadmin
- 需要先根据csv创建表格
- 选择表—>右键—>导入
- 选择要导入的csv文件,编码格式,分隔符
- 导入