# NOTE: Before uploading data, confirm you have permission to access the host
# and verify the exact HDFS upload path (hdfs_path).
import pandas as pd
from impala.dbapi import connect
from hdfs.client import Client
from hdfs import InsecureClient
import pyhdfs
def getDatabaseConnection():
    """
    Open and return a new database connection (Impala/Hive via impyla).

    NOTE(review): the ``XX`` values are placeholders — fill in real host,
    port, auth_mechanism, user, password and database values before running;
    as written these names are undefined and this call raises NameError.
    """
    return connect(host=XX, port=XX, auth_mechanism=XX, user=XX, password=XX, database=XX)
def executeSqlString(sql_string):
    """
    Execute a single SQL statement against the database.

    Args:
        sql_string: SQL text to execute.

    Returns:
        str: a confirmation message once the statement has run.
    """
    # Hold a reference to the connection so it can be closed explicitly;
    # the original only closed the cursor and leaked one connection per call.
    conn = getDatabaseConnection()
    try:
        with conn.cursor() as cur:
            cur.execute(sql_string)
        return 'SQL已执行'
    finally:
        conn.close()
def uploadDataframe2Database(table_name, create_table_sql, df_to_upload):
    """
    Recreate a table and upload a pandas DataFrame to it via HDFS.

    Drops ``table_name`` if it exists, recreates it with
    ``create_table_sql``, then writes ``df_to_upload`` as a headerless,
    '\\x01'-delimited text file to the table's HDFS data path.

    Args:
        table_name: Name of the target table.
        create_table_sql: CREATE TABLE statement used to recreate the table.
        df_to_upload: pandas DataFrame whose rows are uploaded.

    Returns:
        str: a confirmation message.

    NOTE(review): ``http1``/``http2``/``http3`` and ``hdfs_path`` are
    placeholders — set them to the real NameNode addresses and the table's
    HDFS location before running.
    """
    # Ask the cluster which NameNode is currently active so the write is
    # not sent to a standby node.
    fs = pyhdfs.HdfsClient(hosts=[http1, http2, http3])
    active_node = fs.get_active_namenode()
    # Drop any existing table, then create it fresh.
    executeSqlString(f'DROP TABLE IF EXISTS {table_name}')
    executeSqlString(create_table_sql)
    # Upload the DataFrame; sep='\001' matches "fields terminated by '\001'"
    # in the table DDL.
    hdfs_path = XX
    # InsecureClient expects a full WebHDFS URL; get_active_namenode()
    # returns a bare "host:port", so prepend the scheme (the original
    # passed the bare string, which the hdfs client cannot parse).
    client = InsecureClient(f'http://{active_node}')
    with client.write(hdfs_path, overwrite=True, encoding='utf-8') as writer:
        df_to_upload.to_csv(writer, index=False, header=False, sep='\001')
    return f'{table_name} 数据已上传!'
# --- Example usage: build a small DataFrame and upload it ---
df_test = pd.DataFrame({'class': ['A', 'B', 'C'], 'rank': [3, 2, 3]})
# Concrete table name (the original line "table_name = table_name" assigned
# an undefined name to itself and raised NameError).
table_name = 'df_test'
# Interpolate the table name into the DDL: the original f-string contained
# the literal word "table_name", so the created table never matched the
# `table_name` variable used for DROP and upload.
create_table_sql = f'''
CREATE TABLE {table_name}(
class STRING,
rank INT
)
row format delimited fields terminated by '\001' lines terminated by '\n' STORED AS textfile
'''
print(uploadDataframe2Database(table_name, create_table_sql, df_test))
# Reference (data upload walkthrough): "Python+SQL:在python中调用sql文件并运行转化为Datafarme" — CSDN blog