download.py:
# -*- coding: utf-8 -*-
import os
import time
import utils
import configparser
import pyhdfs


def load_conf():
    print("Loading configuration file")
    cf = configparser.ConfigParser()
    cf.read("z.conf", encoding="UTF-8")
    secs = cf.sections()
    if len(secs) == 0:
        raise TypeError("Invalid configuration file")
    print("Configuration file loaded")
    print("Parsing configuration file")
    # Collect each section's options into a dict; only the first section is used
    conf_dict_list = [dict() for _ in secs]
    for x in range(len(secs)):
        sec = secs[x]
        ops_list = cf.options(sec)
        for ops in ops_list:
            conf_dict_list[x][ops] = cf.get(sec, ops)
    print("Configuration file parsed successfully")
    return conf_dict_list[0]


def download_file(hdfs_ip, hdfs_port, hdfs_user, data_path, t_our_suffix, num_day,
                  download_path, hdfs_temp_file_path, download_file_path):
    count = 0
    print("Scanning the root directory " + data_path + " for folders to process")
    # Build the list of date folder names that need processing
    time_dir_name_list = utils.get_time_list(num_day)
    print("Dates to process: " + ",".join(time_dir_name_list))
    # pyhdfs expects a "host:port" string and a user name
    hdfs_client = pyhdfs.HdfsClient(hosts=hdfs_ip + ":" + str(hdfs_port), user_name=hdfs_user)
    # List the files and folders directly under the root data directory
    dirs = hdfs_client.listdir(data_path)
    for d_name in dirs:
        t_path = data_path + '/' + d_name
        # Skip anything that is not a directory
        if hdfs_client.get_file_status(t_path).type != "DIRECTORY":
            continue
        if d_name not in time_dir_name_list:
            continue
        # Scan the files inside the date folder
        files = hdfs_client.listdir(t_path)
        for file_name in files:
            file_path = t_path + "/" + file_name
            # Skip files that do not end with the expected suffix
            if not file_name.endswith(t_our_suffix):
                continue
            # Skip anything that is not a regular file
            if hdfs_client.get_file_status(file_path).type != "FILE":
                continue
            print("Downloading file: " + file_path)
            _, shot_name, extension = utils.get_file_path_file_name_file_ext(file_path)
            # Download the file into a local folder named after the date (dashes removed)
            local_d_name = str(d_name).replace('-', '')
            local_file_paths = download_path + "/" + local_d_name
            if not os.path.exists(local_file_paths):
                os.mkdir(local_file_paths)
            local_file_path = local_file_paths + "/" + file_name
            print("Copying the Hadoop file to the local machine")
            hdfs_client.copy_to_local(file_path, local_file_path, overwrite=True)
            # Read the local file, classify its lines and re-upload them
            try:
                with open(local_file_path, "r", encoding="UTF-8") as f:
                    lines = f.readlines()
                url = os.path.basename(local_file_path)
                li = url.split('_')
            except Exception:
                # Reading the file failed; skip it
                print("Failed to read " + local_file_path)
                continue
            # Strip trailing newlines
            temp_lines_list = [x.strip() for x in lines]
            if len(temp_lines_list) < 2:
                # Too few lines; skip this file
                print("Too few lines in " + local_file_path + ", skipping")
                continue
            # Date folder name reconstructed from the file name prefix (YYYYMMDD -> YYYY-MM-DD)
            day_dir = li[0][:4] + '-' + li[0][4:6] + '-' + li[0][6:8]
            # Classify each line by its 9th "|"-separated field and append it to a
            # per-category, per-date output file; remember which files were written
            classified_files = set()
            for i in temp_lines_list:
                now_title = i.split("|")
                # Only the cqzb and hnzb categories are kept; every other line is dropped
                if now_title[8] not in ('cqzb', 'hnzb'):
                    continue
                path = os.path.join(download_file_path, now_title[8])
                if not os.path.exists(path):
                    os.mkdir(path)
                time_path = os.path.join(path, day_dir)
                if not os.path.exists(time_path):
                    os.mkdir(time_path)
                print(time_path)
                urls = time_path + '/' + li[0] + '_' + li[1] + '_' + now_title[8] + '_' + li[3] + '_' + li[4]
                with open(urls, 'a', encoding='utf-8') as out_f:
                    out_f.write(i + '\n')
                classified_files.add(urls)
            print("Uploading the classified files back to Hadoop")
            # Note: the original post uploaded an undefined local_temp_file_path; here we
            # assume the classified files written above are what should be re-uploaded,
            # into a per-date sub-folder under hdfs_temp_file_path.
            hdfs_day_path = hdfs_temp_file_path + "/" + day_dir
            for local_temp_file_path in classified_files:
                hdfs_target_path = hdfs_day_path + "/" + os.path.basename(local_temp_file_path)
                try:
                    hdfs_client.copy_from_local(local_temp_file_path, hdfs_target_path, overwrite=True)
                except Exception as e:
                    print(e)
                print(local_temp_file_path, hdfs_target_path)
            print(file_path, local_file_path)
            print("Upload to Hadoop finished, moving on....")
print("将Hadoop中下载完成的文件更改为processed文件防止重复处理")
hdfs_our_origin_temp_file_path = t_path + "/" + shot_name + our_suffix + "_processed"
hdfs_client.rename(file_path, hdfs_our_origin_temp_file_path)
print(file_path, hdfs_our_origin_temp_file_path)
count += 1
return count


if __name__ == '__main__':
    # Load the configuration
    print("Starting program")
    base_dict = load_conf()
    our_suffix = base_dict["our_suffix"]
    num_day = int(base_dict["num_day"])
    hdfs_ip = base_dict["hdfs_ip"]
    hdfs_port = int(base_dict["hdfs_port"])
    hdfs_user = base_dict["hdfs_user"]
    hdfs_data_path = base_dict["hdfs_data_path"]
    download_path = base_dict["download_path"]
    sleep_time = int(base_dict["sleep_time"])
    hdfs_temp_file_path = base_dict["hdfs_temp_file_path"]
    download_file_path = base_dict["download_file_path"]
    while True:
        start_time = time.time()
        print("New cycle starting...")
        count = download_file(hdfs_ip, hdfs_port, hdfs_user, hdfs_data_path, our_suffix, num_day,
                              download_path, hdfs_temp_file_path, download_file_path)
        if count > 0:
            print("This cycle processed " + str(count) + " file(s) in " + str(time.time() - start_time) + " seconds.")
        else:
            print("This cycle did not process any files.")
        print("Sleeping for " + str(sleep_time) + " seconds before the next cycle.")
        time.sleep(sleep_time)
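
The script imports a local utils module that is not included in the post. Below is a minimal sketch of the two helpers it calls, reconstructed from how they are used above; the implementations (and the exact date range returned by get_time_list) are assumptions.

utils.py:
# Sketch of the helpers download.py expects from utils (assumed implementations)
import os
import datetime


def get_time_list(num_day):
    # Return the last num_day dates as 'YYYY-MM-DD' strings (today included),
    # matching the date-named folders scanned on HDFS.
    today = datetime.date.today()
    return [(today - datetime.timedelta(days=n)).strftime("%Y-%m-%d") for n in range(num_day)]


def get_file_path_file_name_file_ext(file_path):
    # Split a path into (directory, file name without extension, extension).
    dir_path, full_name = os.path.split(file_path)
    shot_name, extension = os.path.splitext(full_name)
    return dir_path, shot_name, extension
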
z.conf:
[base]
# Sleep time between cycles, in seconds
sleep_time=30
our_suffix=.out
hdfs_ip=127.0.0.1
hdfs_port=8080
hdfs_user=yuqing
hdfs_data_path=
num_day=1
download_path=
hdfs_temp_file_path=
download_file_path=
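
Before starting the long-running loop it is worth checking that the WebHDFS endpoint and the paths in z.conf are reachable, once the empty path values have been filled in. The small script below is a hypothetical helper (check_hdfs.py is not part of the original code); it only uses pyhdfs calls that download.py already relies on.

check_hdfs.py:
# Hypothetical sanity check: connect to WebHDFS with the z.conf settings
# and list the configured data directory.
import configparser
import pyhdfs

cf = configparser.ConfigParser()
cf.read("z.conf", encoding="UTF-8")
base = cf["base"]

client = pyhdfs.HdfsClient(hosts=base["hdfs_ip"] + ":" + base["hdfs_port"],
                           user_name=base["hdfs_user"])
# Raises an exception if the connection settings or the path are wrong
print(client.listdir(base["hdfs_data_path"]))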