最近要帮师兄处理一些数据,逐个手动粘贴太麻烦,于是写了一个文本处理脚本。它只适用于这个特定任务,先留存下来,以后如有需要再进行修改。
#-*- encoding:utf8 -*-
import os
import re
import sys
__author__ = 'sangoly'
class DataExtractor(object):
    """Collect X/Y coordinate data from a directory into one result file.

    The source directory must contain one "X" file whose name starts with
    ``bps_`` followed by the number of "Y" files (for example
    ``bps_3.txt``).  Every other entry in the directory is counted as a Y
    file; the Y files are read as ``<prefix>_1``, ``<prefix>_2``, ... using
    the same extension as the X file.

    ``print`` is always called with a single parenthesised argument so the
    class behaves the same on Python 2 and Python 3.
    """

    # Captures the first whitespace-delimited token that follows "total".
    _TOTAL_PATTERN = re.compile(r'total\s+(\S+)\s')

    def __init__(self, source_path='.', target_path=None):
        """Remember the input and output directories.

        source_path -- directory holding the X file and the Y files.
        target_path -- directory that receives the result file; when it is
                       empty or None the source directory is used.
        """
        self._source_path = source_path
        self._target_path = target_path or source_path

    def do_extract_action(self):
        """Scan the source directory, then write the result file."""
        file_name_info_list = self._file_scanner()
        self._extract_data(file_name_info_list)

    def _file_scanner(self):
        """Work out the naming scheme of the files in the source directory.

        Returns a four-item list:
        ``[x_name_without_extension, y_name_prefix, y_count_as_str,
        extension_including_dot]``.

        Prints a message and exits with status 1 when no ``bps_`` file is
        present or when the count encoded in its name does not equal the
        number of remaining directory entries.
        """
        file_list = os.listdir(self._source_path)
        x_file_name = next(
            (name for name in file_list if name.startswith("bps_")), None)
        if x_file_name is None:
            print("Not found x file, please check it and try again.")
            sys.exit(1)
        file_list.remove(x_file_name)

        # splitext copes with names that contain no dot or several dots,
        # where unpacking the result of split('.') would raise ValueError.
        x_source_name, source_suffix = os.path.splitext(x_file_name)
        y_source_number = x_source_name.split('_')[1]
        try:
            expected_count = int(y_source_number)
        except ValueError:
            # A non-numeric count can never match the directory listing.
            expected_count = -1
        if expected_count != len(file_list):
            print("The Y files' number is not correct!")
            sys.exit(1)

        if file_list:
            # Drop the trailing "<index><ext>" segment of any Y file name.
            y_source_name = '_'.join(file_list[0].split('_')[:-1]) + "_"
        else:
            # The X file announces zero Y files: there is no prefix to learn.
            y_source_name = ''
        return [x_source_name, y_source_name, y_source_number, source_suffix]

    def _extract_data(self, file_name_info_list, target_file_name="result"):
        """Read the X and Y files and write them out as two bracketed lists.

        file_name_info_list -- the list produced by ``_file_scanner``.
        target_file_name    -- base name (without extension) of the output
                               file created in the target directory.

        The output consists of ``x=[...];``, an empty line, and ``y=[...];``.
        X values are the stripped lines of the X file.  Each Y value is the
        token following "total" in the corresponding Y file, with commas
        replaced by dots.

        An IOError while reading or writing is reported on stdout and the
        method returns without raising.  Raises ValueError when a Y file
        contains no "total" entry.
        """
        x_source_name, y_source_name, y_source_number, source_suffix = \
            file_name_info_list[:4]

        # Paths are joined onto the configured directories directly.
        # Prefixing os.sep would turn a relative directory such as '.'
        # into a path under the filesystem root.
        x_source_real_path = os.path.join(
            self._source_path, x_source_name + source_suffix)
        try:
            # Get the x coordinates.
            with open(x_source_real_path) as x_file:
                x_coordinate_list = [line.strip() for line in x_file]

            # Get the y coordinates, one per numbered Y file.
            y_coordinate_list = []
            for i in range(1, int(y_source_number) + 1):
                y_source_file_real_path = os.path.join(
                    self._source_path,
                    y_source_name + str(i) + source_suffix)
                with open(y_source_file_real_path) as y_file:
                    match = self._TOTAL_PATTERN.search(y_file.read())
                if match is None:
                    raise ValueError(
                        "No 'total' entry found in %s"
                        % y_source_file_real_path)
                y_coordinate_list.append(match.group(1).replace(',', '.'))

            target_file_real_path = os.path.join(
                self._target_path, target_file_name + source_suffix)
            with open(target_file_real_path, 'w') as target_file:
                target_file.write(
                    "x=[" + ','.join(x_coordinate_list) + "];" + '\n\n')
                target_file.write(
                    "y=[" + ','.join(y_coordinate_list) + "];")
        except IOError as e:
            print('File not exist.')
            print(e)
if __name__ == '__main__':
    # Usage: <script> SOURCE_DIR [TARGET_DIR]
    # A missing SOURCE_DIR is a usage error, so the process exits with a
    # non-zero status instead of reporting success.
    if len(sys.argv) < 2:
        print("The argument's number must be two at least(include the script name)")
        sys.exit(1)
    source_path = sys.argv[1]
    # With no TARGET_DIR the extractor writes into the source directory.
    target_path = sys.argv[2] if len(sys.argv) > 2 else None
    data_extractor = DataExtractor(source_path, target_path)
    data_extractor.do_extract_action()