文本处理脚本

最新推荐文章于 2024-11-28 08:04:33 发布

原创最新推荐文章于 2024-11-28 08:04:33 发布 · 799 阅读

0 ·

CC 4.0 BY-SA版权

python 专栏收录该内容

3 篇文章

订阅专栏

最近要帮师兄处理一些数据，一个个手动粘贴太麻烦了，于是写了一个文本处理脚本。该脚本只针对这一特定任务可用，但先留存下来，以后如有需要再进行修改。

#-*- encoding:utf8 -*-


import os
import re
import sys

# Module metadata: name of the script's author.
__author__ = 'sangoly'


class DataExtractor(object):
    """Collect X/Y coordinate data from a directory of text files.

    The source directory must contain exactly one "X" file whose name starts
    with ``bps_`` followed by the number of "Y" files (e.g. ``bps_3.txt``),
    and that many "Y" files that share a common prefix and are numbered
    ``1..N`` (e.g. ``run_1.txt`` ... ``run_3.txt``).  Every other entry in
    the directory is counted as a Y file.

    The X file contributes one coordinate per line.  Each Y file contributes
    the value that follows the first ``total`` keyword, with a decimal comma
    converted to a decimal point.  The result is written as two lines,
    ``x=[...];`` and ``y=[...];``, to ``<target_file_name><suffix>`` in the
    target directory.
    """

    def __init__(self, source_path='.', target_path=None):
        """Remember where to read from and where to write to.

        :param source_path: directory holding the X and Y files.
        :param target_path: directory for the result file; falls back to
            ``source_path`` when empty or ``None``.
        """
        self._source_path = source_path
        self._target_path = target_path if target_path else source_path

    def do_extract_action(self):
        """Scan the source directory, then extract and write the data."""
        file_name_info_list = self._file_scanner()
        self._extract_data(file_name_info_list)

    def _file_scanner(self):
        """Work out the naming scheme of the files in the source directory.

        :returns: ``[x_name, y_prefix, y_count, suffix]`` where ``x_name`` is
            the X file name without extension, ``y_prefix`` is the common Y
            file prefix including the trailing underscore, ``y_count`` is the
            number of Y files as a string, and ``suffix`` is the extension
            including the leading dot.
        :raises SystemExit: (status 1) when no ``bps_`` file exists or the
            number of remaining files differs from the announced count.
        """
        # Sorted so that the chosen files do not depend on listdir() order.
        file_list = sorted(os.listdir(self._source_path))
        x_file_name = next(
            (name for name in file_list if name.startswith("bps_")), None)
        if not x_file_name:
            print("Not found x file, please check it and try again.")
            sys.exit(1)
        file_list.remove(x_file_name)

        # splitext() copes with names that contain no dot or several dots,
        # where split('.') plus two-value unpacking would raise.
        x_source_name, source_suffix = os.path.splitext(x_file_name)
        y_source_number = x_source_name.split('_')[1]
        if int(y_source_number) != len(file_list):
            print("The Y files' number is not correct!")
            sys.exit(1)

        # The Y prefix is everything before the trailing "_<index>" part of
        # any Y file; with zero Y files there is nothing to derive it from.
        y_source_name = ''
        if file_list:
            y_source_name = '_'.join(file_list[0].split('_')[:-1]) + "_"
        return [x_source_name, y_source_name, y_source_number, source_suffix]

    def _extract_data(self, file_name_info_list, target_file_name="result"):
        """Read the X and Y values and write them to the result file.

        :param file_name_info_list: the list returned by ``_file_scanner``.
        :param target_file_name: base name (without extension) of the output
            file created inside the target directory.
        :raises ValueError: when a Y file contains no ``total`` entry.

        I/O errors are reported on stdout and otherwise swallowed, so a
        missing file aborts the extraction without a traceback.
        """
        x_source_name, y_source_name, y_source_number, source_suffix = \
            file_name_info_list[:4]

        x_prefix = "x=["
        y_prefix = "y=["
        suffix = "];"

        # Matches "total<whitespace>value<whitespace>".
        re_pattern = re.compile(r'total\s+\S+\s')

        # NOTE: paths are joined directly onto the configured directories.
        # Prepending os.sep would turn relative directories such as '.' into
        # paths under the filesystem root.
        x_source_real_path = os.path.join(self._source_path,
                                          x_source_name + source_suffix)
        try:
            # Get the x coordinates: one per line, blank lines included.
            with open(x_source_real_path) as x_file:
                x_coordinate_list = [line.strip() for line in x_file]

            # Get the y coordinates: one value per numbered Y file.
            y_coordinate_list = []
            for i in range(1, int(y_source_number) + 1):
                y_source_file_real_path = os.path.join(
                    self._source_path,
                    y_source_name + str(i) + source_suffix)
                with open(y_source_file_real_path) as y_file:
                    match = re_pattern.search(y_file.read())
                if match is None:
                    raise ValueError("No 'total' entry found in %s"
                                     % y_source_file_real_path)
                # The match holds exactly two whitespace-separated fields,
                # so this works for tabs and spaces alike.
                data_string = match.group().split()[1].replace(',', '.')
                y_coordinate_list.append(data_string)

            target_file_real_path = os.path.join(
                self._target_path, target_file_name + source_suffix)
            with open(target_file_real_path, 'w') as target_file:
                target_file.write(
                    x_prefix + ','.join(x_coordinate_list) + suffix + '\n\n')
                target_file.write(
                    y_prefix + ','.join(y_coordinate_list) + suffix)
        except IOError as e:
            print('File not exist.')
            print(e)

if __name__ == '__main__':
    # Usage: python <script> SOURCE_DIR [TARGET_DIR]
    if len(sys.argv) < 2:
        print("The argument's number must be two at least(include the script name)")
        # A usage error is a failure, so report a non-zero status.
        sys.exit(1)
    # As default the target_path is same to source_path
    source_path = sys.argv[1]
    target_path = sys.argv[2] if len(sys.argv) > 2 else None
    data_extractor = DataExtractor(source_path, target_path)
    data_extractor.do_extract_action()