某公司的数据处理。
比如有数据:
0 1 20100304 51.14 5 62.2 0 0.0 1 6.82 4.08666666667 4 3
0 0 20100307 2.47333333333 3 75.9333333333 0 0.0 114 13.5666666667 2.86666666667 0 0
0 2 20100318 2.49333333333 3 58.6666666667 0 0.0 1 5.22666666667 1.61333333333 0 0
0 2 20100329 9.14666666667 14 70.6428571429 0 0.0 1 31.84 7.51333333333 11 7
0 0 20100401 3.36666666667 5 71.1 1 75.0 1 1.86 1.29333333333 0 0
0 1 20100401 7.44666666667 11 58.1909090909 0 0.0 51 40.8066666667 12.5066666667 2 1
0 0 20100401 1.0 1 138.0 0 0.0 1 1.0 1.0 0 0
1 0 20100428 1.0 1 85.0 0 0.0 1 0.0 0.0 0 0
1 0 20100429 1.0 1 128.0 0 0.0 1 24.54 3.59333333333 0 0
0 0 20100506 1.92 2 67.0 0 0.0 44 6.65333333333 1.91333333333 4 1
要求:
1、除去第1列和第2列。2、对剩下的日期列提取特征,以月份为特征,形式:201003
3、对剩下的数字列,用最大值和最小值的区间,划分十份,每一份就是一个特征,形式:[1,5)、[5,9)...[21,25],注意最后一个区间闭合了。
4、用上面获取的特征,把原始数据向量化,形式:
1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
开写:
先下载安装pyExcelerator:http://sourceforge.net/projects/pyexcelerator/,解压后setup.py install
# -*- coding: cp936 -*-
import numpy as np
from numpy import *
import pprint
from pyExcelerator import *
#all_col_feature存放特征们
all_col_feature = []
for i in range(2,13):
#获取[日期]列特征:201003,201004...201408...
if i == 2 :
FILENAME = "lost5"
date = [e[0:6] for e in loadtxt(FILENAME , delimiter = " " , usecols=(2,) , dtype=str)]
date = list(set(date)) #去重
date.sort()
all_col_feature.append(date)
#获取[数字]列特征:取最大值,最小值,划分10份,每一份就是一个特征,[1,5),[5,9)..[21,25]
else:
digit_col = filter(lambda e:e!=0,loadtxt(FILENAME , delimiter = " " , usecols=(i,) , dtype=float))#获取数据后去0
digit_col.sort()
max_num = digit_col[-1]
min_num = digit_col[0]
block_size = (max_num - min_num) /10
col_feature = []
for i in range(11):
col_feature.append(min_num+i*block_size)
all_col_feature.append(col_feature)
w = Workbook() #创建一个工作簿
ws = w.add_sheet('Zero_One_Data') #创建一个工作表
row_index = 0
col_index = 0
for i in range(len(all_col_feature)):
if i == 0:
for j in all_col_feature[i]:
ws.write(row_index,col_index,str(j))
col_index += 1
else:
for j in range(len(all_col_feature[i])-1):
ws.write(row_index,col_index,str(all_col_feature[i][j])+"~"+str(all_col_feature[i][j+1]))
col_index += 1
row_index += 1
out = file('out.txt','w')
res = []
for row in loadtxt(FILENAME , delimiter = " " , usecols=(2,3,4,5,6,7,8,9,10,11,12) , dtype=str):
#数据整理成这个形式:every_row_feature_index = [0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1]
#比如:第一个0表示在[日期]列的特征下标为0,-1表示NONE值
#比如:第一个1表示在第二列的特征下标为1,心好累,我这都要注释?
every_row_feature_index = []
a = [i for i in range(len(all_col_feature[0])) if row[0][0:6] <= all_col_feature[0][i]]
every_row_feature_index.append(a[0])
for col in range(1,11):
b = [i for i in range(len(all_col_feature[col])) if float(row[col]) < float(all_col_feature[col][i])]
if len(b) == 0:#max_num
every_row_feature_index.append(len(all_col_feature)-2)
else:
every_row_feature_index.append(b[0]-1) #-1表NONE
#生成向量
#就是把every_row_feature_index = [0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1]转成下面的形式:
#[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
zero_one_every_row_res = []
col_index = 0
for i in range(len(every_row_feature_index)):
if i == 0: #处理[日期]列
for j in range(every_row_feature_index[i]):
out.write('0'+' ')
ws.write(row_index,col_index,'0')
col_index += 1
out.write('1'+' ')
ws.write(row_index,col_index,'1')
col_index += 1
for j in range(every_row_feature_index[i]+1,len(all_col_feature[i])):
out.write('0'+' ')
ws.write(row_index,col_index,'0')
col_index += 1
else:#处理其他列
if every_row_feature_index[i] == -1: #-1表NONE值 向量全为0
for j in range(len(all_col_feature[i])-1):
out.write('0'+' ')
ws.write(row_index,col_index,'0')
col_index += 1
else:
for j in range(every_row_feature_index[i]):
out.write('0'+' ')
ws.write(row_index,col_index,'0')
col_index += 1
out.write('1'+' ')
ws.write(row_index,col_index,'1')
col_index += 1
for j in range(every_row_feature_index[i]+1,len(all_col_feature[i])-1):
out.write('0'+' ')
ws.write(row_index,col_index,'0')
col_index += 1
out.write('\n')
row_index += 1
if row_index == 100:#为了更快看到结果,只设置100行,去掉可得到全部
break
out.close()
w.save('out.xls')
看看结果: