#coding:utf-8 import numpy as np import pandas as pd from sklearn.ensemble import IsolationForest ilf = IsolationForest(n_estimators=100, n_jobs=-1, # 使用全部cpu verbose=2, ) data = pd.read_excel('data.xlsx',index_col='AA') data = data.fillna(0) # 选取特征,不使用标签(类型) X_cols = ["BB", "CC"] print data.shape # 训练 ilf.fit(data[X_cols]) shape = data.shape[0] batch = 10**6 all_pred = [] for i in range(shape/batch+1): start = i * batch end = (i+1) * batch test = data[start:end] # 预测 pred = ilf.predict(test) all_pred.extend(pred) data['pred'] = all_pred data.to_excel('outliers.xlsx', columns=["pred",], header=False)
得到的结果
"D:\ProgramData\New Folder\python.exe" C:/Users/ZS/PycharmProjects/untitled/AS (23648, 6) Traceback (most recent call last): File "C:/Users/ZS/PycharmProjects/untitled/AS", line 16, in <module> ilf.fit(data[X_cols]) File "C:\Users\ZS\AppData\Roaming\Python\Python27\site-packages\sklearn\ensemble\iforest.py", line 162, in fit X = check_array(X, accept_sparse=['csc']) File "C:\Users\ZS\AppData\Roaming\Python\Python27\site-packages\sklearn\utils\validation.py", line 448, in check_array array = array.astype(np.float64) ValueError: could not convert string to float: ? Process finished with exit code 1