0 2022/4/25
1 泰迪杯数据挖掘
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from adtk.detector import GeneralizedESDTestAD
from matplotlib.font_manager import FontProperties
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import data_handle
def data_box():
dataset = pd.read_csv('./全部数据/附件1-区域15分钟负荷数据.csv')
box_1, box_2, box_3, box_4 = dataset['总有功功率(kw)'][0:35040], dataset['总有功功率(kw)'][35040:70016], dataset['总有功功率(kw)'][
70016:105131], \
dataset['总有功功率(kw)'][105131:128156]
plt.figure(figsize=(10, 5))
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
plt.title('箱线图', fontsize=20)
labels = '2018', '2019', '2020', '2021'
plt.boxplot([box_1, box_2, box_3, box_4], labels=labels)
plt.show()
def data_time():
dataset = pd.read_csv('./全部数据/附件1-区域15分钟负荷数据.csv')
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
plt.plot(dataset['总有功功率(kw)'], label='xx')
plt.ylabel('用电负荷/KW', fontsize=18)
plt.xlabel('时间', fontsize=18)
plt.show()
def data_bar():
dataset = pd.read_csv('./全部数据/附件2-行业日负荷数据.csv')
dataset['数据时间'] = pd.to_datetime(dataset['数据时间'], format="%Y-%m-%d %H:%M:%S")
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
x = np.array(['大工业用电', '非普工业', '普通工业', '商业'])
data1 = dataset['数据时间'].loc[dataset['行业类型'] == "大工业用电"]
data2 = dataset['数据时间'].loc[dataset['行业类型'] == "非普工业"]
data3 = dataset['数据时间'].loc[dataset['行业类型'] == "普通工业"]
data4 = dataset['数据时间'].loc[dataset['行业类型'] == "商业"]
def hot_data():
dataset = pd.read_csv('./全部数据/附件1-区域15分钟负荷数据.csv')
dataset['数据时间'] = pd.to_datetime(dataset['数据时间'], format="%Y-%m-%d %H:%M:%S")
dataset = dataset[0:35040]
dataset.set_index('数据时间', inplace=True)
data = dataset.resample('d').sum()
k = 0
m = 0
a = np.ones((12, 30))
for i in range(360):
if k < 30:
a[m][k] = data['总有功功率(kw)'][i]
k = k + 1
else:
k = 0
m = m + 1
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
fig, ax = plt.subplots(figsize=(9, 9))
sns.heatmap(a)
labels = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
y = [0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5, 10.5, 11.5]
plt.yticks(y, labels=labels)
x = []
x_labels = []
for i in range(30):
x.append(i + 0.5)
x_labels.append(i + 1)
plt.xticks(x, labels=x_labels)
ax.set_title('2018年每日负荷热力图', fontsize=18)
ax.set_ylabel('T/month', fontsize=18)
ax.set_xlabel('T/day', fontsize=18)
plt.show()
print(a)
if __name__ == '__main__':
hot_data()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score, mean_absolute_error, accuracy_score, mean_squared_error
import tensorflow as tf
from tensorflow.keras import Sequential, layers, utils
import warnings
def create_new_dataset(dataset, seq_len=12):
'''
基于原始数据集构造新的序列特征数据集
Params:
dataset : 原始数据集
seq_len : 序列长度(时间跨度)
Returns:
X, y
'''
X = []
y = []
scaler = MinMaxScaler()
scaler.fit(dataset)
dataset = scaler.transform(dataset)
val_data_len = len(dataset)//10
validata = dataset[len(dataset) - val_data_len - seq_len-1:]
traindata = dataset[:len(dataset) - val_data_len]
start = 0
end = traindata.shape[0] - 2*seq_len
for i in range(start, end):
sample = traindata[i: i + seq_len]
label = traindata[i + seq_len: i+2*seq_len]
X.append(sample)
y.append(label)
return np.array(X), np.reshape(np.array(y),(len(y), SEQ_LEN)), np.array(validata)
def split_dataset(X, y, train_ratio=0.8):
'''基于X和y,切分为train和test
Params:
X : 特征数据集
y : 标签数据集
train_ratio : 训练集占X的比例
Returns:
X_train, X_test, y_train, y_test
'''
X_len = len(X)
train_data_len = int(X_len * train_ratio)
X_train = X[:train_data_len]
y_train = y[:train_data_len]
X_test = X[train_data_len:]
y_test = y[train_data_len:]
return X_train, X_test, y_train, y_test
def create_batch_data(X, y, batch_size=32, data_type=1):
'''基于训练集和测试集,创建批数据
Params:
X : 特征数据集
y : 标签数据集
batch_size : batch的大小,即一个数据块里面有几个样本
data_type : 数据集类型(测试集表示1,训练集表示2)
Returns:
train_batch_data 或 test_batch_data
'''
if data_type == 1:
dataset = tf.data.Dataset.from_tensor_slices((tf.constant(X), tf.constant(y)))
test_batch_data = dataset.batch(batch_size)
return test_batch_data
else:
dataset = tf.data.Dataset.from_tensor_slices((tf.constant(X), tf.constant(y)))
train_batch_data = dataset.cache().shuffle(1000).batch(batch_size)
return train_batch_data
def build_model():
model = Sequential([
layers.LSTM(32, input_shape=(SEQ_LEN, 1), return_sequences=True),
layers.Dropout(0.4),
layers.LSTM(units=32, return_sequences=False),
layers.Dense(SEQ_LEN)
])
model.compile(optimizer='adam', loss="mse")
return model
def train_model(model, train_batch_dataset, test_batch_dataset):
file_path = './best_checkpoint.hdf5'
history = model.fit(train_batch_dataset,
epochs=10,
validation_data=test_batch_dataset
)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_pred, y_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test[:,0], y_pred[:,0])
print("mes", mse)
print("MAE", mae)
print("r2_score", r2)
plt.figure(figsize=(16, 8))
plt.plot(y_test[0,:], label='Ture label')
plt.plot(y_pred[0,:], label='Pred label')
plt.title("True vs Pred")
plt.legend(loc='best')
plt.show()
return
def predict_next(model, sample, epoch=36):
temp1 = model.predict(sample.reshape(1, SEQ_LEN, 1))
return temp1.reshape(SEQ_LEN,1)
if __name__ == '__main__':
warnings.filterwarnings('ignore')
dataset = pd.read_csv('./全部数据/附件1-区域15分钟负荷数据.csv')
dataset['数据时间'] = pd.to_datetime(dataset['数据时间'], format="%Y-%m-%d %H:%M:%S")
dataset.index = dataset['数据时间']
dataset.drop(columns=['数据时间'], axis=1, inplace=True)
dataset.columns = ['DOM_kW']
SEQ_LEN = 36
X, y, valid_data = create_new_dataset(dataset.values, seq_len=SEQ_LEN)
X_train, X_test, y_train, y_test = split_dataset(X, y)
train_batch_dataset = create_batch_data(X_train, y_train, data_type=0)
test_batch_dataset = create_batch_data(X_test, y_test, data_type=1)
train_model(build_model(), train_batch_dataset, test_batch_dataset)
model = build_model()
model.load_weights(r'./best_checkpoint.hdf5')
pred = predict_next(model, sample=X_test[-1], epoch=1)
plt.figure(figsize=(12,6))
plt.plot(pred, color='red', label='Prediction')
plt.plot(valid_data[:36], color='blue', label='Truth')
plt.xlabel("Epochs")
plt.ylabel('Value')
plt.legend(loc='best')
plt.show()
0 2022/4/26
1 数据挖掘
def pearson_data():
dataset = pd.read_csv('./全部数据/附件1-区域15分钟负荷数据_handle_weather_holiday.csv')
index = ['month', 'day', 'hour', 'minute', 'dayofweek', 'dayofyear', '最高温度', '最低温度', '白天最低风力', '白天最高风力',
'夜晚最低风力', '夜晚最高风力', '最好天气', '最坏天气', 'holiday']
index_labels = ['month', 'day', 'hour', 'minute', 'dofwe', 'dofye', 'maxte', 'minte', 'minwd', 'maxwd',
'minwn', 'maxwn', 'bewe', 'bawe', 'holiday']
r_data = np.ones((15, 15))
for i in range(len(index)):
for j in range(len(index)):
r = pearsonr(dataset[index[i]], dataset[index[j]])
r_data[i][j] = r[0]
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(r_data, cmap='YlGnBu')
x = [0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5, 10.5, 11.5, 12.5, 13.5, 14.5]
plt.xticks(x, labels=index_labels)
plt.yticks(x, labels=index_labels)
ax.set_title('特征相关系数', fontsize=18)
plt.show()
def vmd_data():
dataset = pd.read_csv('./全部数据/附件1-区域15分钟负荷数据_handle_weather_holiday.csv', usecols=[0])
dataset1 = pd.read_csv('./全部数据/VMDban-1.csv')
dataset2 = pd.read_csv('./全部数据/VMDban-2.csv')
dataset3 = pd.read_csv('./全部数据/VMDban-3.csv')
dataset4 = pd.read_csv('./全部数据/VMDban-4.csv')
dataset5 = pd.read_csv('./全部数据/VMDban-5.csv')
dataset6 = pd.read_csv('./全部数据/VMDban-6.csv')
dataset7 = dataset['总有功功率(kw)'] - dataset1['v1'] - dataset2['v2'] - dataset3['v3'] - dataset4['v4'] - dataset5['v5'] - dataset6['v6']
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
plt.subplot(7, 1, 1)
plt.plot(dataset1[0:6000], label='u1')
plt.legend(loc='upper right')
plt.ylabel('KW', fontsize=12)
plt.subplot(7, 1, 2)
plt.plot(dataset2[0:6000], label='u2')
plt.legend(loc='upper right')
plt.ylabel('KW', fontsize=12)
plt.subplot(7, 1, 3)
plt.plot(dataset3[0:6000], label='u3')
plt.legend(loc='upper right')
plt.ylabel('KW', fontsize=12)
plt.subplot(7, 1, 4)
plt.plot(dataset4[0:6000], label='u4')
plt.legend(loc='upper right')
plt.ylabel('KW', fontsize=12)
plt.subplot(7, 1, 5)
plt.plot(dataset5[0:6000], label='u5')
plt.legend(loc='upper right')
plt.ylabel('KW', fontsize=12)
plt.subplot(7, 1, 6)
plt.plot(dataset6[0:6000], label='u6')
plt.legend(loc='upper right')
plt.ylabel('KW', fontsize=12)
plt.subplot(7, 1, 7)
plt.plot(dataset7[0:6000], label='残差')
plt.legend(loc='upper right')
plt.ylabel('KW', fontsize=12)
plt.show()
def data_step_per(step):
dataset = pd.read_csv('./全部数据/附件1-区域15分钟负荷数据_handle_weather_holiday.csv')
dataset['数据时间'] = pd.to_datetime(dataset['数据时间'], format="%Y-%m-%d %H:%M:%S")
dataset.set_index('数据时间', inplace=True)
data = dataset.resample('d').sum()
data_person = np.ones((step, step))
for i in range(step):
dataset1 = data[0 + i:1310 + i]
for j in range(step):
dataset2 = data[0 + j:1310 + j]
r = pearsonr(dataset1['总有功功率(kw)'], dataset2['总有功功率(kw)'])
data_person[i][j] = r[0]
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(data_person, cmap='YlGnBu',square=True, annot=True)
plt.show()
def season_data():
dataset = pd.read_csv('./全部数据/附件1-区域15分钟负荷数据_handle_weather_holiday.csv')
decomposition = sm.tsa.seasonal_decompose(dataset['总有功功率(kw)'].values, model='addictive', period=96 * 120)
season = decomposition.seasonal
plt.plot(season)
plt.show()
0 2022/4/27
1 数据挖掘
def data_step_per(step):
dataset = pd.read_csv('./全部数据/附件1-区域15分钟负荷数据_handle_weather_holiday.csv')
dataset['数据时间'] = pd.to_datetime(dataset['数据时间'], format="%Y-%m-%d %H:%M:%S")
dataset.set_index('数据时间', inplace=True)
data = dataset.resample('d').sum()
data_person = np.ones((step, step))
for i in range(step):
dataset1 = data[0 + i:1310 + i]
for j in range(step):
dataset2 = data[0 + j:1310 + j]
r = pearsonr(dataset1['总有功功率(kw)'], dataset2['总有功功率(kw)'])
data_person[i][j] = r[0]
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(data_person, cmap='YlGnBu',square=True, annot=True)
plt.show()
def season_data():
dataset = pd.read_csv('./全部数据/附件1-区域15分钟负荷数据_handle_weather_holiday.csv')
decomposition = sm.tsa.seasonal_decompose(dataset['总有功功率(kw)'].values, model='addictive', period=96 * 120)
season = decomposition.seasonal
plt.plot(season)
plt.show()
def seasonal(df):
mean_load = df["总有功功率(kw)"].mean()
dataset = df.copy()
df.drop(range(75745, 75745 + 96), axis=0, inplace=True)
res = []
df["year"] = df["数据时间"].dt.year
year_group = df.groupby("year")
group_2018 = year_group.get_group(2018)
group_2019 = year_group.get_group(2019)
group_2020 = year_group.get_group(2020)
group_2021 = year_group.get_group(2021)
for i in range(35040):
if i <= 23327:
a = (group_2018.iloc[i][0] +
group_2019.iloc[i][0] +
group_2020.iloc[i][0] +
group_2021.iloc[i][0]) / 4
else:
a = (group_2018.iloc[i][0] + group_2019.iloc[i][0] + group_2020.iloc[i][0]) / 3
res.append(a / mean_load)
res_2020 = res.copy()
for i in range(96):
res_2020.insert(0, 5664)
res = res + res + res_2020 + res[0:len(group_2021)]
dataset["总有功功率(kw)"] /= res
return dataset
def delete_personal():
df = pd.read_csv('./全部数据/附件1-区域15分钟负荷数据_handle_weather_holiday.csv')
df['数据时间'] = pd.to_datetime(df['数据时间'], format="%Y-%m-%d %H:%M:%S")
df2 = df.copy()
mean_load = df["总有功功率(kw)"].mean()
df2.loc[:, '比例'] = df['month']
a = []
for index, row in df2.iterrows():
data = df.loc[(df['month'] == row['month']) & (df['day'] == row['day']) & (df['minute'] == row['minute']) & (
df['hour'] == row['hour'])]
x = data['总有功功率(kw)'].sum() / data.shape[0]
a.append(x / mean_load)
df2['总有功功率(kw)'][index] = df['总有功功率(kw)'][index] / (x / mean_load)
df2['比例'][index] = x / mean_load
df2.to_csv("./全部数据/附件1-区域15分钟负荷数据_handle_weather_holiday_personal_2.csv", index=None)
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
plt.plot(df2['总有功功率(kw)'], label='xx')
plt.ylabel('用电负荷/KW', fontsize=18)
plt.xlabel('时间/年/月', fontsize=18)
labels = [0, 50000, 100000, 150000, 200000, 250000, 300000, 350000]
y = [0, 50000, 100000, 150000, 200000, 250000, 300000, 350000]
plt.yticks(y, labels=labels)
x_labels = ['2018/1', '2018/6', '2019/1', '2019/6', '2020/1', '2020/6', '2021/1']
x = [0, 20000, 40000, 58000, 74000, 90000, 109000]
plt.xticks(x, labels=x_labels)
plt.show()
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
plt.rcParams['font.family'] = ['MicroSoft YaHei']
plt.rcParams['axes.unicode_minus'] = False
df = pd.read_csv('./全部数据/附件2-行业日负荷数据.csv')
df = df.loc[df['行业类型'] == '普通工业']
x = df['数据时间']
y = df['有功功率最小值(kw)']
n = len(y)
Sk = np.zeros(n)
UFk = np.zeros(n)
s = 0
for i in range(1, n):
for j in range(0, i):
if y.iloc[i] > y.iloc[j]:
s += 1
Sk[i] = s
E = (i+1)*(i/4)
Var = (i+1)*i*(2*(i+1)+5)/72
UFk[i] = (Sk[i] - E)/np.sqrt(Var)
y2 = np.zeros(n)
Sk2 = np.zeros(n)
UBk = np.zeros(n)
s = 0
y2 = y[::-1]
for i in range(1, n):
for j in range(0, i):
if y2.iloc[i] > y2.iloc[j]:
s += 1
Sk2[i] = s
E = (i+1)*(i/4)
Var = (i+1)*i*(2*(i+1)+5)/72
UBk[i] = -(Sk2[i] - E)/np.sqrt(Var)
UBk2 = UBk[::-1]
plt.figure(figsize=(18, 18))
plt.plot(range(973), UFk, label='UF')
plt.plot(range(973), UBk2, label='UB')
plt.ylabel('Mann-Kendall检验值')
plt.xlabel('年份 Year')
x_lim = plt.xlim()
plt.plot(x_lim,[-1.96,-1.96],':',color='black',label='5%显著水平')
plt.plot(x_lim, [0,0],'--',color='black')
plt.plot(x_lim,[1.96,1.96],':',color='black')
plt.legend(bbox_to_anchor=(0.75, 0.07), facecolor='w',frameon=False)
plt.text(0, -1.6, '突变点检验')
plt.savefig("./图片/MK检验.png")
plt.show()