import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit, cross_validate, learning_curve, KFold
from sklearn.metrics import confusion_matrix
from sklearn.neural_network import MLPClassifier
import lightgbm as lgb
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
'''
TODO list:
1. Feed word-vectorized text features + ordinary features + features built by DeepFM/FM/FFM into a (fully connected) neural network; understand the relationship between neural networks and features.
2. Implement a simple GAN, then implement and evaluate a semi-supervised GAN.
3. Understand neural networks.
4. Saving and loading neural network models.
5. Open each record with `with open` and process it; pandas' chunksize (see the sketch after this list).
6. Converting between pyspark's RDD format and DataFrame.
7. Transfer learning, reinforcement learning, deep spatio-temporal neural networks.
8. Calling Python packages on each distributed node: turn the RDD read by Spark into a DataFrame, process it with Python tools, wrap the output back into an RDD, aggregate, and write to Hive.
9. GBDT-generated features + neural network embedding + FM + LSTM + FC layers (BN, early stopping) + a linear model (LR).
'''
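# A minimal sketch for TODO item 5, assuming a large CSV at the hypothetical
# path 'big_file.csv': stream records either line by line with `with open`,
# or in DataFrame chunks via pandas' chunksize.
def stream_records(path='big_file.csv', chunksize=10000):
    # Line by line: constant memory, but each record is a raw string.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            pass  # parse/clean each record here
    # Chunked DataFrames: pandas handles parsing, one DataFrame per chunk.
    for chunk in pd.read_csv(path, chunksize=chunksize, encoding='utf-8'):
        print(chunk.shape)  # process each chunk here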
if __name__ == '__main__':
data = pd.read_csv('model_sample.csv',encoding='UTF-8')
data = data.fillna(0)
print(data.sample(3))
#data['word'].apply(lambda x:set(x))
#print(data['word'].apply(lambda x:list(set(x))))
from sklearn import preprocessing
change = preprocessing.LabelEncoder()
data['new_word'] = change.fit_transform(data['word'])
#print([x for x in change.inverse_transform(data['new_word'])][:2])
    '''Train a doc2bow bag-of-words model of term frequencies, then fit TF-IDF on top to weight each term's importance'''
from gensim.corpora import Dictionary
from gensim import corpora,models
segmented = [[x] for x in data['word'].unique()]
word_dicts = corpora.Dictionary(segmented)
print(word_dicts)
    corpus = [word_dicts.doc2bow(text) for text in segmented]
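    # A minimal sketch of the TF-IDF step described above, using gensim's
    # TfidfModel on the doc2bow corpus; each document becomes a list of
    # (token_id, tfidf_weight) pairs.
    tfidf = models.TfidfModel(corpus)
    print(tfidf[corpus[0]])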
    '''Logging setup'''
import logging
import os.path
import sys
program = os.path.basename(sys.argv[0])
logger = logging.getLogger(program)
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
logging.root.setLevel(level=logging.INFO)
logger.info("running %s" % ' '.join(sys.argv))
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import gensim
s=[]
for i in range(len(segmented)):
s.append(" ".join(map(str,segmented[i])))
    pd.DataFrame(s).to_csv('segmented.csv', index=False, header=False, encoding='utf-8')  # no header row: LineSentence treats every line as a sentence
word_sentence = LineSentence('segmented.csv')
    '''word2vec (gensim 3.x parameter names; in gensim 4.x, size/iter became vector_size/epochs)'''
    word_model_word2vec = gensim.models.Word2Vec(word_sentence,
                                                 hs=1, size=20, window=5, min_count=1, iter=10)
word_model_word2vec.save('segmented_word2vec')
for i in word_model_word2vec.wv.index2word:
print(word_model_word2vec.wv[i])
'''fasttext'''
word_model_fasttext = gensim.models.FastText(word_sentence,word_ngrams=2
, hs=1, size=20, window=5, min_count=1, iter=10)
word_model_fasttext.save('segmented_fasttext')
for i in word_model_fasttext.wv.index2word:
print(word_model_fasttext.wv[i])
with open('segmented.csv','r',encoding='utf-8') as f:
for line in f :
print(line)
if __name__ == '__main__':
from sklearn.preprocessing import LabelEncoder
data = data.drop(['word'],axis=1)
print(data.sample(3))
    '''Feed all of the features into a fully connected neural network'''
import tensorflow as tf
import keras
from keras.layers import Layer, Dense, Dropout, Input
from keras import Model, activations
from keras.optimizers import Adam
import keras.backend as K
class FM(Layer):
def __init__(self, output_dim=30, activation="relu", **kwargs):
self.output_dim = output_dim
self.activate = activations.get(activation)
super(FM, self).__init__(**kwargs)
def build(self, input_shape):
            self.weight = self.add_weight(name='weight',
                                          shape=(input_shape[1], self.output_dim),
                                          initializer='glorot_uniform',
                                          trainable=True)
self.bias = self.add_weight(name='bias',
shape=(self.output_dim,),
initializer='zeros',
trainable=True)
self.kernel = self.add_weight(name='kernel',
shape=(input_shape[1], self.output_dim),
initializer='glorot_uniform',
trainable=True)
super(FM, self).build(input_shape)
        def call(self, x):
            # Linear term: xW + b.
            feature = K.dot(x, self.weight) + self.bias
            # Second-order FM term: 0.5 * ((xV)^2 - (x^2)(V^2)), reduced over the
            # factor dimension (mean here, a scaled variant of the usual sum).
            a = K.pow(K.dot(x, self.kernel), 2)
            b = K.dot(K.pow(x, 2), K.pow(self.kernel, 2))
            cross = K.mean(a - b, 1, keepdims=True) * 0.5
            cross = K.repeat_elements(K.reshape(cross, (-1, 1)), self.output_dim, axis=-1)
            return self.activate(feature + cross)
def compute_output_shape(self, input_shape):
return (input_shape[0], self.output_dim)
    data = pd.read_csv('model_sample.csv', encoding='UTF-8')
    data = data.fillna(0)
data_x = data.drop(['y','word','user_id'],axis=1)
data_y = data['y']
#print(data.shape)
K.clear_session()
    inputs = Input(shape=(199,))
    # Chain the layers: FM -> Dense -> BN -> Dense -> BN -> softmax head.
    out = FM(50)(inputs)
    out = Dense(100, activation='relu')(out)
    out = keras.layers.BatchNormalization()(out)
    out = Dense(200, activation='relu')(out)
    out = keras.layers.BatchNormalization()(out)
    out = Dense(3, activation='softmax')(out)
model = Model(inputs=inputs,outputs=out)
model.compile(loss='categorical_crossentropy',optimizer=Adam(0.00001),metrics=['accuracy'])
model.summary()
data_y = keras.utils.to_categorical([[x] for x in data_y],num_classes=3)
#print([[x] for x in data_y])
#print(np.random.randint(10,size=(20,1)))
#model.fit(data_x,data_y,batch_size=32,epochs=3,validation_split=0.1)
#model.evaluate(data_x,data_y,batch_size=1)
if __name__ == '__main__':
    '''Build an Embedding layer, save it, and extract its output
    keras.preprocessing.text.one_hot
    keras.preprocessing.sequence.pad_sequences
    model.add(Dense(..., name='need_train'))'''
from keras import preprocessing
encoded_size=10
encoded_data_word = [keras.preprocessing.text.one_hot(d,encoded_size) for d in data['word']]
#print(encoded_data_word)
max_length=4
padded_data_word = keras.preprocessing.sequence.pad_sequences(encoded_data_word,max_length,padding='post')
#print(padded_data_word)
model = keras.models.Sequential()
model.add(keras.layers.Embedding(input_dim=encoded_size,output_dim=8,input_length=max_length,name='embedding_layer'))
model.add(keras.layers.Flatten(name='Flatten_name'))
    model.add(Dense(3, activation='softmax'))  # 3-class output to match categorical_crossentropy
model.compile(loss='categorical_crossentropy',optimizer=Adam(),metrics=['accuracy'])
    data = np.random.randint(encoded_size, size=(10000, 4))  # integer token indices for the Embedding layer
labels = np.random.randint(3,size=(10000,1))
#print(labels)
labels = keras.utils.to_categorical(labels,num_classes=3)
model.fit(data,labels,epochs=2,batch_size=16)
get_embedding_layer = Model(inputs=model.input,
outputs = model.get_layer('embedding_layer').output)
print(data)
print(get_embedding_layer.predict(data))
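    # A minimal sketch, per TODO item 4 and the docstring above, of saving the
    # trained model and reloading it to pull out the embedding table;
    # 'embedding_model.h5' is a hypothetical filename (all layers here are
    # built-in, so no custom_objects are needed on load).
    model.save('embedding_model.h5')
    restored = keras.models.load_model('embedding_model.h5')
    emb_weights = restored.get_layer('embedding_layer').get_weights()[0]
    print(emb_weights.shape)  # (encoded_size, 8)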
if __name__ == '__main__':
    '''Capsule networks: https://www.jianshu.com/p/271d5f1f0e25'''
    '''The capsule network is a much-discussed architecture proposed by Geoffrey Hinton in 2017.
    Its main trait is "vector in, vector out": a capsule consumes vectors and emits vectors,
    whereas an ordinary neuron is "vector in, scalar out".
    The vector a capsule outputs can express richer features than a neuron's scalar output.'''
import numpy as np
x = np.random.random((10000,200))
flag = np.random.randint(2,size=(10000,1))
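    # A minimal numpy sketch (not from the linked article) of the capsule
    # "squash" nonlinearity, the vector-in/vector-out analogue of a scalar
    # activation: it rescales each capsule vector's length into [0, 1)
    # while keeping its direction.
    def squash(vectors, axis=-1, eps=1e-8):
        sq_norm = np.sum(np.square(vectors), axis=axis, keepdims=True)
        scale = sq_norm / (1.0 + sq_norm) / np.sqrt(sq_norm + eps)
        return scale * vectors
    # e.g. view the 200 features as 25 capsules of dimension 8:
    capsules = squash(x.reshape(-1, 25, 8))
    print(capsules.shape)  # (10000, 25, 8)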
if __name__ == '__main__':
    '''Intro to generative adversarial networks: https://www.jianshu.com/p/dd3565c8ffd2'''
    '''https://github.com/raghav64/SemiSuper_GAN/blob/master/SSGAN.py'''
    '''Applying a GAN to non-image data'''
from keras import Sequential
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.preprocessing import *
from sklearn.model_selection import train_test_split
from keras.datasets import mnist
from keras.layers import Input, Dense, Reshape, Flatten, Dropout, multiply, GaussianNoise
from keras.layers import BatchNormalization, Activation, Embedding, ZeroPadding2D
from keras.layers.advanced_activations import LeakyReLU
from keras.layers.convolutional import UpSampling2D, Conv2D
from keras.models import Sequential, Model
from keras.optimizers import Adam
from keras import losses
from keras.utils import to_categorical
import keras.backend as K
from keras import initializers
import matplotlib.pyplot as plt
data = pd.read_csv('model_sample.csv',encoding='UTF-8')
data = data.fillna(0)
X = data.drop(['user_id','y','word'],axis=1)
Y = data['y']
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=20)
#print(x_train.shape)
    def build_generator():
        model = Sequential()
        # Dense layers are linear here so the following LeakyReLU activations take effect.
        model.add(Dense(40, input_dim=x_train.shape[1]))
        model.add(LeakyReLU(0.2))
        model.add(Dense(20))
        model.add(LeakyReLU(0.2))
        model.add(Dense(20))
        model.add(LeakyReLU(0.2))
        model.add(Dense(x_train.shape[1], activation='tanh'))
# opt = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.2)
# model.compile(optimizer=opt,loss='categorical_crossentropy')
#model.summary()
noise = Input(shape=(x_train.shape[1],))
img = model(noise)
return Model(noise, img)
    def build_discriminator():
        model = Sequential()
        # Dense layers are linear here so the following LeakyReLU activations take effect.
        model.add(Dense(50, input_dim=x_train.shape[1]))
        model.add(LeakyReLU(0.2))
        #model.add(Dropout(dropout))
        model.add(FM(200, activation='linear'))  # add a factorization machine layer
        model.add(LeakyReLU(0.2))
        model.add(Dense(200))
        model.add(LeakyReLU(0.2))
        #model.add(Dropout(dropout))
        model.add(Dense(300))
        model.add(LeakyReLU(0.2))
        #model.add(Dropout(dropout))
        model.add(Dense(3, activation='softmax'))
# opt = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.2)
# model.compile(optimizer=opt,loss='categorical_crossentropy',metrics=['accuracy'])
#model.summary()
img = Input(shape=(x_train.shape[1],))
features = model(img)
        valid = Dense(1, activation="sigmoid")(features)  # real/fake head
label = Dense(len(set(y_train)) + 1, activation="softmax")(features)
#print(valid,label)
return Model(img, [valid, label])
# model.add(Flatten())
# opt = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.2)
# model.compile(optimizer=opt,loss='categorical_crossentropy',metrics=['accuracy'])
    noise = Input(shape=(x_train.shape[1],))
    generator = build_generator()
    discriminator = build_discriminator()
    # Compile the discriminator on its own before freezing it in the combined
    # model: binary_crossentropy for the real/fake head, categorical_crossentropy
    # for the (K+1)-class label head.
    discriminator.compile(optimizer=Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, decay=0.2),
                          loss=['binary_crossentropy', 'categorical_crossentropy'],
                          metrics=['accuracy'])
    img = generator(noise)
    print(img)
    discriminator.trainable = False
    valid, _ = discriminator(img)
    combined = Model(noise, valid)
    combined.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
    combined.summary()
epochs = 6
batch_size = 12
half_batch = batch_size // 2
cw1 = {0: 1, 1: 1}
cw2 = {i: len(set(y_train)) / half_batch for i in range(len(set(y_train)))}
cw2[len(set(y_train))] = 1 / half_batch
valid = np.ones((batch_size, 1))
fake = np.zeros((batch_size, 1))
for epoch in range(epochs):
idx = np.random.randint(0, x_train.shape[0], batch_size)
#print(idx)
#print(x_train.iloc[idx,:])
imgs = x_train.iloc[idx,:]
        noise = np.random.normal(0, 1, (batch_size, x_train.shape[1]))
        gen_imgs = generator.predict(noise)
        # One-hot labels; class index len(set(y_train)) marks generated ("fake") samples.
        labels = to_categorical(y_train.iloc[idx], num_classes=len(set(y_train)) + 1)
        fake_labels = to_categorical(np.full((batch_size, 1), len(set(y_train))), num_classes=len(set(y_train)) + 1)
        # Train the discriminator on real and generated batches.
        d_loss_real = discriminator.train_on_batch(imgs, [valid, labels], class_weight=[cw1, cw2])
        d_loss_fake = discriminator.train_on_batch(gen_imgs, [fake, fake_labels], class_weight=[cw1, cw2])
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)
        # Train the generator through the frozen discriminator.
        g_loss = combined.train_on_batch(noise, valid, class_weight=[cw1, cw2])
#print(combined.evaluate(noise, valid, batch_size=batch_size))
#print(combined.metrics_names)
#print(epoch,d_loss,g_loss)
#print(epoch,d_loss[0],100*d_loss[4],g_loss[0],100*g_loss[1])
#print("%d [D loss: %f, acc: %.2f%%, op_acc: %.2f%%] [G loss: %f]" % (epoch, d_loss[0], 100 * d_loss[3], 100 * d_loss[4], g_loss))
#print(combined.predict(x_test))
#print(generator.predict(x_test))
#print(discriminator.predict(x_test)[0])
#print(discriminator.predict(x_test)[1])
#print(discriminator.predict(x_train)[1])
    #semi_predict = discriminator.predict(x_test)[1]
#print(np.array(y_train))
#for i in range(len(discriminator.predict(x_test)[1])):
#print(i,np.array(y_test)[i],discriminator.predict(x_test)[1][i])
if __name__ == '__main__':
    '''Extract one layer from a trained network and use its activations as features'''
x = np.random.random((20000,150))
y = np.random.randint(5,size=(20000,1))
import keras.backend as K
K.clear_session()
    inputs = Input(shape=(150,))
    # Chain the layers: FM -> Dense -> BN -> Dense -> BN -> softmax head.
    out = FM(100, name='fm_layer')(inputs)
    out = Dense(100, activation='relu', name='layer_1')(out)
    out = keras.layers.BatchNormalization()(out)
    out = Dense(200, activation='relu')(out)
    out = keras.layers.BatchNormalization()(out)
    out = Dense(5, activation='softmax')(out)
model = Model(inputs=inputs,outputs=out)
model.compile(optimizer=Adam(0.00001),loss='categorical_crossentropy',metrics=['accuracy'])
model.summary()
y = keras.utils.to_categorical(y,num_classes=5)
#model.fit(x,y,batch_size=12,epochs=1)
#print(model.predict(x))
    model = Sequential()
    #model.add(FM(10, name='fm_layer', input_shape=(150,)))
    model.add(Dense(100, activation='relu', name='layer_1', input_dim=150))
    model.add(Dense(5, activation='softmax'))
    model.compile(optimizer=Adam(0.0001), loss='categorical_crossentropy', metrics=['accuracy'])
    model.fit(x, y, batch_size=12, epochs=1)
    # Model that exposes the hidden layer's activations as features.
    get_hidden_layer = Model(inputs=model.input,
                             outputs=model.get_layer('layer_1').output)
    print(x)
    print(get_hidden_layer.predict(x))
if __name__ == '__main__':
    '''GBDT-generated features'''
'''https://scikit-learn.org/stable/auto_examples/ensemble/plot_feature_transformation.html#example-ensemble-plot-feature-transformation-py'''
'''https://blog.youkuaiyun.com/shine19930820/article/details/71713680#generate-features-for-ffm'''
x = np.random.random((20000,10))
y = np.random.randint(3,size=(20000,1))
#print(y.ravel())
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder
gbdt = GradientBoostingClassifier(max_depth=3,min_samples_leaf=10,n_estimators=100,learning_rate=0.2,random_state=2)
gbdt.fit(x,y.ravel())
gbdt_enc = OneHotEncoder()
#print(x)
print(gbdt.apply(x)[:,:,0])
print(np.array(gbdt.apply(x)[:,:,0]).shape)
    gbdt_enc.fit(gbdt.apply(x)[:, :, 0])
    from sklearn.linear_model import LogisticRegression
    lr = LogisticRegression()
    # Train LR on the one-hot-encoded leaf indices, as in the sklearn example.
    leaf_features = gbdt_enc.transform(gbdt.apply(x)[:, :, 0])
    lr.fit(leaf_features, y.ravel())
    lr_predict = lr.predict(leaf_features)
    from sklearn.metrics import accuracy_score
    print(accuracy_score(y.ravel(), lr_predict))
    from keras.layers import Input, Dense, LSTM, Embedding
    from keras.models import Model
    from keras.optimizers import Adam
    from keras.utils import to_categorical
    inputs = Input(shape=(100,))  # one leaf index per tree (n_estimators=100)
    # Embed the leaf indices; input_dim=100 is a loose upper bound on the
    # node ids produced by depth-3 trees.
    keras_model = Embedding(input_dim=100, output_dim=200)(inputs)
    keras_model = LSTM(48, activation='relu')(keras_model)
    keras_model = Dense(48, activation='relu')(keras_model)
    out_model = Dense(3, activation='softmax')(keras_model)
model = Model(inputs=inputs,outputs=out_model)
model.summary()
model.compile(loss='categorical_crossentropy',optimizer=Adam(0.00001),metrics=['accuracy'])
keras_y = to_categorical(y)
model.fit(np.array(gbdt.apply(x)[:,:,0]),keras_y,batch_size=12,epochs=20)