Sentiment Analysis of Airline Tweets with TensorFlow 2.3
Import the packages (os is needed below to select the GPU)
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
Select which GPU is visible and check the TensorFlow version
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
tf.__version__
- '2.3.0'
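Note that CUDA_VISIBLE_DEVICES only picks which GPU is visible; the on-demand GPU memory allocation mentioned above is a separate setting. A minimal sketch using the TF 2.x memory-growth API:
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)  # allocate GPU memory as needed instead of grabbing it all up front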
Load the data
data = pd.read_csv('./dataset/Tweets.csv')
len(data)
- 14640  (the dataset contains 14,640 tweets)
Number of positive reviews
sum(data.airline_sentiment=='positive')
- 2363
Dataset info
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype
---  ------                        --------------  -----
 0   tweet_id                      14640 non-null  int64
 1   airline_sentiment             14640 non-null  object
 2   airline_sentiment_confidence  14640 non-null  float64
 3   negativereason                9178 non-null   object
 4   negativereason_confidence     10522 non-null  float64
 5   airline                       14640 non-null  object
 6   airline_sentiment_gold        40 non-null     object
 7   name                          14640 non-null  object
 8   negativereason_gold           32 non-null     object
 9   retweet_count                 14640 non-null  int64
 10  text                          14640 non-null  object
 11  tweet_coord                   1019 non-null   object
 12  tweet_created                 14640 non-null  object
 13  tweet_location                9907 non-null   object
 14  user_timezone                 9820 non-null   object
dtypes: float64(2), int64(2), object(11)
memory usage: 1.7+ MB
Keep only the columns needed for the analysis
data = data[['airline_sentiment', 'text']]
Class labels and counts in the airline_sentiment column (positive, negative, neutral)
data.airline_sentiment.unique()
data.airline_sentiment.value_counts()
array(['neutral', 'positive', 'negative'], dtype=object)
negative 9178
neutral 3099
positive 2363
Name: airline_sentiment, dtype: int64
Split out the positive and negative reviews; for a balanced dataset, truncate the negatives to the same number as the positives
data_p = data[data['airline_sentiment']=='positive']
data_n = data[data['airline_sentiment']=='negative']
data_n = data_n.iloc[:len(data_p)]
len(data_n), len(data_p)
- (2363, 2363)
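Slicing with iloc keeps whatever negatives happen to come first in the file. An alternative (not what the code above does) is a random undersample, which avoids any ordering bias:
data_n = data[data['airline_sentiment']=='negative'].sample(n=len(data_p), random_state=42)  # random undersample instead of taking the head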
Build a new DataFrame: stack the two subsets with pandas concat, shuffle, and add a 0/1 label column
data = pd.concat([data_n, data_p])
data = data.sample(len(data))  # shuffle the rows
data['review'] = (data.airline_sentiment =='positive').astype('int')
data
airline_sentiment text review
4668 positive @SouthwestAir I’ll stick with flying for free … 1
3304 negative @united worst airline in the world-8 delays, g… 0
8735 positive @JetBlue Thanks for having us hang out at Tamp… 1
1075 negative @united 2 Cancelled Flighted flights Late Flig… 0
437 positive @virginamerica awesome deals DAL-AUS for only … 1
… … … …
2095 negative @united worst customer service experience ever… 0
1478 negative @united that’s not good enough for all of us a… 0
1486 positive @united I was sincerely thanking the pilot of … 1
13013 positive @AmericanAir Well, you guys are totally kickin… 1
3645 negative @united i have talked to them…on standby for… 0
4726 rows × 3 columns
Only data['review'] is needed as the label from here on, so drop the original sentiment column
del data['airline_sentiment']
Define a function that strips special characters with a regular expression
import re
token = re.compile('[A-Za-z]+|[!?,.()]')  # keep only letters A-Z a-z and the punctuation ! ? , . ( )
def reg_text(text):
    new_text = token.findall(text)  # extract the tokens matched by the pattern
    new_text = [word.lower() for word in new_text]  # lowercase everything
    return new_text
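For example, on a made-up tweet (digits are dropped, since the pattern only keeps letters and the listed punctuation):
reg_text('@united Flight delayed 2 hours!!')
- ['united', 'flight', 'delayed', 'hours', '!', '!']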
Apply the function to the text column to remove the special characters
data['text'] = data.text.apply(reg_text)
data
The dataset after preprocessing:
text review
4668 [southwestair, i, ll, stick, with, flying, for… 1
3304 [united, worst, airline, in, the, world, delay… 0
8735 [jetblue, thanks, for, having, us, hang, out, … 1
1075 [united, cancelled, flighted, flights, late, f… 0
437 [virginamerica, awesome, deals, dal, aus, for,… 1
… … …
2095 [united, worst, customer, service, experience,… 0
1478 [united, that, s, not, good, enough, for, all,… 0
1486 [united, i, was, sincerely, thanking, the, pil… 1
13013 [americanair, well, , you, guys, are, totally… 1
3645 [united, i, have, talked, to, them, ., ., ., o… 0
4726 rows × 2 columns
Collect every unique word and punctuation token in a set
word_set = set()
for text in data.text:
    for word in text:
        word_set.add(word)
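The same loop written as a one-line set comprehension:
word_set = {word for text in data.text for word in text}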
Convert the set to a list, since a list supports index lookups
word_list = list(word_set)
word_list.index('spending')
- 6400
Build the word-to-index mapping
word_index = dict((word, index + 1) for index, word in enumerate(word_list))  # same mapping as word_list.index(word) + 1, without the O(n^2) lookups
word_index
{'unanswered': 1,
'last': 2,
'ja': 3,
'teach': 4,
'customers': 5,
…
'die': 998,
'sloppy': 999,
'survey': 1000,
…
The index + 1 offset makes the word indices start at 1 rather than the usual 0; index 0 is reserved for padding (see below)
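For debugging, it can help to invert the mapping and decode an encoded review back into tokens. A small sketch (index_word and decode are helpers introduced here, with 0 decoding to a padding placeholder):
index_word = dict((index, word) for word, index in word_index.items())  # reverse mapping: index -> word
def decode(sequence):
    return [index_word.get(i, '<pad>') for i in sequence]  # 0 means padding/unknown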
Convert the text column from tokens to integer indices
data_ok = data.text.apply(lambda x: [word_index.get(word, 0) for word in x])
data_ok
3206 [5969, 2525, 5827, 2377, 1123, 5563, 7036, 444…
10685 [4389, 5878, 4449, 6704, 4593, 3451, 4923, 454…
7791 [6717, 5759, 5152, 5146, 2363, 1306, 6971, 103…
1751 [5969, 4874, 3514, 3287, 3085, 175, 6740, 5152…
1139 [5969, 873, 4383, 7089, 2158, 2203, 910, 1088,…
…
3359 [5969, 716, 1650, 3911, 4307, 4834]
2033 [5969, 3911, 5697, 4829, 3911, 5488, 809, 1186…
2986 [5969, 6727, 2210, 4523, 3693, 4449, 5437, 438…
2121 [5969, 3106, 6889, 532, 6160, 3015, 809, 5827,…
5080 [1280, 5474, 3911, 2203, 4874, 4523, 3680, 678…
The reviews all have different lengths
len(data_ok.iloc[2])
- 22
maxlen = max(len(x) for x in data_ok)
maxlen
- 40
The longest review has 40 tokens
Because the lengths differ, every review is padded with 0 to a uniform length; this is why word_index starts at 1. The vocabulary size is therefore:
max_word = len(word_set) + 1
max_word
- 7101  (7,100 words + 1 padding token = 7,101)
data_ok = tf.keras.preprocessing.sequence.pad_sequences(data_ok.values, maxlen=maxlen )
data_ok.shape
- (4726, 40)  (the shape after padding)
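By default pad_sequences pads at the front (padding='pre'), so short reviews get leading zeros and the real tokens sit closest to the last time step the LSTM sees. To pad at the end instead, the original call could be written as:
tf.keras.preprocessing.sequence.pad_sequences(data_ok.values, maxlen=maxlen, padding='post')  # zeros after the tokens instead of before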
The labels are the data.review column
data.review.values
- array([1, 0, 1, …, 1, 1, 0])
Build the model
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(max_word, 50, input_length=maxlen))
model.add(tf.keras.layers.LSTM(64))  # 64 hidden units
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model.summary()
The Embedding layer maps each token to a dense vector; its arguments are (vocabulary size, embedding dimension, sequence length)
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #
=================================================================
embedding (Embedding)        (None, 40, 50)            355050
_________________________________________________________________
lstm (LSTM)                  (None, 64)                29440
_________________________________________________________________
dense (Dense)                (None, 1)                 65
=================================================================
Total params: 384,555
Trainable params: 384,555
Non-trainable params: 0
_________________________________________________________________
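The parameter counts can be verified by hand:
7101 * 50                   # embedding: vocabulary size x embedding dim = 355050
4 * ((50 + 64) * 64 + 64)   # LSTM: 4 gates x ((input dim + units) x units + bias) = 29440
64 + 1                      # dense: weights + bias = 65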
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
history = model.fit(data_ok, data.review, epochs=10, batch_size=128, validation_split=0.2 )
plt.plot(history.epoch, history.history.get('acc'), 'r', label='acc')
plt.plot(history.epoch, history.history.get('val_acc'), 'b', label='val_acc')
plt.legend()
plt.plot(history.epoch, history.history.get('loss'), 'r', label='loss')
plt.plot(history.epoch, history.history.get('val_loss'), 'b', label='val_loss')
plt.legend()
Define a second model that adds dropout for regularization:
def train_model():
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Embedding(max_word, 50, input_length=maxlen))
    model.add(tf.keras.layers.LSTM(64, dropout=0.2, recurrent_dropout=0.5))  # dropout on the inputs and the recurrent state
    model.add(tf.keras.layers.Dropout(0.5))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer=tf.keras.optimizers.RMSprop(), loss='binary_crossentropy', metrics=['acc'])
    return model
model2 = train_model()
history = model2.fit(data_ok, data.review, epochs=10, batch_size=128, validation_split=0.2 )
plt.plot(history.epoch, history.history.get('acc'), 'r', label='acc')
plt.plot(history.epoch, history.history.get('val_acc'), 'b', label='val_acc')
plt.legend()
plt.plot(history.epoch, history.history.get('loss'), 'r', label='loss')
plt.plot(history.epoch, history.history.get('val_loss'), 'b', label='val_loss')
plt.legend()
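To score a new review with the trained model, reuse the same tokenize, index, and pad pipeline (a sketch; the tweet text here is made up):
sample = reg_text('@united thanks for the great service!')
sample = [word_index.get(word, 0) for word in sample]  # unseen words map to 0
sample = tf.keras.preprocessing.sequence.pad_sequences([sample], maxlen=maxlen)
model2.predict(sample)  # output near 1 means positive, near 0 means negative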