学习 NLP 已经有一段日子了，其间反复看了一些内容，但遗忘真是可怕，于是想着去 Kaggle 练练手。本文是新手入门 Kaggle 文本分类的记录，大神们请略过吧。我对数据做了简单的常规处理，模型用的是一个简单的 2 层 Bi-LSTM，之后会再改进，并换用其他模型试验效果。代码如下：
import pandas as pd
from keras.layers import Dense,LSTM,Bidirectional,Embedding
from keras.models import Sequential
import keras.preprocessing as preprocessing
from nltk.corpus import stopwords
from keras.utils.np_utils import to_categorical
import nltk
import matplotlib.pyplot as plt
import os
import re
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from nltk import WordNetLemmatizer,word_tokenize
# English stop-word list taken from the NLTK stopwords corpus.
stoplist = stopwords.words('english')

# Load the tab-separated train and test sets (in that order), then the
# sample submission file, which is a regular comma-separated CSV.
data_train, data_test = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (r'D:\Kaggle\train.tsv', r'D:\Kaggle\test.tsv')
)
sub = pd.read_csv(r'D:\Kaggle\sampleSubmission.csv')

# Pull out the Phrase and Sentiment columns. The training phrases are kept
# as the underlying array; the test phrases and the training Sentiment
# values are materialized as plain lists of the array's elements.
data_train_X = data_train['Phrase'].values
X_test = [*data_test['Phrase'].values]
data_train_Y = [*data_train['Sentiment'].values]

# Shared WordNet lemmatizer instance.
lemmat = WordNetLemmatizer()