#!/usr/bin/env python
# coding: utf-8
# author: LIU, Jing

# In[1]:

# Pre-processing
import string
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import re
from nltk.stem import PorterStemmer
# Modeling
import statsmodels.api as sm
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment.util import *
from nltk.util import ngrams
from collections import Counter
from gensim.models import word2vec
from nltk.tokenize import word_tokenize
from gensim import corpora, models, similarities
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
# For first-time users: download these pre-trained NLTK resources once
# nltk.download('punkt')
# nltk.download('stopwords')
import numpy as np
import pandas as pd
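# A small, optional sketch (not part of the original script) that fetches the
# two NLTK resources mentioned above programmatically; nltk.download skips
# anything that is already up to date.
import nltk

for resource in ('punkt', 'stopwords'):
    nltk.download(resource, quiet=True)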
# In[2]:
df = pd.read_csv('Womens Clothing E-Commerce Reviews.csv', index_col=0)
sentiment_label = df["Recommended IND"]
df.head()

# In[157]:

# Pre-process: choose the review text
pdtextpreprocess = df[["Title","Review Text","Rating"]]
pdtextpreprocess['index']= pdtextpreprocess.index
documents =[str(m)+" "+str(n)for m,n inzip(pdtextpreprocess["Title"],pdtextpreprocess["Review Text"])]# add title as part of review text# In[ ]:# In[5]:
# Tokenize the text and convert to lowercase
texts_tokenized = [[word.lower() for word in word_tokenize(str(document))] for document in documents]
stop_words = set(stopwords.words('english'))

# Remove stop words
texts_filtered_stopwords = [[word for word in document if word not in stop_words] for document in texts_tokenized]

# Remove punctuation
english_punctuations = list(string.punctuation)
texts_filtered = [[word for word in document if word not in english_punctuations] for document in texts_filtered_stopwords]
stemmer = PorterStemmer()  # alternative: stemmer = LancasterStemmer()
texts_stemmed = [[stemmer.stem(word) for word in document] for document in texts_filtered]
all_stems = sum(texts_stemmed, [])
# Stems that appear only once in the whole corpus
stems_once = set(stem for stem in set(all_stems) if all_stems.count(stem) == 1)
# Keep only stems that appear more than once
texts = [[stem for stem in text if stem not in stems_once] for text in texts_stemmed]

# In[6]:

from gensim.corpora import Dictionary
dictionary = Dictionary(texts)  # fit the dictionary
corpus = [dictionary.doc2bow(line) for line in texts]  # convert the corpus to BoW format
tfidf = models.TfidfModel(corpus)  # calculate tf-idf weights
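# Illustrative check (not in the original script): inspect the tf-idf weights
# of the first pre-processed review, mapped back to their stemmed tokens.
first_doc_tfidf = tfidf[corpus[0]]
for term_id, weight in first_doc_tfidf:
    print(dictionary[term_id], round(weight, 3))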
corpus_tfidf = tfidf[corpus]  # use the tf-idf weights to represent documents

# ## Build the LSA and LDA topic models for the reviews.

# In[13]:

# LSA model
from gensim.models.lsimodel import LsiModel
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary)
topics_lsi = lsi.show_topics()
for tpc in topics_lsi:
    print(tpc)

# In[26]:

# LDA model
from gensim.models.ldamodel import LdaModel
ldamodel = LdaModel(corpus, id2word=dictionary)
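# Both topic models here fall back on gensim's default number of topics. A
# sketch of pinning it explicitly (10 topics and passes=5 are illustrative
# assumptions, not values from the original):
# lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
# ldamodel = LdaModel(corpus, id2word=dictionary, num_topics=10, passes=5)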
topics_lda = ldamodel.show_topics()
for tpc in topics_lda:
    print(tpc)

# ## Predict sentiment
# (Recommended / Not recommended) of reviews using their bag-of-topics features by fitting a logistic regression model.

# In[28]:

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
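# The section heading above mentions "bag-of-topics" features, while the code
# below vectorizes the raw review text directly. A minimal sketch (an
# assumption, not the original author's code) of turning the fitted LDA model
# into per-document topic features that the classifier could consume instead:
def bag_of_topics(bow_corpus, lda):
    # One row per document, one column per topic probability
    features = np.zeros((len(bow_corpus), lda.num_topics))
    for i, bow in enumerate(bow_corpus):
        for topic_id, prob in lda.get_document_topics(bow, minimum_probability=0.0):
            features[i, topic_id] = prob
    return features

# Usage sketch: X_topics = bag_of_topics(corpus, ldamodel)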
# In[65]:

# GET FEATURES
df_lr = pdtextpreprocess.copy()
df_lr['doc'] = texts
df_lr['Recommended IND'] = sentiment_label
df_lr
training_data, testing_data = train_test_split(df_lr, random_state=2000)
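# train_test_split defaults to a 75/25 train/test split here; test_size and
# stratify could be set explicitly if desired (an illustrative variant, not
# part of the original):
# training_data, testing_data = train_test_split(
#     df_lr, test_size=0.25, random_state=2000, stratify=df_lr['Recommended IND'])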
training_data = training_data.set_index('index')
testing_data = testing_data.set_index('index')

# In[66]:
testing_data
# In[46]:
Y_train = training_data['Recommended IND'].values
Y_test = testing_data['Recommended IND'].values
# In[69]:
df_train = training_data[['Title', 'Review Text', 'Rating', 'doc']]
df_test = testing_data[['Title', 'Review Text', 'Rating', 'doc']]

# In[90]:

# Pre-process: choose the review text
documents_train =[str(m)+" "+str(n)for m,n inzip(df_train["Title"],df_train["Review Text"])]
documents_test =[str(m)+" "+str(n)for m,n inzip(df_test["Title"],df_test["Review Text"])]# In[93]:
# Tokenize the text and convert to lowercase
texts_tokenized_train = [[word_train.lower() for word_train in word_tokenize(str(document_train))] for document_train in documents_train]
texts_tokenized_test = [[word_test.lower() for word_test in word_tokenize(str(document_test))] for document_test in documents_test]

stop_words = set(stopwords.words('english'))
# Remove stop words
texts_filtered_stopwords_train = [[word for word in document_train if word not in stop_words] for document_train in texts_tokenized_train]
texts_filtered_stopwords_test = [[word for word in document_test if word not in stop_words] for document_test in texts_tokenized_test]

# Remove punctuation
english_punctuations = list(string.punctuation)
texts_filtered_train = [[word for word in document_train if word not in english_punctuations] for document_train in texts_filtered_stopwords_train]
texts_filtered_test = [[word for word in document_test if word not in english_punctuations] for document_test in texts_filtered_stopwords_test]
stemmer = PorterStemmer()  # alternative: stemmer = LancasterStemmer()
texts_stemmed_train = [[stemmer.stem(word) for word in document_train] for document_train in texts_filtered_train]
texts_stemmed_test = [[stemmer.stem(word) for word in document_test] for document_test in texts_filtered_test]

all_stems_train = sum(texts_stemmed_train, [])
all_stems_test = sum(texts_stemmed_test, [])
# Stems that appear only once within each split
stems_once_train = set(stem for stem in set(all_stems_train) if all_stems_train.count(stem) == 1)
stems_once_test = set(stem for stem in set(all_stems_test) if all_stems_test.count(stem) == 1)
# Keep only stems that appear more than once
texts_train = [[stem for stem in text_train if stem not in stems_once_train] for text_train in texts_stemmed_train]
texts_test = [[stem for stem in text_test if stem not in stems_once_test] for text_test in texts_stemmed_test]

# In[132]:

def extract_features(training_data, testing_data, type="binary"):
    if "binary" in type:
        # BINARY FEATURE REPRESENTATION
        cv = CountVectorizer(binary=True)
        cv.fit_transform(training_data)
        train_feature_set = cv.transform(training_data)
        test_feature_set = cv.transform(testing_data)
        return train_feature_set, test_feature_set, cv
elif"counts"intype:# COUNT BASED FEATURE REPRESENTATION
cv= CountVectorizer(binary=False)
cv.fit_transform(training_data.values)
train_feature_set=cv.transform(training_data)
test_feature_set=cv.transform(testing_data)return train_feature_set,test_feature_set,cv
    else:
        # TF-IDF BASED FEATURE REPRESENTATION
        tfidf_vectorizer = TfidfVectorizer(use_idf=True)
        tfidf_vectorizer.fit_transform(training_data)
        train_feature_set = tfidf_vectorizer.transform(training_data)
        test_feature_set = tfidf_vectorizer.transform(testing_data)
        return train_feature_set, test_feature_set, tfidf_vectorizer
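# Usage sketch for the helper above (illustrative calls, not in the original):
# X_tr_bin, X_te_bin, bin_cv = extract_features(documents_train, documents_test, type="binary")
# X_tr_cnt, X_te_cnt, cnt_cv = extract_features(documents_train, documents_test, type="counts")
# Any other value of `type` falls through to the tf-idf branch.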
# In[133]:
# 'feature_rep' matches neither "binary" nor "counts", so the tf-idf branch is used
X_train, X_test, feature_transformer = extract_features(documents_train, documents_test, type='feature_rep')

# In[143]:
# Note: max_iter=10 is quite low; liblinear may stop before fully converging
scikit_log_reg = LogisticRegression(solver='liblinear', max_iter=10)
lr_model = scikit_log_reg.fit(X_train, Y_train)

# In[144]:

preds = lr_model.predict(X_test)

# In[159]:
actual_index = df_test.index
actual_index
df_test['pred_Recommend'] = preds
df_test['actual_Recommend'] = sentiment_label[actual_index]

# In[161]:
df_test
# ## Calculate the prediction error
# (Defined as the percentage of incorrect predictions)

# In[165]:

from sklearn.metrics import accuracy_score
y_pred = df_test['pred_Recommend']
y_actual = df_test['actual_Recommend']
accuracy_score(y_actual, y_pred)
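# The heading defines the prediction error as the percentage of incorrect
# predictions, while accuracy_score above reports the fraction of correct ones.
# The error itself (a small addition, not in the original):
error_rate = 1 - accuracy_score(y_actual, y_pred)
error_rate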