# 用处:Text mining,构建 VSM(向量空间模型)
# 第一种方法:
# environment
library(tm)
library(proxy)
library(dplyr)
# Toy corpus: four short documents.
doc <- c(
  "The sky is blue.",
  "The sun is bright today.",
  "The sun in the sky is bright.",
  "We can see the shining sun, the bright sun."
)

# Term-frequency matrix built with tm (rows = terms, columns = documents).
doc_corpus <- Corpus(VectorSource(doc))
control_list <- list(
  removePunctuation = TRUE,
  stopwords = TRUE,
  tolower = TRUE
)
tdm <- TermDocumentMatrix(doc_corpus, control = control_list)

# Assign and print the dense term-frequency matrix.
(tf <- as.matrix(tdm))

# Smoothed inverse document frequency, one value per term:
# log(number of documents / (1 + number of documents containing the term)).
(idf <- log(ncol(tf) / (1 + rowSums(tf != 0))))

# Diagonal matrix holding the idf weights. `nrow` is passed explicitly
# because diag() applied to a length-one vector would otherwise build an
# identity matrix instead of a 1 x 1 weight matrix.
(idf <- diag(idf, nrow = length(idf)))

# tf-idf matrix, t(tf) %*% idf (rows = documents, columns = terms).
tf_idf <- crossprod(tf, idf)
colnames(tf_idf) <- rownames(tf)
tf_idf

# Scale each row (document) to unit Euclidean length. A row that is entirely
# zero has norm 0; dividing by 1 instead keeps it at zero rather than NaN.
row_norms <- sqrt(rowSums(tf_idf^2))
row_norms[row_norms == 0] <- 1
tf_idf / row_norms
# 第二种方法:
#' Weight a term-count matrix by tf-idf
#'
#' Every column of `mat` is multiplied by
#' `log(nrow(mat) / number of non-zero entries in that column)`.
#'
#' @param mat A numeric matrix of counts; `nrow(mat)` is used as the number
#'   of documents and each column is treated as one term. Column names are
#'   optional.
#' @return An object with the same dimensions and dimnames as `mat`, holding
#'   the weighted values. Columns that are entirely zero stay zero.
tfidf <- function(mat) {
  # Document frequency: number of non-zero entries in each column.
  doc_freq <- colSums(mat != 0)
  idf <- log(nrow(mat) / doc_freq)
  # A column with no non-zero entry would get idf = Inf and 0 * Inf = NaN;
  # give it weight 0 so the column stays zero.
  idf[which(doc_freq == 0)] <- 0
  # Scale by column position, so missing or duplicated column names are
  # handled correctly.
  sweep(mat, 2, idf, `*`)
}
# 第三种方法:
# Build a document-term matrix weighted by tf-idf, with normalisation
# switched off (normalize = FALSE).
# NOTE(review): `corpus` is not defined in this snippet; presumably a tm
# corpus created beforehand -- confirm before running.
terms <- DocumentTermMatrix(
  corpus,
  control = list(
    weighting = function(x) {
      weightTfIdf(x, normalize = FALSE)
    }
  )
)
# 第四种方法:
library(textir)
data(we8there)

# Names of the 20 columns of we8thereCounts ranked highest by sdev() of
# their tf-idf weights (presumably the per-column standard deviation --
# confirm against the textir documentation).
# `textir::tfidf` is namespace-qualified so that a `tfidf` function defined
# in the global environment (such as a hand-written one) cannot mask it.
# head() returns at most 20 indices, so no NA index appears when the matrix
# has fewer than 20 columns.
colnames(we8thereCounts)[
  head(order(-sdev(textir::tfidf(we8thereCounts))), 20)
]
# 参考:
# http://ethen8181.github.io/machine-learning/clustering_old/tf_idf/tf_idf.html