Weka 中的 ARFF 格式文件需要给出特征向量才可读取;NLTK 中不需要,但可以把 features 转成向量的形式表示出来。
本人代码设计能力实在太弱,折腾了一上午才调试出可用的脚本,是在 Python 下完成的。
""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
import csv
import re
# Matches a run of two or more identical characters; DOTALL lets "." match
# newlines too.  Compiled once because the function is called per token.
_REPEATED_CHAR_RE = re.compile(r"(.)\1+", re.DOTALL)


def replaceTwoOrMore(s):
    """Collapse every run of a repeated character in *s* to two copies.

    A run of two or more identical characters (newlines included) is
    replaced by exactly two of that character, so ``"loveeeeeee"`` becomes
    ``"lovee"`` and ``"good"`` is returned unchanged.

    Args:
        s: The string to normalise.

    Returns:
        A new string in which no character repeats more than twice in a row.
    """
    return _REPEATED_CHAR_RE.sub(r"\1\1", s)
# Token filter: ASCII letters/digits only, the first character is a letter and
# at least one more letter follows it (so the minimum length is two).
_TOKEN_RE = re.compile(r"^[a-zA-Z][a-zA-Z0-9]*[a-zA-Z]+[a-zA-Z0-9]*$")

# Punctuation trimmed from both ends of a word before it is filtered.
_STRIP_CHARS = '\'"?,.'


def read_word_list(path):
    """Return the stripped, non-blank lines of the text file at *path*.

    Args:
        path: Path of a text file holding one entry per line.

    Returns:
        A list of entries in file order, with surrounding whitespace removed
        and blank lines skipped.
    """
    with open(path, "r") as handle:
        stripped = (line.strip() for line in handle)
        return [entry for entry in stripped if entry]


def clean_token(word, stop_words):
    """Return the lowercase form of *word*, or ``None`` if it is rejected.

    A word is rejected when it does not match the token filter or when it is
    a stop word.  The stop-word test is applied to both the original and the
    lowercased spelling so that capitalised stop words such as "The" are also
    dropped (presumably the stop-word list is lowercase - verify against the
    actual file).

    Args:
        word: A single word, already trimmed of surrounding punctuation.
        stop_words: A container of words to discard.

    Returns:
        The lowercased word, or ``None`` when the word must be skipped.
    """
    if _TOKEN_RE.search(word) is None:
        return None
    lowered = word.lower()
    if word in stop_words or lowered in stop_words:
        return None
    return lowered


def tokenize(text, stop_words):
    """Return the set of accepted lowercase tokens found in *text*.

    Each whitespace-separated word has repeated characters collapsed by
    ``replaceTwoOrMore``, is trimmed of surrounding punctuation and is then
    filtered by ``clean_token``.

    Args:
        text: The raw text of one row.
        stop_words: A container of words to discard.

    Returns:
        A set of lowercase tokens.
    """
    tokens = set()
    for raw in text.split():
        token = clean_token(replaceTwoOrMore(raw).strip(_STRIP_CHARS), stop_words)
        if token is not None:
            tokens.add(token)
    return tokens


def encode_row(tokens, features, label):
    """Return one output line: a 0/1 flag per feature followed by *label*.

    Every feature contributes ``"1,"`` when it occurs in *tokens* and ``"0,"``
    otherwise; the cells and the trailing label are joined with single
    spaces, e.g. ``"1, 0, 1, pos"``.  The vector has exactly one cell per
    entry of *features*.

    Args:
        tokens: The tokens of one row (a set gives constant-time lookups).
        features: The ordered feature list that defines the vector columns.
        label: The class label appended after the vector.

    Returns:
        The encoded line, without a trailing newline.
    """
    cells = ["1," if feature in tokens else "0," for feature in features]
    cells.append(label)
    return " ".join(cells)


def vectorize_csv(path, label, features, stop_words, text_column=1):
    """Encode every row of the CSV file at *path* as a labelled 0/1 vector.

    Args:
        path: Path of the CSV file to read.
        label: The class label appended to every encoded row.
        features: The ordered feature list that defines the vector columns.
        stop_words: A container of words to discard while tokenising.
        text_column: Index of the column holding the text (default 1).

    Returns:
        A list of encoded lines, one per row that has the text column.
    """
    encoded = []
    # csv.reader needs a text-mode file opened with newline="" on Python 3.
    # Only ASCII alphanumeric tokens survive the token filter, so undecodable
    # bytes can be replaced instead of aborting the whole run.
    with open(path, "r", newline="", encoding="utf-8", errors="replace") as handle:
        reader = csv.reader(handle, delimiter=",", quotechar='"', escapechar="\\")
        for row in reader:
            # Blank lines come back as empty rows; skip anything that lacks
            # the text column rather than failing on the index.
            if len(row) <= text_column:
                continue
            tokens = tokenize(row[text_column], stop_words)
            encoded.append(encode_row(tokens, features, label))
    return encoded


def write_lines(path, lines):
    """Write *lines* to the file at *path*, one per line."""
    with open(path, "w") as handle:
        handle.writelines(line + "\n" for line in lines)


def main():
    """Convert the positive and negative CSV files into 0/1 feature vectors.

    Reads the stop words from ``stopwords.txt`` and the feature list from
    ``selected_features.txt``, encodes ``positive.csv`` and ``negative.csv``
    and writes the results to ``pos_array.txt`` and ``neg_array.txt``.
    """
    stop_words = set(read_word_list("stopwords.txt"))
    features = read_word_list("selected_features.txt")

    # Encode both inputs before writing so that a failure while reading
    # either one leaves no partial output behind.
    pos_lines = vectorize_csv("positive.csv", "pos", features, stop_words)
    neg_lines = vectorize_csv("negative.csv", "neg", features, stop_words)

    write_lines("pos_array.txt", pos_lines)
    write_lines("neg_array.txt", neg_lines)


if __name__ == "__main__":
    main()
""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""