#include <iostream>
#include <algorithm>
#include <iomanip>
#include <fstream>
#include <sstream>
#include <cstring>
#include <string>
#include <vector>
#include <cmath>
#include <time.h>
#include <map>
using namespace std;
// One text sample together with its parsed fields and its row of the
// one-hot matrix.
struct train_data {
    int index;            // sequence number of the training text
    int emotion_value;    // numeric emotion value
    string emotion;       // emotion label
    vector<string> word;  // words of the text
    int onehot[1000];     // this sample's values in the one-hot matrix
    double distance;      // distance score

    // Every argument is optional. The word list starts empty and all 1000
    // one-hot entries start at zero.
    train_data(int a = 0, int b = 0, string c = "", double d = 0.0)
        : index(a), emotion_value(b), emotion(c), word(), distance(d) {
        std::fill(onehot, onehot + 1000, 0);
    }
};
// Counter for one emotion label.
struct ct {
    string s;  // emotion label
    int num;   // number of occurrences

    // Defaults to an empty label with a count of zero.
    ct(string a = "", int b = 0) : s(a), num(b) {}
};
// ---- Global state shared by the functions below ----
vector<string> train_text; // every complete training text (one raw line each)
vector<string> all_words; // all distinct words; the vertical axis
vector<train_data> all_trains; // all training texts; the horizontal axis
int right_sum; // number of correct predictions
// Reads train.txt and fills train_text, all_words and all_trains.
void reading_file(void );
// Sets the one-hot entries of every sample in all_trains from all_words.
void get_onehot(void );
// Classifies each line of test.txt using the given number of neighbours and
// prints how many predictions were correct.
void class_calculating(int );
// Sum of squared differences of the one-hot rows (no square root taken).
double edistance(train_data , train_data );
// Sum of absolute differences of the one-hot rows.
double mdistance(train_data , train_data );
// Ordering of train_data by its distance field.
bool cmp(const train_data & , const train_data & );
// Ordering of ct by its num field.
bool cmp2(const ct & , const ct & );
// Entry point of the classifier.
//
// The loop header covers k = 1..14, but the body ends with an unconditional
// break, so only the first value (k = 1) is ever evaluated.
int main() {
    for (int k = 1; k < 15; ++k) {
        // Start every pass from empty global state.
        all_trains.clear();
        all_words.clear();
        train_text.clear();

        // Load the training set and build its one-hot rows.
        reading_file();
        get_onehot();

        cout << "input k = " << k << endl;
        cout << "不重复的词个数 " << all_words.size() << endl;

        class_calculating(k);
        break;
    }
    return 0;
}
void reading_file() {
ifstream train("train.txt");
char read[100];
string temp;
train.getline(read, 100);
while (!train.eof()) {
train.getline(read, 100);
temp = read;
train_text.push_back(temp);
}
train.close();
stringstream s;
int index;
int emotion_value;
string emotion;
string word;
for (int i = 0; i < train_text.size(); i ++) {
s.str(train_text[i]);
s >> index;
s >> emotion_value;
s >> emotion;
//创建一个新的训练文本数据
train_data new_train;
new_train.index = index;
new_train.emotion_value = emotion_value;
new_train.emotion = emotion;
while (s != NULL) {
s >> word;
//统计所有单词
bool flag1 = true;
for (int i = 0; i < all_words.size(); i ++) {
if (all_words[i] == word) {
flag1 = false;
break;
}
else
continue;
}
if (flag1)
all_words.push_back(word);
//统计每个训练文本中的单词
bool flag2 = true;
for (int i = 0; i < new_train.word.size(); i ++) {
if (new_train.word[i] == word) {
flag2 = false;
break;
}
else
continue;
}
if (flag2)
new_train.word.push_back(word);
}
s.clear();
all_trains.push_back(new_train);
}
//test
/*
ofstream t("test.txt");
for (int i = 0; i < all_trains.size(); i ++) {
t << all_trains[i].index << " " << all_trains[i].emotion_value << " " << all_trains[i].emotion;
for (int j = 0; j < all_trains[i].word.size(); j ++)
t << " " << all_trains[i].word[j];
t << endl;
}
for (int i = 0; i < all_words.size(); i ++)
cout << i + 1 << " " << all_words[i] << endl;
*/
}
void get_onehot() {
int i, j, k;
for (i = 0; i < all_trains.size(); i ++) {
for (j = 0; j < all_trains[i].word.size(); j ++) {
for (k = 0; k < all_words.size(); k ++) {
if (all_trains[i].word[j] == all_words[k])
all_trains[i].onehot[k] = 1;
}
}
}
//cout << "hot" <<endl;
//test
/*
ofstream s("testonehot.txt");
for (i = 0; i < all_words.size(); i ++)
s << setw(14) << left << all_words[i];
s << endl;
for (j = 0; j < all_trains.size(); j ++) {
for (k = 0; k < all_words.size(); k ++)
s << setw(14) << left << all_trains[j].onehot[k];
s << endl;
}
*/
}
void class_calculating(int k) {
ifstream t("test.txt");
right_sum = 0;
char c[100];
string temp;
t.getline(c, 100);
while (t.getline(c, 100)) {
train_data test_train;
char *p = strtok(c, " ");
p = strtok(NULL, " ");
p = strtok(NULL, " ");
temp = p;
test_train.emotion = temp;
//cout << temp << endl;
p = strtok(NULL, " ");
while (p != NULL) {
temp = p;
bool flag = true;
for (int i = 0; i < test_train.word.size(); i ++) {
if (test_train.word[i] == temp) {
flag = false;
break;
}
}
if (flag)
test_train.word.push_back(temp);
p = strtok(NULL, " ");
}
double d1 = 0;
for (int i = 0; i < test_train.word.size(); i ++) {
bool flag3 = true;
for (int j = 0; j < all_words.size(); j ++) {
if (test_train.word[i] == all_words[j]) {
test_train.onehot[j] = 1;
flag3 = false;
break;
}
else
continue;
}
if (flag3) { //如果训练样本中没有这个单词,但又不能改变原始样本
d1 ++;
}
}
for (int i = 0; i < all_trains.size(); i ++) {
//all_trains[i].distance = sqrt(d1 + edistance(all_trains[i], test_train));
double part1 = 0;
for (int j = 0; j < all_words.size(); j ++)
part1 += all_trains[i].onehot[j] * test_train.onehot[j];
all_trains[i].distance = part1/(sqrt(all_trains[i].word.size())*sqrt(test_train.word.size() + d1));
}
sort(all_trains.begin(), all_trains.end(), cmp);
vector<ct> v;
ct node("anger", 0);
v.push_back(node);
ct node1("disgust", 0);
v.push_back(node1);
ct node2("fear", 0);
v.push_back(node2);
ct node3("joy", 0);
v.push_back(node3);
ct node4("sad", 0);
v.push_back(node4);
ct node5("surprise", 0);
v.push_back(node5);
for (int i = 0; i < k; i ++) {
for (int j = 0; j < v.size(); j ++) {
if (all_trains[i].emotion == v[j].s) {
v[j].num ++;
break;
}
}
}
sort(v.begin(), v.end(), cmp2);
if (v.back().s == test_train.emotion) {
right_sum ++;
}
}
cout << "正确个数" << right_sum << endl;
}
// Euclidean distance before the square root: the sum of squared differences
// of the two one-hot rows over the first all_words.size() columns.
double edistance(train_data a, train_data b) {
    const size_t columns = all_words.size();
    double total = 0.0;
    for (size_t col = 0; col < columns; ++col) {
        const int diff = a.onehot[col] - b.onehot[col];
        total += static_cast<double>(diff) * diff;
    }
    return total;
}
// Manhattan distance: the sum of absolute differences of the two one-hot
// rows over the first all_words.size() columns.
double mdistance(train_data a, train_data b) {
    const size_t columns = all_words.size();
    double total = 0.0;
    for (size_t col = 0; col < columns; ++col) {
        const int diff = a.onehot[col] - b.onehot[col];
        total += (diff < 0) ? -diff : diff;
    }
    return total;
}
// Sort predicate for train_data: true when a's distance is strictly smaller
// than b's, which gives ascending order of the distance field.
bool cmp(const train_data &a, const train_data &b) {
    return b.distance > a.distance;
}
// Sort predicate for ct: true when a's count is strictly smaller than b's,
// which gives ascending order of the num field.
bool cmp2(const ct &a, const ct &b ) {
    return b.num > a.num;
}
// ===== Second program — KNN回归:夹角余弦 (KNN regression using cosine similarity) =====
#include <iostream>
#include <algorithm>
#include <iomanip>
#include <fstream>
#include <sstream>
#include <cstring>
#include <string>
#include <vector>
#include <cmath>
#include <time.h>
#include <map>
using namespace std;
// One text sample together with its parsed fields, its row of the one-hot
// matrix and the numeric values read from the remaining CSV columns.
struct train_data {
    int index;               // sequence number of the training text
    int emotion_value;       // numeric emotion value
    string emotion;          // emotion label
    vector<string> word;     // words of the text
    int onehot[1000];        // this sample's values in the one-hot matrix
    double distance;         // distance score
    vector<double> fre_set;  // numeric values parsed after the text column

    // Every argument is optional. Both vectors start empty and all 1000
    // one-hot entries start at zero.
    train_data(int a = 0, int b = 0, string c = "", double d = 0.0)
        : index(a), emotion_value(b), emotion(c), word(), distance(d), fre_set() {
        std::fill(onehot, onehot + 1000, 0);
    }
};
// A string paired with an integer count.
struct ct {
    string s;  // text value
    int num;   // count

    // Defaults to an empty string with a count of zero.
    ct(string a = "", int b = 0) : s(a), num(b) {}
};
// ---- Global state shared by the functions below ----
vector<string> train_text; // every complete training text
vector<string> all_words; // all distinct words; the vertical axis
vector<train_data> all_trains; // all training texts; the horizontal axis
int right_sum; // number of correct predictions
// Reads Dataset_train.csv and fills all_words and all_trains.
void reading_file(void );
// Sets the one-hot entries of every sample in all_trains from all_words.
void get_onehot(void );
// Predicts six values for each row of Dataset_validation.csv from the given
// number of neighbours and writes them to the regression output file.
void regre_calculating(int );
// Sum of squared differences of the one-hot rows (no square root taken).
double edistance(train_data , train_data );
// Sum of absolute differences of the one-hot rows.
double mdistance(train_data , train_data );
// Ordering of train_data by its distance field.
bool cmp(const train_data & , const train_data & );
// Entry point of the regression program: loads the training set, builds the
// one-hot rows, asks for k on standard input and runs the regression.
//
// The previous version used k without checking that the read succeeded or
// that the value was usable. A failed read or a non-positive k is now
// rejected with an error message and exit status 1.
int main() {
    train_text.clear();
    all_words.clear();
    all_trains.clear();

    reading_file();
    get_onehot();

    int k = 0;
    cout << "input k = ";
    if (!(cin >> k) || k <= 0) {
        cerr << "k must be a positive integer" << endl;
        return 1;
    }

    regre_calculating(k);
    return 0;
}
// Splits `text` on `sep` and returns the non-empty pieces in order. Empty
// pieces are dropped, which yields the same tokens strtok would produce.
static vector<string> train_csv_tokens(const string &text, char sep) {
    vector<string> pieces;
    string current;
    for (size_t i = 0; i < text.size(); ++i) {
        if (text[i] == sep) {
            if (!current.empty()) {
                pieces.push_back(current);
                current.clear();
            }
        } else {
            current += text[i];
        }
    }
    if (!current.empty())
        pieces.push_back(current);
    return pieces;
}

// Loads "Dataset_train.csv" into the global training set.
//
// The first line of the file is discarded. In every following line the first
// comma-separated field is ignored, the second field is the text (words
// separated by single spaces), and each remaining field is parsed as a double
// and appended to the sample's fre_set. Every word not seen before is
// appended to all_words; each sample keeps only its distinct words.
//
// Differences from the previous implementation:
//   * Lines are read into a std::string. The old 150-byte buffer silently
//     stopped loading at the first longer line.
//   * Rows without a text field, or without any word, are skipped. They
//     previously handed a null pointer to strtok and produced samples whose
//     similarity is 0/0.
//   * A numeric field that fails to parse is stored as 0.0 instead of a stale
//     or uninitialized value.
void reading_file() {
    ifstream t("Dataset_train.csv");
    if (!t) {
        cerr << "cannot open Dataset_train.csv" << endl;
        return;
    }

    string line;
    getline(t, line);  // discard the first line

    while (getline(t, line)) {
        // Drop a trailing carriage return left by CRLF line endings.
        if (!line.empty() && line[line.size() - 1] == '\r')
            line.erase(line.size() - 1);

        const vector<string> fields = train_csv_tokens(line, ',');
        if (fields.size() < 2)
            continue;  // blank line or no text field

        train_data new_train;

        const vector<string> words = train_csv_tokens(fields[1], ' ');
        for (size_t w = 0; w < words.size(); ++w) {
            const string &word = words[w];
            // Global vocabulary: keep each word once.
            if (std::find(all_words.begin(), all_words.end(), word) == all_words.end())
                all_words.push_back(word);
            // Per-text word list: keep each word once.
            if (std::find(new_train.word.begin(), new_train.word.end(), word) == new_train.word.end())
                new_train.word.push_back(word);
        }
        if (new_train.word.empty())
            continue;  // nothing to compare against

        for (size_t f = 2; f < fields.size(); ++f) {
            stringstream ss(fields[f]);
            double fre = 0.0;
            if (!(ss >> fre))
                fre = 0.0;
            new_train.fre_set.push_back(fre);
        }

        all_trains.push_back(new_train);
    }
}
void get_onehot() {
int i, j, k;
for (i = 0; i < all_trains.size(); i ++) {
for (j = 0; j < all_trains[i].word.size(); j ++) {
for (k = 0; k < all_words.size(); k ++) {
if (all_trains[i].word[j] == all_words[k])
all_trains[i].onehot[k] = 1;
}
}
}
}
// Splits `text` on `sep` and returns the non-empty pieces in order. Empty
// pieces are dropped, which yields the same tokens strtok would produce.
static vector<string> validation_csv_tokens(const string &text, char sep) {
    vector<string> pieces;
    string current;
    for (size_t i = 0; i < text.size(); ++i) {
        if (text[i] == sep) {
            if (!current.empty()) {
                pieces.push_back(current);
                current.clear();
            }
        } else {
            current += text[i];
        }
    }
    if (!current.empty())
        pieces.push_back(current);
    return pieces;
}

// Predicts six values for every row of "Dataset_validation.csv" and writes
// them, tab-separated and one row per line, to
// "14353324_xiangketing_regression.txt".
//
// The first line of the input is discarded. In every following line the
// second comma-separated field is the text (words separated by single
// spaces). The cosine similarity between the text's one-hot row and every
// training row is stored in that training sample's `distance` field, and the
// samples are sorted with cmp. Output i is the similarity-weighted sum of
// fre_set[i] over the first k samples, divided by the total over all six
// outputs so that the row sums to 1.
//
// k: number of neighbours. Values larger than the number of training samples
//    are clamped; a non-positive k selects no neighbour.
//
// Differences from the previous implementation:
//   * Lines are read into a std::string. The old 150-byte buffer silently
//     ended processing at the first longer line.
//   * Blank lines are skipped and a missing text field is treated as empty
//     text, instead of passing a null pointer to strtok.
//   * k is clamped to the training set, and a missing fre_set entry counts as
//     0; both were out-of-bounds reads before.
//   * A zero denominator yields similarity 0 instead of NaN.
//   * When the weighted total is 0 (no neighbour shares a word), the
//     unweighted average of the neighbours is written. If that is also 0,
//     each output is 1/6. The old code divided by zero and wrote NaN.
//   * One-hot columns beyond the fixed array size are ignored instead of
//     being accessed out of bounds.
void regre_calculating(int k) {
    const size_t outputs = 6;

    ifstream t("Dataset_validation.csv");
    if (!t)
        cerr << "cannot open Dataset_validation.csv" << endl;

    string line;
    getline(t, line);  // discard the first line

    ofstream out("14353324_xiangketing_regression.txt");

    size_t neighbours = 0;
    if (k > 0)
        neighbours = std::min(static_cast<size_t>(k), all_trains.size());

    while (getline(t, line)) {
        // Drop a trailing carriage return left by CRLF line endings.
        if (!line.empty() && line[line.size() - 1] == '\r')
            line.erase(line.size() - 1);

        const vector<string> fields = validation_csv_tokens(line, ',');
        if (fields.empty())
            continue;  // blank line

        train_data test_train;

        // Collect the distinct words of the text field.
        if (fields.size() > 1) {
            const vector<string> words = validation_csv_tokens(fields[1], ' ');
            for (size_t w = 0; w < words.size(); ++w) {
                if (std::find(test_train.word.begin(), test_train.word.end(), words[w]) == test_train.word.end())
                    test_train.word.push_back(words[w]);
            }
        }

        const size_t capacity = sizeof(test_train.onehot) / sizeof(test_train.onehot[0]);
        const size_t columns = std::min(all_words.size(), capacity);

        // Build the test row; d1 counts words absent from the training
        // vocabulary (the training data itself must not be modified).
        double d1 = 0;
        for (size_t i = 0; i < test_train.word.size(); ++i) {
            vector<string>::const_iterator pos =
                std::find(all_words.begin(), all_words.end(), test_train.word[i]);
            if (pos == all_words.end()) {
                d1 += 1;
                continue;
            }
            const size_t column = static_cast<size_t>(pos - all_words.begin());
            if (column < capacity)
                test_train.onehot[column] = 1;
        }

        // Same length term as before; it is constant for one row and cancels
        // out when the outputs are normalised.
        const double test_length = sqrt(static_cast<double>(test_train.word.size()) + d1);

        for (size_t i = 0; i < all_trains.size(); ++i) {
            double dot = 0;
            for (size_t j = 0; j < columns; ++j)
                dot += all_trains[i].onehot[j] * test_train.onehot[j];
            const double denominator =
                sqrt(static_cast<double>(all_trains[i].word.size())) * test_length;
            all_trains[i].distance = denominator > 0 ? dot / denominator : 0.0;
        }

        sort(all_trains.begin(), all_trains.end(), cmp);

        // Similarity-weighted sums over the nearest neighbours.
        double a[6];
        double sum = 0;
        for (size_t i = 0; i < outputs; ++i) {
            double value = 0;
            for (size_t j = 0; j < neighbours; ++j) {
                const vector<double> &fre = all_trains[j].fre_set;
                const double f = i < fre.size() ? fre[i] : 0.0;
                value += f * all_trains[j].distance;
            }
            a[i] = value;
            sum += value;
        }

        // Fallback 1: equal weights when every similarity-weighted term is 0.
        if (!(sum > 0)) {
            sum = 0;
            for (size_t i = 0; i < outputs; ++i) {
                double value = 0;
                for (size_t j = 0; j < neighbours; ++j) {
                    const vector<double> &fre = all_trains[j].fre_set;
                    value += i < fre.size() ? fre[i] : 0.0;
                }
                a[i] = value;
                sum += value;
            }
        }

        // Fallback 2: uniform output when there is still nothing to normalise.
        if (!(sum > 0)) {
            for (size_t i = 0; i < outputs; ++i)
                a[i] = 1.0;
            sum = static_cast<double>(outputs);
        }

        for (size_t i = 0; i < outputs; ++i) {
            if (i > 0)
                out << '\t';
            out << a[i] / sum;
        }
        out << endl;
    }
}
// Sum of squared differences of the two one-hot rows over the first
// all_words.size() columns; no square root is taken.
double edistance(train_data a, train_data b) {
    const size_t columns = all_words.size();
    double total = 0.0;
    for (size_t col = 0; col < columns; ++col) {
        const int diff = a.onehot[col] - b.onehot[col];
        total += static_cast<double>(diff) * diff;
    }
    return total;
}
// Sum of absolute differences of the two one-hot rows over the first
// all_words.size() columns.
double mdistance(train_data a, train_data b) {
    const size_t columns = all_words.size();
    double total = 0.0;
    for (size_t col = 0; col < columns; ++col) {
        const int diff = a.onehot[col] - b.onehot[col];
        total += (diff < 0) ? -diff : diff;
    }
    return total;
}
// Sort predicate for train_data: true when a's distance is strictly larger
// than b's, which gives descending order of the distance field.
bool cmp(const train_data &a, const train_data &b) {
    return b.distance < a.distance;
}