本人菜鸟,对天池数据写点小理解。天池数据不同于一般的评分数据,有几点比较主要:天池数据存在同一用户对同一物品的多种不同操作,这和SVD的假设相悖,所以数据处理这块非常重要。再结合阿里的业务背景来看,会发现数据存在这样一个内部关系:购买的物品必然经过点击、加入购物车这些步骤,那么这个时候购物车的分析意义有多大?值得商榷。同时,那么多的点击次数是因为对这个商品有购买欲望,还是因为购买之后回来看看价格波动?另外,淘宝的数据还存在这样一个问题:例如我买的小零食可以出现多次购买,但是电脑、冰箱这种大物件,二次购买的几率比较小。而数据经过加密处理,这种情况怎么考虑?等等。其实这篇文章主要还是一贯作风:贴代码,留个纪念。效果不是很好,F1只有6.4,仅当纪念。
#include <algorithm>
#include <cstddef>
#include <cstring>
#include <fstream>
#include <iostream>
#include <map>
#include <string>
#include <vector>

using namespace std;

// Per-(user, item) occurrence counts; keys are always the two-element vector {user, item}.
typedef map<vector<int>, int> CountMap;
// Per-(user, item) scores; same key layout as CountMap.
typedef map<vector<int>, double> ScoreMap;

void load(const string&);
void train(const vector<int>&, const vector<int>&, const vector<int>&, double);
void ComM(const vector<int>&, const vector<int>&, const vector<int>&);
void ComR(const vector<int>&, const vector<int>&, const vector<int>&);
void TopN(const vector<map<vector<int>, double> >&);
map<vector<int>, double> predictScore(int, const vector<int>&, const vector<int>&, int, double);

// Number of user ids that are scored; ids are taken to be 0 .. kUserCount-1.
// This was a literal 884 repeated throughout the original program.
const int kUserCount = 884;

// Weights applied to the scores of the first and second scoring pass.
double W1 = 1;
double W2 = 1;

// Records loaded by load(): parallel vectors of user id, item id and rating code.
vector<int> U, I, R;
// Per-user maxima filled by ComM(), one vector per rating code (0, 1, 2, 3 -> c, v, p, g).
// The Min* vectors are declared but never used by this program.
vector<int> MinRc, MaxRc, MaxRv, MinRv, MaxRp, MinRp, MaxRg, MinRg;
// Per-(user, item) counts filled by ComR(), one map per rating code (0, 1, 2, 3).
map<vector<int>, int> RCui, RVui, RPui, RGui;
// One score map per scored user per pass, in the order train() produced them.
vector<map<vector<int>, double> > result;
// Number of times each scoring pass is repeated.
int step = 1;

// Builds the {user, item} key used by every count/score map.
static vector<int> makeKey(int user, int item) {
    vector<int> key(2);
    key[0] = user;
    key[1] = item;
    return key;
}

// Number of complete records in three parallel vectors (the shortest length).
static size_t recordCount(const vector<int>& User, const vector<int>& Item,
                          const vector<int>& Rating) {
    return std::min(User.size(), std::min(Item.size(), Rating.size()));
}

// Count map belonging to a rating code, or NULL for codes outside 0..3.
static CountMap* countsForRating(int rating) {
    switch (rating) {
        case 0: return &RCui;
        case 1: return &RVui;
        case 2: return &RPui;
        case 3: return &RGui;
        default: return NULL;
    }
}

// Per-user maxima vector belonging to a rating code, or NULL for codes outside 0..3.
static vector<int>* maximaForRating(int rating) {
    switch (rating) {
        case 0: return &MaxRc;
        case 1: return &MaxRv;
        case 2: return &MaxRp;
        case 3: return &MaxRg;
        default: return NULL;
    }
}

// Adds w * count / maxima[index] to scores for every (index, item) entry in counts.
// Nothing is added when index has no usable maximum (out of range or <= 0), because
// the count cannot be normalised; the original divided by zero there and wrote
// inf/NaN into the result file.
static void addNormalised(const CountMap& counts, const vector<int>& maxima,
                          int index, double w, ScoreMap& scores) {
    if (index < 0 || static_cast<size_t>(index) >= maxima.size()) {
        return;
    }
    const int maxCount = maxima[index];
    if (maxCount <= 0) {
        return;
    }
    // Keys are ordered lexicographically, and the one-element key {index} sorts
    // immediately before every {index, item}, so lower_bound lands on the first
    // entry of this user without scanning the whole map.
    CountMap::const_iterator it = counts.lower_bound(vector<int>(1, index));
    for (; it != counts.end() && it->first[0] == index; ++it) {
        scores[it->first] += w * it->second / static_cast<double>(maxCount);
    }
}

// Loads the data files, computes the per-user maxima and the per-(user, item)
// counts, scores every user in two passes and writes all scores to the result file.
int main() {
    const string trains_file = "/home/ja/CADATA/ALI/data/New/trains_tmp.dat";
    load(trains_file);
    ComM(U, I, R);

    // NOTE(review): U/I/R are not cleared here, so the counts of the first pass are
    // taken over trains_tmp.dat AND trains.dat together (the second pass below does
    // clear them). Kept as in the original - confirm whether this is intended.
    const string train_file = "/home/ja/CADATA/ALI/data/New/trains.dat";
    load(train_file);
    ComR(U, I, R);
    for (int pass = 0; pass < step; ++pass) {
        train(U, I, R, W1);
    }

    // Second pass: start from empty records and empty counts; the maxima are reused.
    U.clear();
    I.clear();
    R.clear();
    RCui.clear();
    RVui.clear();
    RPui.clear();
    RGui.clear();

    const string train_file_1 = "/home/ja/CADATA/ALI/data/New/test_s.dat";
    load(train_file_1);
    ComR(U, I, R);
    for (int pass = 0; pass < step; ++pass) {
        train(U, I, R, W2);
    }

    TopN(result);
    return 0;
}

// Writes every (user, item, score) triple in r to the result file, tab separated,
// one triple per line, in the order the maps are stored.
void TopN(const vector<map<vector<int>, double> >& r) {
    ofstream out("/home/ja/CADATA/ALI/data/New/result/result.dat");
    if (!out) {
        cerr << "error opening result file" << '\n';
        return;
    }
    for (size_t i = 0; i < r.size(); ++i) {
        for (ScoreMap::const_iterator it = r[i].begin(); it != r[i].end(); ++it) {
            // '\n' instead of endl: no flush per line, same bytes in the file.
            out << it->first[0] << '\t' << it->first[1] << '\t' << it->second << '\n';
        }
    }
}

// Scores every item the user `index` has a record for.
//
// The score of (index, item) is the sum over the four rating codes of
//     w * count(index, item, code) / max(index, code)
// using the counts from ComR() and the maxima from ComM().
//
// The original merged the four per-code maps through a cascade of "non-matching"
// temporaries, which added the same contribution two or three times for items with
// a code-0 record and dropped all code-1/2/3 contributions for users without any
// code-0 record. Each contribution is now added exactly once.
//
// user, item and rating are accepted for interface compatibility only; the
// computation is keyed by index, as it was in the original.
map<vector<int>, double> predictScore(int user, const vector<int>& item,
                                      const vector<int>& rating, int index, double w) {
    (void)user;
    (void)item;
    (void)rating;

    ScoreMap scores;
    addNormalised(RCui, MaxRc, index, w, scores);
    addNormalised(RVui, MaxRv, index, w, scores);
    addNormalised(RPui, MaxRp, index, w, scores);
    addNormalised(RGui, MaxRg, index, w, scores);
    return scores;
}

// Appends one score map per user id 0 .. kUserCount-1 to the global `result`,
// each computed by predictScore() with weight W.
//
// The original indexed User[i] for i < 884 no matter how many records were loaded
// (out-of-bounds read on short input) although predictScore never used that value;
// the user id itself is passed instead.
void train(const vector<int>& User, const vector<int>& Item,
           const vector<int>& Rating, double W) {
    (void)User;
    for (int user = 0; user < kUserCount; ++user) {
        result.push_back(predictScore(user, Item, Rating, user, W));
    }
}

// Counts, for every rating code 0..3, how often each (user, item) pair occurs and
// accumulates the counts into RCui / RVui / RPui / RGui. Records with any other
// rating code are ignored.
//
// Every record is visited exactly once. The original stopped one record early,
// always booked the last record under code 0 regardless of its rating, and
// underflowed `size() - 1` on empty input.
void ComR(const vector<int>& User, const vector<int>& Item, const vector<int>& Rating) {
    const size_t n = recordCount(User, Item, Rating);
    for (size_t i = 0; i < n; ++i) {
        CountMap* counts = countsForRating(Rating[i]);
        if (counts != NULL) {
            ++(*counts)[makeKey(User[i], Item[i])];
        }
    }
}

// Computes, for every user id 0 .. kUserCount-1 and every rating code 0..3, the
// length of the longest run of consecutive identical (user, item, rating) records
// and stores it in MaxRc / MaxRv / MaxRp / MaxRg (0 when the user has no such
// record). Records with other user ids or rating codes are ignored.
//
// Done in a single pass over the records instead of one full scan per user and
// code. Unlike the original, a run consisting only of the very last record is
// counted, empty input is safe, and the maxima vectors are replaced rather than
// appended to.
void ComM(const vector<int>& User, const vector<int>& Item, const vector<int>& Rating) {
    MaxRc.assign(kUserCount, 0);
    MaxRv.assign(kUserCount, 0);
    MaxRp.assign(kUserCount, 0);
    MaxRg.assign(kUserCount, 0);

    const size_t n = recordCount(User, Item, Rating);
    size_t runStart = 0;
    while (runStart < n) {
        // Extend the run while the next record repeats user, item and rating.
        size_t runEnd = runStart + 1;
        while (runEnd < n && User[runEnd] == User[runStart] &&
               Item[runEnd] == Item[runStart] && Rating[runEnd] == Rating[runStart]) {
            ++runEnd;
        }

        const int runLength = static_cast<int>(runEnd - runStart);
        const int user = User[runStart];
        vector<int>* maxima = maximaForRating(Rating[runStart]);
        if (maxima != NULL && user >= 0 && user < kUserCount &&
            runLength > (*maxima)[user]) {
            (*maxima)[user] = runLength;
        }
        runStart = runEnd;
    }
}

// Appends every "user item rating" triple of whitespace-separated integers found in
// `file` to the global U / I / R vectors. Reading stops at the first token that is
// not an integer. If the file cannot be opened a message is printed and nothing is
// appended.
void load(const string& file) {
    ifstream fin(file.c_str());
    if (!fin) {
        cout << "error for fileName" << endl;
        return;
    }
    int userId, itemId, rating;
    while (fin >> userId >> itemId >> rating) {
        U.push_back(userId);
        I.push_back(itemId);
        R.push_back(rating);
    }
}