Webpage Rating Prediction

The dataset used is webpage data (raw HTML source):

http://download.youkuaiyun.com/download/wz2671/10238915


The overall approach: first extract the words from each page, select features with the information-gain criterion, and then classify the pages with a learning algorithm.
Learn.h
//The process to classify the HTML pages
/*
1. Use a subset of the rated pages to train the algorithm
	and evaluate its effectiveness on the remaining rated pages.
2. From the training set, find the most informative features and
	record the training samples as feature vectors for the learning algorithm.
3. Run the learning algorithm on the training set;
	it builds a representation of the user's preferences.
4. Convert the test data to feature vectors using the features found informative on the training set.
5. Use the learned preferences to determine whether pages in the test set would interest the user.
*/

#include <string>
using namespace std;

class Learn
{
private:
	int subject;	//target subject
	int trainNum;	//number of training samples
	int featureNum = 120; //number of features to select
	int testNum = 0;
	int **sample;	//feature matrix; row: a sample, column: a feature
	static const string path[4];
	int *res;	//predicted labels for the test samples
	int *label;	//true labels for all samples
	int dist(int*, int*);	//L1 distance between two feature vectors

public:
	Learn(int k1, int);
	~Learn();
	void analysis();
	void nN();			//nearest-neighbor classifier
	void naiveBayes();	//naive Bayes classifier
	double Acc();		//accuracy on the test set
};
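
A minimal driver sketch for the class above (hypothetical, not part of the original project; the split size 60 and subject index 0 are placeholder values):

#include "stdafx.h"
#include "Learn.h"
#include <iostream>

int main()
{
	Learn learner(60, 0);   // 60 training samples, subject 0 = "Bands"
	learner.analysis();     // read pages and build feature vectors
	learner.nN();           // classify the test pages by nearest neighbor
	std::cout << "accuracy: " << learner.Acc() << std::endl;
	return 0;
}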
Learn.cpp
#include "stdafx.h"
#include "Learn.h"
#include "PageAnalysis.h"
#include <iostream>
using namespace std;

const string Learn::path[4] = { "Bands", "BioMedical", "Goats", "Sheep" };

Learn::Learn(int k1, int s)
{
	this->subject = s;
	this->trainNum = k1;
	this->sample = NULL;
	this->res = this->label = NULL;
}

Learn::~Learn()
{
	if (sample)
	{
		// free each row before the row-pointer array itself
		for (int i = 0; i < trainNum + testNum; i++) delete[] sample[i];
		delete[] sample;
	}
	if (res) delete[] res;
	if (label) delete[] label;
}

void Learn::analysis()
{
	// data processing
	PageAnalysis *pa = new PageAnalysis("SW\\"+ path[this->subject], this->trainNum, featureNum);
	pa->getFile();
	label = new int[pa->totalSam()];
	testNum = pa->totalSam() - trainNum;
	res = new int[testNum];
	sample = new int *[pa->totalSam()];
	for (int i = 0; i < pa->totalSam(); i++)
	{
		sample[i] = new int[featureNum];
	}
	// convert the pages into feature vectors
	pa->transform(sample);
	pa->getRank(label);

	delete pa;
}

// L1 (Manhattan) distance; on 0/1 feature vectors this equals the Hamming distance
int Learn::dist(int *a, int *b)
{
	int sum = 0;
	for (int i = 0; i < featureNum; i++)
	{
		sum += abs(a[i] - b[i]);
	}
	return sum;
}

// classify the test samples with the nearest-neighbor rule
void Learn::nN()
{
	int **dis = new int *[testNum];
	for (int i = 0; i < testNum; i++)
	{
		dis[i] = new int[trainNum];
	}
	// compute the distance from each test sample to every training sample
	// and take the label of the nearest one
	for (int i = trainNum; i < trainNum + testNum; i++)
	{
		int min = 10000, minl = -1; // 10000 exceeds the largest possible distance (featureNum)
		for (int j = 0; j < trainNum; j++)
		{
			dis[i-trainNum][j] = dist(sample[i], sample[j]);
			if (dis[i-trainNum][j] < min)
			{
				min = dis[i-trainNum][j];
				minl = label[j];
			}
		}
		res[i-trainNum] = minl;
	}

	for (int i = 0; i < testNum; i++) delete[] dis[i];
	delete[] dis;
}

// return the accuracy on the test set
double Learn::Acc()
{
	int acc = 0;
	for (int i = trainNum; i < trainNum+testNum; i++)
	{
		if (label[i] == res[i-trainNum]) acc++;
	}
	return acc*1.0 / testNum;
}

// naive Bayes classifier
void Learn::naiveBayes()
{
	// prior probabilities; cnt counts the positive training samples
	double proP, proN;
	int cnt = 0;
	for (int i = 0; i < trainNum; i++)
	{
		if (label[i]) cnt++;
	}
	proP = cnt*1.0 / trainNum;
	proN = (trainNum - cnt)*1.0 / trainNum;
	// estimate a conditional probability for each feature
	// p[0][j]: number of negative training samples containing word j;
	// p[1][j]: number of positive training samples containing word j
	int *p[2] = { new int[featureNum], new int[featureNum] };
	// memset(p, 0, sizeof(p)) would only zero the two pointers, not the arrays
	// they point to, so initialize element by element
	for (int i = 0; i < 2; i++)
	{
		for (int j = 0; j < featureNum; j++)
		{
			p[i][j] = 0;
		}
	}
	for (int i = 0; i < trainNum; i++)
	{
		for (int j = 0; j < featureNum; j++)
		{
			if(sample[i][j]) p[label[i]][j]++;
		}
	}
	// predict each test sample
	// p0: score for the negative class, p1: score for the positive class

	for (int i = 0; i < testNum; i++)
	{
		double p0 = proN, p1 = proP;
		for (int j = 0; j < featureNum; j++)
		{
			// multiply by P(word j present | class) if the sample contains it,
			// otherwise by P(word j absent | class)
			p0 *= sample[trainNum + i][j] ? p[0][j]*1.0 / (trainNum - cnt)
				: (trainNum - cnt - p[0][j])*1.0 / (trainNum - cnt);
			p1 *= sample[trainNum + i][j] ? p[1][j]*1.0 / cnt
				: (cnt - p[1][j])*1.0 / cnt;
		}
		// pick the class with the larger score
		res[i] = p1 > p0 ? 1 : 0;
	}

	delete[] p[0];
	delete[] p[1];
}
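
One caveat with the estimates above: if a word never occurs in one class of the training set, the corresponding factor is zero and wipes out the whole product. A common remedy is Laplace (add-one) smoothing; a minimal sketch, using a hypothetical helper that is not part of the original code:

// Laplace-smoothed estimate of P(word present/absent | class);
// inClass = number of class samples containing the word, classTotal = class size.
// Adding 1 to the numerator and 2 to the denominator keeps every factor positive.
static double smoothedProb(int inClass, int classTotal, bool present)
{
	int hits = present ? inClass : classTotal - inClass;
	return (hits + 1.0) / (classTotal + 2.0);
}

Inside the prediction loop this would replace the raw ratios, e.g. p0 *= smoothedProb(p[0][j], trainNum - cnt, sample[trainNum + i][j] != 0);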
PageAnalysis.h
//Turn a webpage into a feature vector
/*
step 1: collect the words appearing in the HTML
step 2: count per-class occurrences
step 3: compute information gain and select the features
step 4: build the feature matrix
*/
#include <string>
#include <iostream>
#include <vector>
#include <map>
using namespace std;

class PageAnalysis
{
private:
	// number of training samples and test samples; samples are assumed to be in file order
	int trainSampleNum, testSampleNum;
	// desired number of features
	int featureNum;
	// directory holding the webpages
	string path; 
	// names of the webpages under that directory
	vector<string> fileName;
	// user ratings; the index file uses hot/medium/cold, stored here as binary: cold = 0, hot/medium = 1
	vector<int> rank;
	// words extracted from each page; one vector of words per page
	vector<string> *pageWord;
	// features kept after selection
	vector<string> feature;

	// a word together with its information gain
	struct gain
	{
		string word;
		double ent;
	}*wordGain;

	double getTotalEnt();
	inline double Ent(double , double);
	static bool cmp(gain a, gain b);
	void wordSeg();
	void featureExt();
	
public:

	PageAnalysis(string path, int tsn, int fn);
	~PageAnalysis();

	void getFile();
	// n receives the final feature matrix
	void transform(int **&n);
	int totalSam(){ return (int)fileName.size(); }
	void getRank(int *n)
	{
		for (int i = 0; i < (int)rank.size(); i++)
			n[i] = rank[i];
	}
};

PageAnalysis.cpp
#include "stdafx.h"
#include "PageAnalysis.h"
#include <io.h>
#include <fstream>
#include <algorithm>
#include <cmath>



PageAnalysis::PageAnalysis(string path, int tsn, int fn)
{
	this->path = path;
	this->featureNum = fn;
	this->trainSampleNum = tsn;
	// null the pointers so the destructor can test them safely
	this->pageWord = NULL;
	this->wordGain = NULL;
}

PageAnalysis::~PageAnalysis()
{
	if (pageWord) delete[] pageWord;
	if (wordGain) delete[] wordGain;
}

// read file names and user ratings from the index file
void PageAnalysis::getFile()
{
	// (an earlier version enumerated the directory with _findfirst/_findnext;
	// the dataset ships with an index file, so read names and ratings from it directly)
	ifstream fin(this->path + "\\index");
	const int LINE_LENGTH = 500;
	char str[LINE_LENGTH];
	while (fin.getline(str, LINE_LENGTH))
	{
		// each line holds "name|rating|..."; extract the first two fields
		char tmp[80];
		int j = 0;
		if (str[0] == '\0' || str[0] == ' ' || str[0] == '\t') continue;
		for (int i = 0; i < 2; i++)
		{
			int k = 0;
			while (str[j] != '|' && str[j] != '\0')
			{
				tmp[k++] = str[j++];
			}
			tmp[k] = '\0';
			if (str[j] == '|') j++;
			if (i)
			{
				// binary rating: cold -> 0, hot/medium -> 1
				if (!strcmp(tmp, "cold")) rank.push_back(0);
				else rank.push_back(1);
			}
			else this->fileName.push_back(tmp);
		}
	}
	fin.close();
	testSampleNum = fileName.size() - trainSampleNum;
	// rearrange the training block: swap in samples from the test region so that
	// the training set holds roughly equal numbers of positive and negative examples
	int pN = 0, nN = 0, cN = 0;
	for (int i = 0; i < trainSampleNum; i++)
	{
		if (rank[i] && pN < trainSampleNum / 2) { pN++; continue; }
		if (!rank[i] && nN < trainSampleNum / 2) { nN++; continue; }
		for (int j = trainSampleNum + cN; j < (int)fileName.size(); j++)
		{
			cN++;
			if (rank[i] == rank[j]) continue;
			swap(fileName[i], fileName[j]);
			swap(rank[i], rank[j]);
			break;
		}
	}
}
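
For reference, getFile reads two '|'-separated fields per line of the index file, the page name and its rating; anything after the second '|' is ignored. An illustrative line (hypothetical values, not taken from the actual dataset):

music_review.html|hot|http://example.com/review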

// preprocess the page data: extract the visible text from the HTML
void PageAnalysis::wordSeg()
{
	// one vector of words per page
	pageWord = new vector<string>[fileName.size()];
	const int LINE_LENGTH = 10000;
	char str[LINE_LENGTH];
	// collect the text between tags, skipping everything inside <...>
	for (int i = 0; i < trainSampleNum+testSampleNum; i++)
	{
		ifstream fin(this->path + "\\" + fileName[i]);

		bool end = true; // true while the scanner is outside a <...> tag
		while (fin.getline(str, LINE_LENGTH))
		{
			char word[80]; // buffer for the current word
			int k = 0;
			// some pages pack long stretches of text into a single line,
			// hence the generous line buffer above
			int len = (int)strlen(str);
			for (int j = 0; j < len; j++)
			{
				char c = str[j];
				if (!end && c != '>') continue;
				else if (c == '<')
				{
					end = false;
				}
				else if (c == '>')
				{
					end = true;
					continue;
				}


				// letter or apostrophe: append it to the current word
				if (c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c == '\'')
				{
					word[k++] = c;
					// end of line: flush the word
					if (j == len - 1)
					{
						word[k] = '\0';
						pageWord[i].push_back(_strupr(word));
						k = 0;
					}
					continue;
				}
				// delimiter after a word: flush the accumulated word
				else if (k > 0)
				{
					word[k] = '\0';
					pageWord[i].push_back(_strupr(word));
					k = 0;
					continue;
				}
			}
		}
		fin.close();

		// sort the page's words and drop duplicates
		sort(pageWord[i].begin(), pageWord[i].end());
		pageWord[i].erase(unique(pageWord[i].begin(), pageWord[i].end()), pageWord[i].end());
		// debug output: dump the first page's word list
		if (i == 0)
		{
			ofstream out("1.txt");
			if (out.is_open())
			{
				for (int j = 0; j < (int)pageWord[i].size(); j++)
				{
					out << pageWord[i][j] + " ";
				}
			}
			out.close();
		}
	}
}

// order words by information gain, descending
bool PageAnalysis::cmp(gain a, gain b)
{
	return a.ent>b.ent;
}

// binary entropy; with only two classes it is computed directly
inline double PageAnalysis::Ent(double p1, double p2)
{
	if (p1 > 1e-6)
	{
		if (p2 > 1e-6) return -(p1*(log2(p1)) + p2*log2(p2));
		else return -p1*(log2(p1));
	}
	else
	{
		if (p2 > 1e-6) return -p2*log2(p2);
	}
	return 0;
}
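// (e.g. Ent(0.5, 0.5) = 1, the maximum for two classes; Ent(1.0, 0.0) = 0)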

// feature selection
/*
Idea:
compute the information gain of every word over the training set,
then keep the k words with the largest gain
(a decision tree would go on splitting the resulting branches; that is unnecessary here)
*/
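// For reference, the quantity computed below is the standard information gain:
//   Gain(D, w) = Ent(D) - (|D_w|/|D|)*Ent(D_w) - (|D\D_w|/|D|)*Ent(D\D_w)
// where D is the training set, D_w the training pages containing word w, and
// Ent(S) is the binary entropy of the positive/negative split within S.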
void PageAnalysis::featureExt()
{
	// total entropy of the training set, i.e. entropy of the positive/negative split
	int positive = 0, negative = 0;
	double totalEnt = 0;
	for (int i = 0; i < trainSampleNum; i++)
	{
		if (rank[i]) positive++;
		else negative++;
	}
	totalEnt = Ent(positive*1.0 / trainSampleNum, negative*1.0 / trainSampleNum);

	// map from each word to a packed count:
	// high 16 bits = negative pages containing it, low 16 bits = positive pages
	map<string, int> word;
	// iterator for lookups into the map
	map<string, int>::iterator iter;
	int label = 0;
	
	for (int i = 0; i < trainSampleNum; i++)
	{
		label = rank[i];
		for (int j = 0; j < (int)pageWord[i].size(); j++)
		{
			iter = word.find(pageWord[i][j]);
			// word already seen: bump the counter for this page's class
			if (iter != word.end())
			{
				// one 32-bit int: high 16 bits count negatives, low 16 bits positives
				if (label) iter->second++;
				else iter->second += 1 << 16;
			}
			// first occurrence: insert with an initial count of one
			else
			{
				word.insert(pair<string, int>(pageWord[i][j], label ? 1 : 1 << 16));
			}
		}
	}
	// build an array holding each word and its information gain
	wordGain = new gain[word.size()];
	int i = 0;
	for (iter = word.begin(); iter != word.end(); iter++)
	{
		int pNum = iter->second & 0xFFFF;   // positive pages containing the word
		int nNum = iter->second >> 16;      // negative pages containing the word
		int sum = pNum + nNum;
		int rest = trainSampleNum - sum;    // training pages not containing the word
		wordGain[i].word = iter->first;
		// gain = total entropy minus the weighted entropy after splitting on the word
		double remainder = (sum*1.0 / trainSampleNum) * Ent(pNum*1.0 / sum, nNum*1.0 / sum);
		if (rest > 0)
			remainder += (rest*1.0 / trainSampleNum)
				* Ent((positive - pNum)*1.0 / rest, (negative - nNum)*1.0 / rest);
		wordGain[i++].ent = totalEnt - remainder;
	}

	sort(wordGain, wordGain + word.size(), cmp);
	// keep the featureNum words with the largest gain
	for (int i = 0; i < featureNum; i++)
	{
		feature.push_back(wordGain[i].word);
	}
}

// represent each sample as a feature vector: entry j is 1 if the page contains
// feature word j and 0 otherwise; n receives the result for the caller
void PageAnalysis::transform(int ** &n)
{
	wordSeg();
	featureExt();
	// each page's word list is sorted and deduplicated, so a binary search
	// decides whether the page contains each selected feature
	for (int i = 0; i < trainSampleNum + testSampleNum; i++)
	{
		for (int j = 0; j < featureNum; j++)
		{
			n[i][j] = binary_search(pageWord[i].begin(), pageWord[i].end(), feature[j]) ? 1 : 0;
		}
	}
}

