统计文本中每个单词的序列和出现次数

最新推荐文章于 2021-10-19 08:41:18 发布

原创 · 最新推荐文章于 2021-10-19 08:41:18 发布 · 2.5k 阅读

2 ·

CC 4.0 BY-SA版权

文章标签：

#string #null #iterator #system #insert #delete

编程珠玑专栏收录该内容

6 篇文章

订阅专栏

本文介绍三种统计文本中单词出现频率的方法：使用STL中的map进行高效统计，利用Hash表实现自定义哈希函数进行单词计数，以及通过构建Trie树进行单词存储与检索。

统计文本中出现的不同单词（去重后按字典序输出）

使用STL

/*统计文本中出现的单词的序列*/
#include <iostream>
#include <fstream>
#include <string>
#include <set>
using namespace std;
/*
 * Reads whitespace-separated words from "word.txt" and prints every distinct
 * word once, one per line, in the order kept by the std::set.
 * Returns 0 on success and 1 when the input file cannot be opened.
 */
int main()
{
	ifstream in("word.txt");
	if (!in)
	{
		cerr<<"打开文件错误！"<<endl;
		return 1;
	}

	string str;
	set<string> DistinctWordSet;
	while(in >> str)
	{
		DistinctWordSet.insert(str);// duplicates are ignored by the set
	}
	// Print the collected words.
	for (set<string>::const_iterator it = DistinctWordSet.begin();it != DistinctWordSet.end();++it)
	{
		cout<<*it<<endl;
	}
	system("pause");// runs the shell's "pause" command (Windows console idiom)
	return 0;// a non-zero exit status would signal failure to the caller
}

统计文本中每个单词的出现次数

STL实现

/*统计文本中每个单词的出现次数*/
#include <iostream>
#include <fstream>
#include <algorithm>
#include <map>
#include <string>
using namespace std;
/*
 * Counts how often each word occurs in "word.txt", ignoring letter case, and
 * prints "<word> <count>" lines in the key order of the std::map.
 * Returns 0 on success and 1 when the input file cannot be opened.
 */
int main()
{
	ifstream in("word.txt");// open the input file
	if (in.fail())
	{
		cerr<<"打开文件错误！"<<endl;
		return 1;// report the failure through the exit status
	}

	string str;
	map<string,int> WordCountMap;
	while(in >> str)
	{
		// Fold to lower case. tolower requires a value representable as
		// unsigned char, so each char is converted before the call.
		for (string::size_type i = 0;i < str.size();++i)
		{
			str[i] = static_cast<char>(::tolower(static_cast<unsigned char>(str[i])));
		}
		++WordCountMap[str];// operator[] creates a zero counter on first sight
	}
	in.close();
	// Print every word with its count.
	for (map<string,int>::const_iterator it = WordCountMap.begin();it != WordCountMap.end();++it)
	{
		cout<<it->first<<" "<<it->second<<endl;
	}
	system("pause");// runs the shell's "pause" command (Windows console idiom)
	return 0;
}

Hash实现

/*统计文本中每个单词的出现次数*/
#include <iostream>
#include <assert.h>
#include <string>
#include <fstream>
#include <algorithm>
using namespace std;
const int NHASH = 29989;// number of bucket heads in the hash table
const int MULT = 31;// per-character multiplier used by the string hash

/*
 * One entry of a bucket chain: the word, the number of times it has been
 * seen so far, and a link to the next entry in the same bucket.
 */
class StrNode
{
public:
	string word;
	unsigned int count;
	StrNode* next;
public:
	// Creates an unlinked entry for str with a count of 1.
	// The initializers follow the member declaration order, which is the
	// order in which the members are actually constructed.
	explicit StrNode(const string& str) : word(str),count(1),next(NULL){}
};

class CountStr
{
public:
	CountStr();
public:
	unsigned int HashIndex(string str);
	void InsertWord(string str);
	void InitStr(string FileName);
	void Print();
private:
	StrNode* bin[NHASH];
};

/* Builds an empty table: every bucket head starts out as a null pointer. */
CountStr::CountStr()
{
	// Element-wise assignment stores genuine null pointers and does not
	// depend on an all-zero byte pattern or on a byte-fill routine.
	for (int i = 0;i < NHASH;i++)
	{
		bin[i] = NULL;
	}
}

/*
 * Maps a word to a bucket index in the range [0, NHASH).
 * The hash is accumulated left to right as hash = MULT * hash + character;
 * for "abc" that is (97 * 31 + 98) * 31 + 99, which is then reduced modulo
 * NHASH. The word must not be empty (enforced with assert).
 */
unsigned int CountStr::HashIndex(string str)
{
	int length = str.size();
	assert(length > 0);

	unsigned int hash = 0;
	int pos = 0;
	while (pos < length)
	{
		hash = MULT * hash + str.at(pos);
		++pos;
	}
	return hash % NHASH;
}

void CountStr::InsertWord(string str)
{
	StrNode* p = NULL;
	unsigned int index = HashIndex(str);
	for (StrNode* p = bin[index];p != NULL;p = p->next)
	{
		if (str == p->word)
		{
			p->count++;
			return;
		}
	}
	p = new StrNode(str);
	//使用头插法插入节点
	p->next = bin[index];
	bin[index] = p;
}

/*
 * Reads whitespace-separated words from the file named fileName, folds each
 * one to lower case and records it with InsertWord.
 * If the file cannot be opened an error message is written to cerr and the
 * table is left unchanged.
 */
void CountStr::InitStr(string fileName)
{
	ifstream in(fileName.c_str());
	if (!in)
	{
		cerr<<"打开文件错误！"<<endl;
		return;
	}

	string str;
	while(in >> str)
	{
		// tolower requires a value representable as unsigned char, so each
		// char is converted before the call.
		for (string::size_type i = 0;i < str.size();++i)
		{
			str[i] = static_cast<char>(::tolower(static_cast<unsigned char>(str[i])));
		}
		InsertWord(str);
	}
}

void CountStr::Print()
{
	for (int i = 0;i < NHASH;i++)
	{
		for (StrNode* p = bin[i];p;p = p->next)
		{
			cout<<p->word<<" "<<p->count<<endl;
		}
	}
}

/*
 * Counts the words of "word.txt" with the hash table and prints the result.
 * Returns 0 once the run has finished.
 */
int main()
{
	CountStr countStr;
	countStr.InitStr("word.txt");
	countStr.Print();
	system("pause");// runs the shell's "pause" command (Windows console idiom)
	return 0;// a non-zero exit status would signal failure to the caller
}

trie树实现

#include <assert.h>

#include <algorithm>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
using namespace std;
const int MaxBranchNum = 26;// number of child slots per trie node

/*
 * Trie node. word is NULL until a complete word ends at this node, count is
 * the number of times that word has been inserted, and nextBranch holds one
 * child pointer per branch.
 */
class TrieNode
{
public:
	char* word;
	int count;
	TrieNode* nextBranch[MaxBranchNum];
public:
	// Creates a node with no word, a zero count and no children.
	TrieNode() : word(NULL),count(0)
	{
		// Element-wise assignment stores genuine null pointers and does not
		// depend on an all-zero byte pattern or on a byte-fill routine.
		for (int i = 0;i < MaxBranchNum;i++)
		{
			nextBranch[i] = NULL;
		}
	}
};

/*
 * Trie of lower-case words with per-word occurrence counts.
 * The object owns the node tree rooted at pRoot and frees it in the
 * destructor.
 */
class Trie
{
public:
	Trie();
	~Trie();
	void Insert(const char* str);
	void Print();
private:
	// Not copyable: a copy would share pRoot and the tree would be destroyed
	// twice. Declared but intentionally left undefined.
	Trie(const Trie&);
	Trie& operator=(const Trie&);
private:
	TrieNode* pRoot;
private:
	void Destory(TrieNode* pRoot);
	void Print(TrieNode* pRoot);
};

/* Creates an empty trie consisting of a single root node. */
Trie::Trie()
{
	TrieNode* root = new TrieNode();
	this->pRoot = root;
}

/* Frees the whole node tree, starting from the root. */
Trie::~Trie()
{
	this->Destory(this->pRoot);
}

/*
 * Records one occurrence of the NUL-terminated word str (must not be NULL).
 * Each character selects the branch str[i] - 'a'. If any character falls
 * outside the MaxBranchNum valid branches the word is not recorded; nodes
 * already created for its leading characters stay in the tree without a word.
 * The first time a word is recorded its text is copied into the final node.
 */
void Trie::Insert(const char* str)
{
	assert(NULL != str);
	TrieNode* pLoc = pRoot;
	for (int i = 0;str[i];i++)
	{
		const int index = str[i] - 'a';// extend here if upper case must be kept distinct

		// Valid branch indices are 0 .. MaxBranchNum - 1; anything else would
		// index outside nextBranch, so the word is rejected.
		if(index < 0 || index >= MaxBranchNum)
		{
			return;
		}

		if (NULL == pLoc->nextBranch[index])
		{
			pLoc->nextBranch[index] = new TrieNode();
		}
		pLoc = pLoc->nextBranch[index];
	}

	pLoc->count++;
	if (NULL == pLoc->word)// first occurrence: remember the word itself
	{
		pLoc->word = new char[strlen(str) + 1];
		strcpy(pLoc->word,str);
	}
}

void Trie::Print()
{
	Print(pRoot);
}

/*
 * Prints "<word> <count>" for every word stored in the subtree rooted at
 * pRoot (the parameter, which hides the member of the same name): first the
 * node's own word, if any, then the children in branch order.
 * A NULL subtree prints nothing.
 */
void Trie::Print(TrieNode* pRoot)
{
	if (NULL == pRoot)
	{
		return;
	}
	// A node carries a word only if some inserted word ends here.
	if (NULL != pRoot->word)
	{
		cout<<pRoot->word<<" "<<pRoot->count<<endl;
	}
	// Recurse into every branch.
	for (int i = 0;i < MaxBranchNum;i++)
	{
		Print(pRoot->nextBranch[i]);
	}
}

/*
 * Frees the subtree rooted at pRoot (the parameter, which hides the member
 * of the same name): all children first, then the node's word buffer, then
 * the node itself. A NULL subtree is ignored.
 */
void Trie::Destory(TrieNode* pRoot)
{
	if (NULL != pRoot)
	{
		int branch = 0;
		while (branch < MaxBranchNum)
		{
			Destory(pRoot->nextBranch[branch]);
			++branch;
		}
		delete []pRoot->word;// delete[] on a null pointer does nothing
		delete pRoot;
	}
}


/*
 * Reads whitespace-separated words from "word.txt", folds them to lower case,
 * inserts them into a trie and prints every stored word with its count.
 * Returns 0 on success and 1 when the input file cannot be opened.
 */
int main(int argc, char *argv[])
{
	ifstream in("word.txt");
	if (!in)
	{
		cerr<<"打开文件错误！"<<endl;
		return 1;
	}

	string str;
	Trie t;
	// Feed every word into the trie.
	while(in >> str)
	{
		// tolower requires a value representable as unsigned char, so each
		// char is converted before the call.
		for (string::size_type i = 0;i < str.size();++i)
		{
			str[i] = static_cast<char>(::tolower(static_cast<unsigned char>(str[i])));
		}
		t.Insert(str.c_str());
	}
	// Print the result.
	t.Print();
	system("PAUSE");// runs the shell's "PAUSE" command (Windows console idiom)
	return 0;// a non-zero exit status would signal failure to the caller
}