统计文章中的单词的词频,可以使用C++中Map来实现。
1.统计从标准输入输入的单词的词频。
#include <iostream>
#include <map>
#include <string>
using namespace std;
/**
 * Reads whitespace-separated words from standard input and prints how many
 * times each one was seen, one "word : count" pair per line, in the map's
 * iteration order.
 *
 * Reading stops at end of input or at the first word that begins with 'y';
 * that terminating word is not counted.
 */
int main(int argc, char *argv[]) {
    typedef std::map<std::string, int> WordCount;
    WordCount counter;

    std::string word;
    // `cin >> word` never yields an empty string on success, so word[0] is valid.
    while (std::cin >> word && word[0] != 'y') {
        ++counter[word];
    }

    WordCount::const_iterator entry = counter.begin();
    while (entry != counter.end()) {
        std::cout << entry->first << " : " << entry->second << std::endl;
        ++entry;
    }
}
2. 统计一篇文章中出现次数最少和最多的单词以及其词频。
#include <iostream>
#include <fstream>
#include <map>
#include <string>
#include <cctype>
#include <vector>
using namespace std;
/**
 * Tells whether `c` is one of the word separators recognised by this program:
 * comma, full stop, space, tab, line feed or carriage return.
 *
 * @param c character code to classify
 * @return true if `c` is a separator, false otherwise
 */
bool isSeperator(int c) {
    switch (c) {
        case ',':
        case '.':
        case ' ':
        case '\t':
        case '\n':
        case '\r':
            return true;
        default:
            return false;
    }
}
/**
 * Writes every element of `words` to standard output, one per line.
 *
 * @param words the words to print; the vector is only read
 */
void printWords(std::vector<std::string> &words) {
    for (std::vector<std::string>::size_type idx = 0; idx < words.size(); ++idx) {
        std::cout << words[idx] << std::endl;
    }
}
/**
 * Counts how often each word occurs in a text file and reports the least and
 * most frequent words.
 *
 * A word is a maximal run of alphanumeric characters; every other character
 * acts as a separator. The input path is hard-coded below.
 *
 * Two reports are printed:
 *   1. the first word (in map order) with the highest / lowest count;
 *   2. every word sharing the highest / lowest count.
 *
 * @return 0 on success (including an input with no words), 1 if the input
 *         file cannot be opened
 */
int main(int argc, char *argv[]) {
    typedef std::map<std::string, int> WordCount;
    WordCount counter;

    std::ifstream in("D:\\huawei\\Test1\\Debug\\test.txt");
    if (!in) {
        std::cerr << "cannot open input file" << std::endl;
        return 1;
    }

    // Accumulate the current word in a std::string so that arbitrarily long
    // words cannot overflow a fixed-size buffer.
    std::string word;
    int c;
    while ((c = in.get()) != EOF) {
        if (std::isalnum(c)) {
            word += static_cast<char>(c);
        } else if (!word.empty()) {
            ++counter[word];
            word.clear();
        }
    }
    // The file may end without a trailing separator; count the pending word.
    if (!word.empty()) {
        ++counter[word];
    }
    in.close();

    // Everything below dereferences counter.begin(), so bail out when empty.
    if (counter.empty()) {
        std::cout << "no words found" << std::endl;
        return 0;
    }

    WordCount::const_iterator it = counter.begin();
    std::string maxWord = it->first;
    int max = it->second;
    std::string minWord = it->first;
    int min = it->second;
    std::cout << "statistics for first max , min frequecy word" << std::endl;
    for (++it; it != counter.end(); ++it) {
        // Strict comparisons keep the first word seen for each extreme.
        if (it->second > max) {
            maxWord = it->first;
            max = it->second;
        } else if (it->second < min) {
            minWord = it->first;
            min = it->second;
        }
    }
    std::cout << " max frequece word: " << maxWord << ", frequency: " << max << std::endl;
    std::cout << " min frequece word: " << minWord << ", frequency: " << min << std::endl;

    std::cout << "statistics for all max , min frequecy word" << std::endl;
    std::vector<std::string> maxWords;
    std::vector<std::string> minWords;
    it = counter.begin();
    maxWords.push_back(it->first);
    max = it->second;
    minWords.push_back(it->first);
    min = it->second;
    for (++it; it != counter.end(); ++it) {
        // A new extreme discards the words collected so far; a tie joins them.
        if (it->second > max) {
            maxWords.clear();
            maxWords.push_back(it->first);
            max = it->second;
        } else if (it->second == max) {
            maxWords.push_back(it->first);
        }
        if (it->second < min) {
            minWords.clear();
            minWords.push_back(it->first);
            min = it->second;
        } else if (it->second == min) {
            minWords.push_back(it->first);
        }
    }

    std::cout << std::endl << std::endl;
    std::cout << "the max frequency word(s) in the text is(are) : " << std::endl;
    printWords(maxWords);
    std::cout << "their frenqency is " << max << std::endl;
    std::cout << std::endl << std::endl;
    std::cout << "the min frequency word(s) in the text is(are) : " << std::endl;
    printWords(minWords);
    std::cout << "their frenqency is " << min << std::endl;
    return 0;
}
总之,都是基于“键 → 计数”映射的计数思想(注意:std::map 底层是有序的红黑树而非哈希表;若需要哈希表可使用 std::unordered_map)。
3.map统计string的频率:
/* 请在这里实现下列函数, c/c++语法不限, 最后需要保证程序编译连接通过, 并生成test.exe文件. */
/* 相关宏定义及函数声明见'func.h'头文件 */
#include <cstdio>
#include <cstdlib>
#include <cassert>
#include <cstring>
#include <memory>
#include <string>
#include <vector>
#include <map>
using namespace std;
/**
 * Appends every non-empty suffix of `str` to `v`, longest first.
 *
 * For "abc" the strings "abc", "bc" and "c" are appended. Existing elements
 * of `v` are left untouched.
 *
 * @param str NUL-terminated input; must not be NULL (checked with assert)
 * @param v   vector that receives the suffixes
 */
void getSubStrings(const char *str, std::vector<std::string> &v) {
    assert(str != NULL);
    for (const char *suffix = str; *suffix != '\0'; ++suffix) {
        v.push_back(std::string(suffix));
    }
}
/* Implement the following function as required. */
/**
 * Finds the longest substring that occurs in every one of the `n` input
 * strings.
 *
 * Candidates are taken from the shortest input (any common substring must
 * lie inside it) and tried from the longest length downwards, so the search
 * stops at the first length that produces a match. When several common
 * substrings share that length, the lexicographically smallest one is chosen.
 *
 * @param pIn array of `n` NUL-terminated strings; no element may be NULL
 *            (checked with assert)
 * @param n   number of strings in `pIn`
 * @return a malloc-allocated, NUL-terminated copy of the result that the
 *         caller must release with free(); an empty string when there is no
 *         common substring or `n <= 0`; NULL if the allocation fails
 */
char* findSameSubStr(const char *pIn[], int n)
{
    typedef std::string::size_type size_type;
    std::string best;

    if (pIn != NULL && n > 0) {
        std::vector<std::string> texts;
        texts.reserve(static_cast<std::vector<std::string>::size_type>(n));
        for (int i = 0; i < n; ++i) {
            assert(pIn[i] != NULL);
            texts.push_back(std::string(pIn[i]));
        }

        // Pick the shortest input as the source of candidate substrings.
        std::vector<std::string>::size_type shortest = 0;
        for (std::vector<std::string>::size_type k = 1; k < texts.size(); ++k) {
            if (texts[k].size() < texts[shortest].size()) {
                shortest = k;
            }
        }
        const std::string &base = texts[shortest];

        for (size_type len = base.size(); len > 0 && best.empty(); --len) {
            for (size_type pos = 0; pos + len <= base.size(); ++pos) {
                const std::string candidate = base.substr(pos, len);
                // Skip candidates that cannot improve the tie-break.
                if (!best.empty() && !(candidate < best)) {
                    continue;
                }
                bool inAll = true;
                for (std::vector<std::string>::size_type k = 0;
                     inAll && k < texts.size(); ++k) {
                    inAll = texts[k].find(candidate) != std::string::npos;
                }
                if (inAll) {
                    best = candidate;
                }
            }
        }
    }

    char *commStr = static_cast<char *>(malloc(best.size() + 1));
    if (commStr == NULL) {
        return NULL;
    }
    // Copy the characters together with the terminating NUL.
    memcpy(commStr, best.c_str(), best.size() + 1);
    return commStr;
}
/* mian函数已经隐藏,这里保留给用户的测试入口,在这里测试你的实现函数,可以调用printf打印输出*/
/* 当前你可以使用其他方法测试,只要保证最终程序能正确执行即可 */
/* 该函数实现可以任意修改,但是不要改变函数原型。一定要保证编译运行不受影响*/
void main()
{
/* TODO: 请测试时改变改用例 */
const char * strs[] = {
"what is local bus?",
"Name some local bus.",
"local bus is high speed I/O bus close to the processor.",
};
/* TODO: 调用被测函数 */
char* commstr = findSameSubStr(strs, sizeof(strs) / sizeof(strs[0]));
/* TODO: 执行完成后可比较是否是你认为正确的值 */
if(commstr){
printf("%s\n", commstr);
free(commstr);
}
}
4.统计单词出现的行:
#include <cctype>
#include <iostream>
#include <iterator>
#include <map>
#include <string>
#include <vector>
using namespace std;
// Breaks a string into its whitespace-separated words.
std::vector<std::string> split(const std::string &s);

// Reads `in` line by line and maps every word to the numbers of the lines it
// was found on. `find_words` extracts the words of one line; it defaults to
// split.
std::map<std::string, std::vector<int> > xref(
    std::istream &in,
    std::vector<std::string> (*find_words)(const std::string &) = split);
/**
 * Builds a cross-reference of the words read from standard input and prints,
 * for every word, the line numbers recorded for it.
 *
 * The `#if 0` section is a disabled alternative that only counts how often
 * each word occurs.
 */
int main()
{
#if 0
    // Disabled: count each word read from standard input ...
    std::map<std::string, int> counters;
    std::string word;
    while (std::cin >> word) {
        ++counters[word];
    }
    // ... and print every word with its count.
    std::map<std::string, int>::const_iterator c = counters.begin();
    for (; c != counters.end(); ++c) {
        std::cout << c->first << "\t" << c->second << std::endl;
    }
#endif
#if 1
    typedef std::map<std::string, std::vector<int> > LineIndex;

    // xref falls back to split for finding the words of a line.
    const LineIndex lines_by_word = xref(std::cin);

    for (LineIndex::const_iterator entry = lines_by_word.begin();
         entry != lines_by_word.end(); ++entry) {
        std::cout << "word: " << entry->first << " :occurs on line(s) : ";

        // The first line number is printed bare, the rest are comma-separated.
        const std::vector<int> &lines = entry->second;
        std::cout << lines[0];
        for (std::vector<int>::size_type k = 1; k < lines.size(); ++k) {
            std::cout << ", " << lines[k];
        }
        std::cout << ".";

        // Finish the line so the next word starts on a new one.
        std::cout << std::endl;
    }
#endif
    return 0;
}
/**
 * Breaks `s` into its whitespace-separated words.
 *
 * Leading, trailing and repeated whitespace is ignored, so an empty or
 * all-blank string produces an empty vector.
 *
 * Characters are converted to unsigned char before being classified because
 * passing a negative char (for example a byte of UTF-8 text) to isspace is
 * undefined behaviour.
 *
 * @param s the text to split
 * @return the words of `s` in order of appearance
 */
std::vector<std::string> split(const std::string &s) {
    typedef std::string::size_type string_size;

    std::vector<std::string> words;
    const string_size size = s.size();
    string_size i = 0;

    // invariant: characters in [0, i) have already been processed
    while (i != size) {
        // skip the blanks in front of the next word
        while (i != size && std::isspace(static_cast<unsigned char>(s[i]))) {
            ++i;
        }

        // advance j to one past the end of the word that starts at i
        string_size j = i;
        while (j != size && !std::isspace(static_cast<unsigned char>(s[j]))) {
            ++j;
        }

        // i == j only happens when the end of the string was reached
        if (i != j) {
            words.push_back(s.substr(i, j - i));
            i = j;
        }
    }
    return words;
}
/**
 * Finds all the lines that refer to each word in the input.
 *
 * Lines are numbered from 1. A word that appears several times on the same
 * line has that line number recorded only once.
 *
 * @param in         stream to read, one line at a time, until getline fails
 * @param find_words function that extracts the words of a single line
 * @return a map from each word to the ascending list of line numbers on
 *         which it occurs
 */
std::map<std::string, std::vector<int> > xref(
        std::istream &in,
        std::vector<std::string> find_words(const std::string &)) {
    std::map<std::string, std::vector<int> > ret;
    std::string line;
    int line_number = 0;

    while (std::getline(in, line)) {
        ++line_number;

        const std::vector<std::string> words = find_words(line);
        for (std::vector<std::string>::const_iterator w = words.begin();
             w != words.end(); ++w) {
            std::vector<int> &lines = ret[*w];
            // Line numbers only grow, so a repeat on the current line can
            // only be the most recent entry.
            if (lines.empty() || lines.back() != line_number) {
                lines.push_back(line_number);
            }
        }
    }
    return ret;
}
5.url相关:
#include <iostream>
#include <cctype>
#include <string>
#include <vector>
#include <iterator>
#include <algorithm>
#include <numeric>
using namespace std;
// Splitting a string into words
std::vector<std::string> split_version1(const std::string &str);
bool space(char c);
bool not_space(char c);
std::vector<std::string> split(const std::string &s);

// Palindromes
bool is_palindrome(const std::string &s);

// Finding URLs
bool not_url_char(char c);
std::string::const_iterator url_begin(std::string::const_iterator b,
                                      std::string::const_iterator e);
std::string::const_iterator url_end(std::string::const_iterator b,
                                    std::string::const_iterator e);
std::vector<std::string> find_urls(const std::string &s);
/**
 * For every line read from standard input: prints its words one per line,
 * then the words concatenated without separators, then every URL found in
 * the line.
 *
 * The `#if 0` section is a disabled experiment with std::copy and insert
 * iterators.
 */
int main()
{
#if 1
    std::string line;
    while (std::getline(std::cin, line)) {
        // one word per output line
        const std::vector<std::string> words = split(line);
        for (std::vector<std::string>::const_iterator w = words.begin();
             w != words.end(); ++w) {
            std::cout << *w << std::endl;
        }
        std::cout << std::endl << std::endl;

        // all words glued together
        std::cout << std::accumulate(words.begin(), words.end(), std::string(""))
                  << std::endl;

        // the URLs contained in the line
        const std::vector<std::string> urls = find_urls(line);
        std::cout << "urls in such string: " << std::endl;
        for (std::vector<std::string>::size_type k = 0; k != urls.size(); ++k) {
            std::cout << urls[k] << std::endl;
        }
    }
#endif
#if 0
    std::vector<int> u(10, 100);
    std::vector<int> vec;
    std::ostream_iterator<int, char> out(std::cout, " ");
    std::copy(u.begin(), u.end(), out);
    std::cout << std::endl;
    std::copy(u.begin(), u.end(), std::back_inserter(vec));
    std::copy(vec.begin(), vec.end(), out);
    std::cout << std::endl;
    std::copy(u.begin(), u.end(), std::back_insert_iterator< std::vector<int> >(vec));
    std::copy(vec.begin(), vec.end(), out);
    std::cout << std::endl;
    /*
    typedef vector<int>::const_iterator iter;
    for(iter i = vec.begin(); i < vec.end(); ++i) {
    cout << *i << " ";
    }
    cout << endl;
    */
#endif
    return 0;
}
/**
 * Index-based splitter: breaks `s` into its whitespace-separated words.
 *
 * Leading, trailing and repeated whitespace is ignored, so an empty or
 * all-blank string produces an empty vector.
 *
 * Characters are converted to unsigned char before being classified because
 * passing a negative char (for example a byte of UTF-8 text) to isspace is
 * undefined behaviour.
 *
 * @param s the text to split
 * @return the words of `s` in order of appearance
 */
std::vector<std::string> split_version1(const std::string &s) {
    typedef std::string::size_type string_size;

    std::vector<std::string> words;
    const string_size size = s.size();
    string_size i = 0;

    // invariant: characters in [0, i) have already been processed
    while (i != size) {
        // skip the blanks in front of the next word
        while (i != size && std::isspace(static_cast<unsigned char>(s[i]))) {
            ++i;
        }

        // advance j to one past the end of the word that starts at i
        string_size j = i;
        while (j != size && !std::isspace(static_cast<unsigned char>(s[j]))) {
            ++j;
        }

        // i == j only happens when the end of the string was reached
        if (i != j) {
            words.push_back(s.substr(i, j - i));
            i = j;
        }
    }
    return words;
}
// True if the argument is whitespace, false otherwise.
// The cast to unsigned char is required: calling isspace with a negative
// char value is undefined behaviour.
bool space(char c) {
    return std::isspace(static_cast<unsigned char>(c)) != 0;
}
// False if the argument is whitespace, true otherwise.
// The cast to unsigned char is required: calling isspace with a negative
// char value is undefined behaviour.
bool not_space(char c) {
    return std::isspace(static_cast<unsigned char>(c)) == 0;
}
/**
 * Iterator-based splitter: breaks `s` into its whitespace-separated words,
 * using the not_space / space predicates to find where each word starts and
 * ends.
 *
 * @param s the text to split
 * @return the words of `s` in order of appearance
 */
std::vector<std::string> split(const std::string &s) {
    typedef std::string::const_iterator iter;

    std::vector<std::string> words;
    const iter last = s.end();

    // word_begin always rests on the first character of the next word, or on
    // `last` when no further word exists.
    iter word_begin = std::find_if(s.begin(), last, not_space);
    while (word_begin != last) {
        const iter word_end = std::find_if(word_begin, last, space);
        words.push_back(std::string(word_begin, word_end));
        word_begin = std::find_if(word_end, last, not_space);
    }
    return words;
}
// Palindromes
// Returns true when `s` reads the same forwards and backwards; the empty
// string counts as a palindrome.
bool is_palindrome(const std::string &s) {
    const std::string reversed(s.rbegin(), s.rend());
    return s == reversed;
}
// Finding URLs
/**
 * Tells whether `c` is a character that cannot appear in a URL.
 *
 * Alphanumerics and the punctuation listed in `url_ch` are accepted as URL
 * characters; everything else is rejected. The character is converted to
 * unsigned char before classification because calling isalnum with a
 * negative char value is undefined behaviour.
 *
 * @param c character to classify
 * @return true if `c` is NOT a URL character, false if it is
 */
bool not_url_char(char c) {
    // characters, in addition to alphanumerics, that can appear in a URL
    static const std::string url_ch = "~;/?:@=&$-_.+!*%'(),";

    if (std::isalnum(static_cast<unsigned char>(c))) {
        return false;
    }
    return url_ch.find(c) == std::string::npos;
}
/**
 * Finds the start of the first URL in [b, e).
 *
 * A URL start is a run of one or more letters (the protocol name) directly
 * followed by "://" and then by at least one character accepted by
 * not_url_char.
 *
 * @param b beginning of the range to search
 * @param e end of the range to search
 * @return iterator to the first letter of the protocol name, or `e` when the
 *         range contains no URL
 */
std::string::const_iterator url_begin(std::string::const_iterator b,
                                      std::string::const_iterator e) {
    static const std::string sep = "://";
    typedef std::string::const_iterator iter;

    // i marks where the separator was found
    iter i = b;
    while ((i = std::search(i, e, sep.begin(), sep.end())) != e) {
        // the separator must not sit at the very beginning or end of the range
        if (i != b && i + sep.size() != e) {
            // walk back over the letters of the protocol name;
            // isalpha needs an unsigned char value, a negative char is undefined
            iter begin = i;
            while (begin != b &&
                   std::isalpha(static_cast<unsigned char>(begin[-1]))) {
                --begin;
            }
            // need at least one letter before and one URL character after
            if (begin != i && !not_url_char(i[sep.size()])) {
                return begin;
            }
        }
        // this separator was not part of a URL; continue the search after it
        i += sep.size();
    }
    return e;
}
/**
 * Finds the end of the URL that starts at `b`.
 *
 * @param b beginning of the range to scan
 * @param e end of the range to scan
 * @return iterator to the first character in [b, e) that not_url_char
 *         rejects, or `e` if every character is accepted
 */
std::string::const_iterator url_end(std::string::const_iterator b,
                                    std::string::const_iterator e) {
    std::string::const_iterator pos = b;
    while (pos != e && !not_url_char(*pos)) {
        ++pos;
    }
    return pos;
}
// URL ::= protocol-name:// resource-name
/**
 * Collects every URL contained in `s`, in order of appearance.
 *
 * url_begin locates where each URL starts and url_end where it stops; the
 * search then resumes right after the URL just found.
 *
 * @param s the text to scan
 * @return the URLs found in `s`
 */
std::vector<std::string> find_urls(const std::string &s) {
    typedef std::string::const_iterator iter;

    std::vector<std::string> urls;
    const iter stop = s.end();

    // `start` rests on the beginning of the next URL, or on `stop` when
    // there are none left.
    iter start = url_begin(s.begin(), stop);
    while (start != stop) {
        const iter finish = url_end(start, stop);
        urls.push_back(std::string(start, finish));
        start = url_begin(finish, stop);
    }
    return urls;
}