统计文章中词的词频

最新推荐文章于 2022-02-12 11:07:18 发布

wzb56

最新推荐文章于 2022-02-12 11:07:18 发布

阅读量1.7k

点赞数 1

CC 4.0 BY-SA版权

分类专栏： C/C++ 经典算法

本文链接：https://blog.youkuaiyun.com/wzb56_earl/article/details/8774194

C/C++ 同时被 2 个专栏收录

97 篇文章

订阅专栏

经典算法

15 篇文章

订阅专栏

统计文章中单词的词频，可以使用 C++ 标准库中的 map（std::map）来实现。

1.统计从标准输入输入的单词的词频。

#include <iostream>
#include <map>
#include <string>

using namespace std;

/*
 * Reads whitespace-separated words from standard input and prints every
 * distinct word followed by the number of times it was read, iterating the
 * map from begin() to end().
 *
 * Reading stops at end of input, or as soon as a word starting with 'y' is
 * read (that word is not counted).
 */
int main(int argc, char *argv[]) {
	typedef map<string, int> WordCount;

	WordCount counter;
	string word;

	// cin >> word never yields an empty string on success, so word[0] is valid.
	while(cin >> word && word[0] != 'y') {
		counter[word] += 1;
	}

	for(WordCount::const_iterator entry = counter.begin(); entry != counter.end(); ++entry) {
		cout << entry->first  << " : " << entry->second << endl;
	}
}

2. 统计一篇文章中出现次数最少和最多的单词以及其词频。

#include <iostream>
#include <fstream>
#include <map>
#include <string>
#include <cctype>
#include <vector>



using namespace std;

// Returns true when c is one of the separators recognised here:
// comma, period, space, tab, newline or carriage return.
bool isSeperator(int c) {
	switch(c) {
	case ',':
	case '.':
	case ' ':
	case '\t':
	case '\n':
	case '\r':
		return true;
	default:
		return false;
	}
}

// Writes every string in words to standard output, one per line,
// in the order they are stored.
void printWords(vector<string> &words) {
	for(vector<string>::size_type idx = 0; idx < words.size(); ++idx) {
		cout << words[idx] << endl;
	}
}

/*
 * Counts how often each alphanumeric word occurs in a text file and reports
 * the least and most frequent words, first as single words and then as the
 * complete lists of words sharing the extreme frequency.
 *
 * argv[1], when present, names the file to read; otherwise the original
 * built-in path is used.
 *
 * Returns 0 on success and 1 when the file cannot be opened or contains no
 * words.
 */
int main(int argc, char *argv[]) {
	// An optional first argument overrides the built-in path.
	const char *path = (argc > 1) ? argv[1] : "D:\\huawei\\Test1\\Debug\\test.txt";

	ifstream in(path);
	if(!in) {
		cerr << "cannot open input file: " << path << endl;
		return 1;
	}

	map<string, int> counter;

	// A std::string grows as needed, so arbitrarily long words are safe
	// (the previous fixed 100-byte buffer could be overrun).
	string word;
	int c;
	while((c = in.get()) != EOF) {
		// get() returns the character as an unsigned char value, which is
		// the range isalnum() is defined for.
		if(isalnum(c)) {
			word += static_cast<char>(c);
		} else if(!word.empty()) {
			++counter[word];
			word.clear();
		}
	}
	// A word that runs up to end-of-file has no trailing separator; count it too.
	if(!word.empty()) {
		++counter[word];
	}
	in.close();

	// Everything below reads counter.begin(), which needs at least one entry.
	if(counter.empty()) {
		cerr << "no words found in input file: " << path << endl;
		return 1;
	}

	// One scan collects all words at the highest and lowest frequency.
	// Words are visited in map order, so the front of each list is the word
	// a "first strictly greater / strictly smaller" scan would report.
	map<string, int>::const_iterator it = counter.begin();
	int maxCount = it->second;
	int minCount = it->second;
	vector<string> maxWords(1, it->first);
	vector<string> minWords(1, it->first);

	for(++it; it != counter.end(); ++it) {
		if(it->second > maxCount) {
			maxWords.clear();
			maxWords.push_back(it->first);
			maxCount = it->second;
		} else if(it->second == maxCount) {
			maxWords.push_back(it->first);
		}

		if(it->second < minCount) {
			minWords.clear();
			minWords.push_back(it->first);
			minCount = it->second;
		} else if(it->second == minCount) {
			minWords.push_back(it->first);
		}
	}

	cout << "statistics for first max , min frequecy word" << endl;
	cout << " max frequece word: " << maxWords.front() << ", frequency: " << maxCount <<endl;
	cout << " min frequece word: " << minWords.front() << ", frequency: " << minCount <<endl;

	cout << "statistics for all max , min frequecy word" << endl;

	cout << endl << endl;
	cout << "the max frequency word(s) in the text is(are) : " << endl;
	printWords(maxWords);
	cout << "their frenqency is " << maxCount << endl;

	cout << endl << endl;
	cout << "the min frequency word(s) in the text is(are) : " << endl;
	printWords(minWords);
	cout << "their frenqency is " << minCount << endl;

	return 0;
}

总之，都是基于“键到计数”的映射来计数的思想。需要注意：std::map 是基于红黑树的有序关联容器，并不是哈希表；如果需要基于哈希的实现，可以改用 unordered_map。

3.map统计string的频率：

/* 请在这里实现下列函数, c/c++语法不限, 最后需要保证程序编译连接通过, 并生成test.exe文件. */
/* 相关宏定义及函数声明见'func.h'头文件 */


#include <cstdio>
#include <cstdlib>
#include <cassert>
#include <cstring>

#include <memory>
#include <string>
#include <vector>
#include <map>
using namespace std;


/*
 * Appends every non-empty suffix of str to v, starting with the whole string
 * and ending with its last character. Despite the name, only suffixes are
 * produced, not every substring. An empty string appends nothing.
 *
 * str must not be NULL (checked with assert).
 */
void getSubStrings(const char *str, vector<string> &v) {
	assert(str != NULL);

	for(const char *tail = str; *tail != '\0'; ++tail) {
		v.push_back(string(tail));
	}
}

/* 请按照要求实现下列函数 */

/*
 * Returns the longest string that occurs as a substring of every one of the
 * n C strings in pIn. When several common substrings share the greatest
 * length, the lexicographically smallest one is returned.
 *
 * pIn  array of n NUL-terminated strings; no element may be NULL (asserted).
 * n    number of strings in pIn.
 *
 * The result is a NUL-terminated buffer obtained from malloc() that the
 * caller must release with free(). It is an empty string when the inputs
 * share no character or when n <= 0. NULL is returned only if the
 * allocation fails.
 *
 * Every substring is a prefix of some suffix, so for each start position in
 * the first string the candidate is grown one character at a time while it
 * is still found in all the other strings. Counting whole suffixes instead
 * would only ever detect a common suffix.
 */
char* findSameSubStr(const char *pIn[], int n)
{
	string best;

	if(pIn != NULL && n > 0) {
		for(int k = 0; k < n; ++k) {
			assert(pIn[k] != NULL);
		}

		const string first(pIn[0]);
		for(string::size_type start = 0; start < first.size(); ++start) {
			// If a substring of length len+1 is common, so is its prefix of
			// length len, so growing can stop at the first miss.
			string::size_type len = 0;
			while(start + len < first.size()) {
				const string probe = first.substr(start, len + 1);
				bool inAll = true;
				for(int k = 1; k < n && inAll; ++k) {
					inAll = (strstr(pIn[k], probe.c_str()) != NULL);
				}
				if(!inAll) break;
				++len;
			}
			if(len == 0) continue;

			const string candidate = first.substr(start, len);
			if(len > best.size() || (len == best.size() && candidate < best)) {
				best = candidate;
			}
		}
	}

	// Hand the result back in malloc'ed storage, as callers expect to free() it.
	char *commStr = static_cast<char *>(malloc(best.size() + 1));
	if(commStr == NULL) {
		return NULL;
	}
	memcpy(commStr, best.c_str(), best.size() + 1);   // includes the terminating NUL

	return commStr;
}


/* main函数已经隐藏，这里保留给用户的测试入口，在这里测试你的实现函数，可以调用printf打印输出*/
/* 当前你可以使用其他方法测试，只要保证最终程序能正确执行即可 */
/* 该函数实现可以任意修改，但是不要改变函数原型。一定要保证编译运行不受影响*/
void main()
{
    /* TODO: 请测试时改变改用例 */
   const char * strs[] = {
					"what is local bus?",
					"Name some local bus.",
					"local bus is high speed I/O bus close to the processor.",
					};

    /* TODO: 调用被测函数 */

	char* commstr = findSameSubStr(strs, sizeof(strs) / sizeof(strs[0]));


    /* TODO: 执行完成后可比较是否是你认为正确的值 */

	if(commstr){
		printf("%s\n", commstr);
		

		free(commstr);
	}


    
}

4.统计单词出现的行：

#include <iostream>
#include <string>
#include <vector>
#include <map>
#include <iterator>

using namespace std;

// Breaks a string into its whitespace-separated words (defined further down).
vector<string>
split(const string & s);

// Maps every word read from `in` to the numbers of the lines it occurs on.
// find_words is the function used to break each line into words; it
// defaults to split.
map<string, vector<int> > xref(istream & in, vector<string> find_words(const string & ) = split);

/*
 * Reads text from standard input and prints, for every distinct word, the
 * numbers of the lines on which it occurs (comma separated, ending in a
 * period). The disabled section is an alternative that prints plain word
 * counts instead.
 */
int main()
{

#if 0
    // Disabled alternative: count how often each word is read.
    map<string, int> counters;
    string s;

    while(cin >> s) {
        ++counters[s];
    }

    for(map<string, int>::const_iterator i = counters.begin(); i != counters.end(); ++i) {
        cout << i->first << "\t" << i->second << endl;
    }
#endif

#if 1
    typedef map<string, vector<int> > LineIndex;

    // build the cross-reference; xref falls back to split for finding words
    LineIndex index = xref(cin);

    for (LineIndex::const_iterator entry = index.begin(); entry != index.end(); ++entry) {
        const vector<int> & lines = entry->second;

        cout << "word: " << entry->first << " :occurs on line(s) : ";

        // xref creates an entry only while recording a line number,
        // so the first element is always present
        cout << lines[0];
        for (vector<int>::size_type k = 1; k < lines.size(); ++k) {
            cout << ", " << lines[k];
        }
        cout << ".";

        // finish this word's line
        cout << endl;
    }
#endif

    return 0;
}


// Splits s into its whitespace-separated words and returns them in order.
// Leading, trailing and repeated whitespace produces no empty words.
vector<string> split(const string &s) {
  vector<string> ret;
  typedef string::size_type  string_size;
  const string_size n = s.size();
  string_size i = 0;

  // invariant: we have processed characters [original value of i, i)
  while(i != n) {
    // skip leading blanks
    // isspace() is only defined for unsigned char values (and EOF), so a
    // plain char that may be negative is converted first
    while(i != n && isspace(static_cast<unsigned char>(s[i])) ) ++i;

    // find the end of the word
    string_size j = i;
    while(j != n && !isspace(static_cast<unsigned char>(s[j])) ) ++j;

    // if we found some non-whitespace characters, they form a word
    if(i != j)  {
        // copy j-i characters starting at i
        ret.push_back(s.substr(i, j-i));
        i = j;
    }
  }

    return ret;
}



// Builds a cross-reference of the input: each word maps to the numbers
// (starting at 1) of the lines it occurs on. find_words breaks a line into
// words. A word found several times on one line gets that line number
// recorded once per occurrence.
map<string, vector<int> > xref(istream &in,  vector<string> find_words(const string &)){
    map<string, vector<int> > index;
    int current = 0;

    // consume the input one line at a time
    for(string text; getline(in, text); ) {
        ++current;

        // note the current line number against every word on this line
        const vector<string> tokens = find_words(text);
        for(vector<string>::size_type k = 0; k != tokens.size(); ++k) {
            index[tokens[k]].push_back(current);
        }
    }

    return index;
}

5.url相关：

#include <iostream>
#include <cctype>
#include <string>
#include <vector>
#include <iterator>
#include <algorithm>
#include <numeric>

using namespace std;

// Splitting a string into words
vector<string> split_version1(const string & str);
bool space(char c);
bool not_space(char c);
vector<string> split(const string & s);

// Palindromes
bool is_palindrome(const string & s);

// Finding URLs
bool not_url_char(char c);
string::const_iterator url_begin(string::const_iterator b, string::const_iterator e);
string::const_iterator url_end(string::const_iterator b, string::const_iterator e);
vector<string> find_urls(const string & s);

/*
 * For every line read from standard input: prints its words one per line,
 * then two blank lines, then all the words joined together without
 * separators, and finally the URLs found in the line. The disabled section
 * is a small experiment with copy() and insert iterators.
 */
int main()
{

#if 1
    string line;
    while(getline(cin, line) ) {
        // print each word of the line on its own row
        const vector<string> words = split(line);
        for(vector<string>::const_iterator w = words.begin(); w != words.end(); ++w) {
            cout << *w << endl;
        }
        cout << endl << endl;

        // join the words back together with nothing between them
        string joined("");
        for(vector<string>::size_type k = 0; k != words.size(); ++k) {
            joined = joined + words[k];
        }
        cout << joined << endl;

        // list the URLs contained in the line
        const vector<string> urls = find_urls(line);
        cout << "urls in such string: " << endl;
        for(vector<string>::size_type k = 0; k != urls.size(); ++k) {
            cout << urls[k] << endl;
        }
    }
#endif

#if 0
    vector<int> u(10, 100);
    vector<int> vec;
    std::ostream_iterator<int, char> out(cout, " ");

    copy(u.begin(), u.end(), out);
    cout << endl;

    copy(u.begin(), u.end(), back_inserter(vec));
    copy(vec.begin(), vec.end(), out);
    cout << endl;

    copy(u.begin(), u.end(), back_insert_iterator< vector<int > >(vec) );
    copy(vec.begin(), vec.end(), out);
    cout << endl;


/*
    typedef vector<int>::const_iterator iter;
    for(iter i = vec.begin(); i < vec.end(); ++i) {
        cout << *i << " ";
    }
    cout << endl;
*/

#endif

    return 0;
}


// Index-based splitter: returns the whitespace-separated words of s in
// order. Leading, trailing and repeated whitespace produces no empty words.
vector<string> split_version1(const string & s) {
  vector<string> ret;
  typedef string::size_type  string_size;
  const string_size n = s.size();
  string_size i = 0;

  // invariant: we have processed characters [original value of i, i)
  while(i != n) {
    // skip leading blanks
    // isspace() is only defined for unsigned char values (and EOF), so a
    // plain char that may be negative is converted first
    while(i != n && isspace(static_cast<unsigned char>(s[i])) ) ++i;

    // find the end of the word
    string_size j = i;
    while(j != n && !isspace(static_cast<unsigned char>(s[j])) ) ++j;

    // if we found some non-whitespace characters, they form a word
    if(i != j)  {
        // copy j-i characters starting at i
        ret.push_back(s.substr(i, j-i));
        i = j;
    }
  }

    return ret;
}


// true if the argument is whitespace, false otherwise
bool space(char c) {
    // isspace() is only defined for unsigned char values (and EOF); a plain
    // char may be negative, so convert before the call
    return isspace(static_cast<unsigned char>(c)) != 0;
}

// false if the argument is whitespace, true otherwise
bool not_space(char c) {
    // isspace() is only defined for unsigned char values (and EOF); a plain
    // char may be negative, so convert before the call
    return isspace(static_cast<unsigned char>(c)) == 0;
}

// Iterator/algorithm-based splitter: returns the words of s, where a word is
// a maximal run of characters for which not_space() holds.
vector<string> split(const string & s) {
    typedef string::const_iterator iter;

    vector<string> words;
    const iter stop = s.end();
    iter cursor = s.begin();

    for(;;) {
        // first character of the next word, if any
        const iter head = find_if(cursor, stop, not_space);
        if(head == stop) {
            break;      // only blanks (or nothing) were left
        }

        // one past the last character of that word
        const iter tail = find_if(head, stop, space);

        // the word is the range [head, tail)
        words.push_back(string(head, tail));
        cursor = tail;
    }

    return words;
}

//回文
// Returns true when s reads the same forwards and backwards.
// The empty string counts as a palindrome.
bool is_palindrome(const string & s) {
    string::const_reverse_iterator back = s.rbegin();

    // walk forwards and backwards in step, comparing as we go
    for(string::const_iterator front = s.begin(); front != s.end(); ++front, ++back) {
        if(!(*front == *back)) {
            return false;
        }
    }

    return true;
}

//查找 URLs
// Returns true when c cannot be part of a URL, i.e. it is neither
// alphanumeric nor one of the punctuation characters listed in url_ch.
bool not_url_char(char c) {
    // characters, in addition to alphanumerics, that can appear in a URL
    static const string url_ch = "~;/?:@=&$-_.+!*%'(),";

    // isalnum() is only defined for unsigned char values (and EOF); a plain
    // char may be negative, so convert before the call
    if(isalnum(static_cast<unsigned char>(c))) {
        return false;
    }

    // anything that is not in the punctuation list is not a URL character
    return url_ch.find(c) == string::npos;
}

// Looks in [b, e) for the start of a URL, i.e. one or more letters followed
// by "://" and at least one character that may appear in a URL.
// Returns an iterator to the first letter of the protocol name, or e when
// the range contains no URL.
string::const_iterator url_begin(string::const_iterator b, string::const_iterator e) {
    static const string sep = "://";
    typedef string::const_iterator iter;

    // i marks where the separator was found
    iter i = b;

    while( (i = search(i, e, sep.begin(), sep.end())) != e) {
        // make sure the separator isn't at beginning or end of the line
        if(i != b && i + sep.size() !=  e) {
            // begin marks the beginning of the protocol-name
            iter begin = i;
            // isalpha() is only defined for unsigned char values (and EOF);
            // a plain char may be negative, so convert before the call
            while(begin != b && isalpha(static_cast<unsigned char>(begin[-1]))) --begin;

            // is there at least one appropriate character before and after the separator?
            if(begin != i && !not_url_char(i[sep.size()])) return begin;
        }

        // the separator we found wasn't part of a URL; advance i past this separator
        i += sep.size();
    }

    return e;

}

// Returns the position of the first character in [b, e) that cannot be part
// of a URL, or e if every character can.
string::const_iterator url_end(string::const_iterator b, string::const_iterator e) {
    string::const_iterator pos = b;
    while(pos != e && !not_url_char(*pos)) {
        ++pos;
    }
    return pos;
}


// Returns every URL found in s, in order of appearance, where
//   URL ::= protocol-name :// resource-name
// url_begin locates the start of each URL and url_end its end.
vector<string> find_urls(const string & s) {
    typedef string::const_iterator iter;

    vector<string> found;
    const iter stop = s.end();
    iter cursor = s.begin();

    // scan until the input is exhausted or holds no further URL
    while(cursor != stop) {
        // one or more letters followed by ://
        const iter head = url_begin(cursor, stop);
        if(head == stop) {
            break;
        }

        // the URL runs up to the first character that cannot belong to it
        const iter tail = url_end(head, stop);
        found.push_back(string(head, tail));

        // carry on looking after this URL
        cursor = tail;
    }

    return found;
}