/**************************************************************************
Name: Problem 2, algorithm implementation
Copyright: hqw
Author: huangqingwei23@126.com
Date: 26/05/13 00:30
Description: see below
Problem statement:
A server log file holds on the order of tens of billions of records. Each record occupies one line
and consists of a url string (urls may repeat) followed by a number (how many times that url was
searched; called "rate" in the code). Find the K urls with the highest counts in this log file.
Algorithm description and complexity analysis:
1. Because the data set (of size N) is far too large for memory, it is processed in blocks:
step1: split the log file into blocks (say K of them) according to the available memory, then
       quick-sort each block by url (lexicographic order);
step2: maintain a max-heap of size K (keyed by url) to merge the sorted blocks back into the
       original log file;
step3: scan the url-ordered log sequentially, merging duplicate urls and accumulating their counts;
       this yields a large file that contains no duplicate urls;
step4: as in step1, split this file into blocks and quick-sort each block by count;
step5: maintain a max-heap keyed by count and, reusing the idea of step2, output the top K urls
       (when K < N there is no need to sort all urls by count).
2. Correctness and feasibility (leaving memory overflow aside) follow directly from the steps above.
3. Complexity (let L be the maximum url length), adding up the steps in order:
O(N + K*(N/K)*lg(N/K)*L) + O(K*L + N*lgK*L) + O(N*L) + O(N + K*(N/K)*lg(N/K)) + O(K + K*lgK).
The block sorts cost O(N*lg(N/K)) comparisons and the K-way merges cost O(N*lgK), so with N far
larger than K and L the total is asymptotically O(N*lgN).
Algorithm input:
num_of_url:     how many urls to generate randomly
num_per_block:  how many urls each sub-file holds after splitting
num_of_pre_big: how many of the highest-count urls to report
Algorithm output:
newUrls.txt, holding the requested number of highest-count urls
****************************************************************************/
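/*
Overview of how the steps above map onto the code below:
  step1/step4 -> DealBigData::splitFileWithMergedUrl(): split srcUrls.txt into tmp files of at
                 most num_per_block urls each and quick-sort every tmp file (by url or by rate);
                 with is_merge set it also merges adjacent duplicate urls (step3);
  step2/step5 -> DealBigData::mergeSort(): k-way merge of the sorted tmp files through a heap,
                 either writing everything back to srcUrls.txt (sort by url) or stopping after
                 the first num_of_pre_big urls and writing them to newUrls.txt (top rates).
*/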
#include <iostream>
#include <fstream>
#include <ctime>
#include <string>
#include <cstring>
#include <cstdio>  /*FILE, fopen, fscanf, sprintf*/
#include <cstdlib> /*rand, srand, atoi, system*/
#include <new>     /*nothrow*/
#include <cassert>
#include <deque>
#include <algorithm>
using namespace std;
const char *SRC_URL_FILE = "srcUrls.txt"; /*the source url file, generated randomly*/
const char *NEW_URL_FILE = "newUrls.txt"; /*the result file, holding the urls with the highest rates*/
void logger(const char* error_msg);
template<typename T, typename Compare> void quickSort(T *array, int length, Compare cmp);
/*data structure*/
struct Url{
string url;
unsigned rate;
unsigned file_belong;
Url(string _url="", unsigned _rate=0, unsigned _file_belong=0)
:url(_url), rate(_rate), file_belong(_file_belong){}
};
typedef bool (*Compare)(const Url &lhs, const Url &rhs);
bool compareByUrl(const Url &lhs, const Url &rhs){
return lhs.url < rhs.url;
}
bool compareByRate(const Url &lhs, const Url &rhs){
return lhs.rate < rhs.rate;
}
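/*
A minimal, self-contained sketch (not called anywhere in this program) of the heap-based k-way
merge that mergeSort() below relies on: with a less-than comparator, make_heap builds a max-heap,
so the front element is the current maximum; it is popped and then replaced by the next url from
the tmp file it came from.  The three urls here are made-up sample data.
*/
void heapMergeSketch(){
deque<Url> heap;
heap.push_back(Url("http://www.ccccc.com", 3, 0));
heap.push_back(Url("http://www.aaaaa.com", 9, 1));
heap.push_back(Url("http://www.bbbbb.com", 5, 2));
make_heap(heap.begin(), heap.end(), compareByRate); //front now holds the url with rate 9
pop_heap(heap.begin(), heap.end(), compareByRate);  //move the current maximum to the back
Url top = heap.back();                              //top.rate == 9, top.file_belong == 1
heap.pop_back();
//next step would be: read one more url from tmp file top.file_belong, push_back + push_heap, repeat
(void)top;
}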
class DealBigData{
public:
DealBigData(unsigned long num_of_url=100, unsigned long num_per_block=10)
:m_num_of_url(num_of_url), m_num_per_block(num_per_block)
{ /*do nothing*/}
public:
//getters and setters
unsigned long getNumOfUrl() const {
return m_num_of_url;
}
unsigned long getNumPerBlock()const {
return m_num_per_block;
}
void setNumOfUrl(unsigned long num_of_url){
m_num_of_url = num_of_url;
}
void setNumPerBlock(unsigned long num_per_block){
m_num_per_block = num_per_block;
}
public:
bool init_data();
bool SortByUrl();
bool getMostCountUrls(const unsigned int num);
private:
bool reInsertUrlHeap(const unsigned int file_count,
FILE* *infile_ptr,
deque<struct Url>& url_heap,
unsigned int& empty_count,
bool *has_next);
bool splitFileWithMergedUrl(unsigned int& file_count, Compare cmp, bool is_merge = false);
bool mergeSort(const unsigned int file_count, Compare cmp, bool is_order = true, unsigned int url_count=0);
private:
unsigned long m_num_of_url; /* the total number of urls*/
unsigned long m_num_per_block;/* the number of urls per block*/
};
//quick sort
template<typename T, typename Compare>
int partition(T *array, int low, int high, Compare cmp)
{
if(low < high){//randomized
int i = low + rand()%(high - low + 1);
T tmp = array[i];
array[i] = array[low];
array[low] = tmp;
}
T x = array[low];
while(low < high){
while(low < high && !(cmp(x, array[high]))) --high;
array[low] = array[high];
while(low < high && !(cmp(array[low], x))) ++low;
array[high] = array[low];
}
array[low] = x;
return low;
}
template<typename T, typename Compare>
void quick_sort(T *array, int p, int r, Compare cmp)
{
if(p < r){
int q = partition(array, p, r, cmp);//keep q signed so that q-1 is well-defined when q == 0
quick_sort(array, p, q-1, cmp);
quick_sort(array, q+1, r, cmp);
}
}
template<typename T, typename Compare>
void quickSort(T *array, int length, Compare cmp)
{
quick_sort(array, 0, length-1, cmp);
}
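/*
Usage note: partition() keeps the elements that do not compare less than the pivot on the left,
so passing a less-than comparator yields a DESCENDING order, which matches the max-heap merge
used later.  Hypothetical example:
    Url block[3] = { Url("a", 1), Url("b", 7), Url("c", 4) };
    quickSort(block, 3, compareByRate);   // block is now ordered by rate: 7, 4, 1
*/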
/*note: __FILE__ and __LINE__ expand here, at the definition of logger, not at the call site*/
inline void logger(const char* error_msg){
cerr<<error_msg
<<" refer to: "<<__FILE__
<<" at line: "<<__LINE__
<<endl;
}
/*
generate the test data file randomly
*/
bool DealBigData::init_data(){
ofstream outfile(SRC_URL_FILE, ios::out);
if(!outfile){
logger("create file error!");
return false;
}
srand((unsigned int)time(0));
unsigned long num_url = m_num_of_url, lenOfUrl;
while(num_url-- > 0){
outfile<<"http://www.";
lenOfUrl = 5 + rand()%4; //the url body has 5..8 letters
for(unsigned long i = 0; i < lenOfUrl; ++i){
char ch = 'a' + rand()%lenOfUrl; //only the first lenOfUrl letters are used, so duplicate urls are likely
outfile<<ch;
}
int rate = 1 + rand()%1000; //rate is always >= 1
outfile<<".com "<<rate;
if(num_url > 0){
outfile<<endl;
}
}
return true;
}
/*
keep a heap to merge the (already sorted) tmp files; returns true on success, false on failure
*/
bool DealBigData::mergeSort(const unsigned int file_count, /*number of tmp files*/
Compare cmp, /*compare method: by url or by rate*/
bool is_order, /*true (default): merge all urls back into the src file; false: only output the first url_count urls by rate*/
unsigned int url_count /*(default 0) must be positive when is_order == false*/
)
{
if(file_count == 0){
logger("error:no file! ");
return false;
}
ofstream src_url_file;
if(is_order){ //overwrite the old src file
src_url_file.open(SRC_URL_FILE, ios::out);
}
else{ //put the result in a new file
src_url_file.open(NEW_URL_FILE, ios::out);
}
if(!src_url_file){
logger("src_url_file open error!");
return false;
}
char name_of_files[50];
//store the tmp file ptrs in an array
FILE* *infile_ptr = new FILE*[file_count];
for(unsigned int i = 0; i < file_count; ++i){
sprintf(name_of_files, "./tmp_url_file/data%d.txt", i+1);
infile_ptr[i] = fopen(name_of_files, "rt");
if(!infile_ptr[i]){ logger("tmp file open error!"); delete[] infile_ptr; return false; }
}
//sort with a max-heap
deque<Url> url_heap;
bool *has_next = new bool[file_count]; //remember to free this memory
fill_n(has_next, file_count, true);
unsigned int num_of_urls = 0; /*counts the urls already written out*/
unsigned int empty_count = 0; /*counts the tmp files that are exhausted*/
char url[32]; char rate[16]; /*rate may hold accumulated counts, so leave room for extra digits*/
for(unsigned int i = 0; i < file_count; ++i){
if(fscanf(infile_ptr[i], "%31s%15s", url, rate) == EOF){
has_next[i] = false;
++empty_count; //count files that are empty from the start, so the main loop can terminate
continue;
}
url_heap.push_back(Url(url, atoi(rate), i));//file_belong stores the index into infile_ptr
}
if(url_heap.empty()){//every tmp file is empty
logger("error: all tmp files are empty!");
delete[] has_next;
delete[] infile_ptr;
return false;
}
//make max-heap: with a less-than comparator the front element is the current maximum
make_heap(url_heap.begin(), url_heap.end(), cmp);
Url top_url = url_heap.front();
url_heap.pop_front();
while(empty_count < file_count){ //while all files become empty, break the iteration
src_url_file<<top_url.url<<" "<<top_url.rate<<endl;
++num_of_urls;
if(!is_order && num_of_urls >= url_count){
break; /*just get the previous url_count biggest rate of urls*/
}
//index of the tmp file from which the next element should be fetched
unsigned file_index = top_url.file_belong;
//if that file is already exhausted, do not fetch a replacement; only refill when the heap runs dry
if(file_index < file_count && !has_next[file_index]){
if(url_heap.empty()){//should reinsert file_count elements into the url_heap
if(false == reInsertUrlHeap(file_count, infile_ptr, url_heap, empty_count, has_next)){
break;//if the files are empty now
}
}
}
//the file of the popped element still has data: read its next url and push it into the heap
else if(file_index < file_count && has_next[file_index]){
memset(url, 0, sizeof(url)); memset(rate, 0, sizeof(rate));
//end of file reached for this tmp file
if(fscanf(infile_ptr[file_index], "%31s%15s", url, rate) == EOF){
has_next[file_index] = false;
++empty_count;//a new empty file
if(empty_count >= file_count){
break;//finished
}
//
if(url_heap.empty()){//should reinsert file_count elements into the url_heap
if(false == reInsertUrlHeap(file_count, infile_ptr, url_heap, empty_count, has_next)){
break;
}
}
if(url_heap.empty()){
logger("error!!!");
delete[] has_next;
delete[] infile_ptr;
return false;
}
}
else{//the file still has data: push the freshly read url
url_heap.push_back(Url(url, atoi(rate), file_index));
}
}
//common path for both branches: re-heapify, then pop the next top element (no refill here)
make_heap(url_heap.begin(), url_heap.end(), cmp);
top_url = url_heap.front();
url_heap.pop_front();//remove the current top of the heap
}
//drain the elements still left in url_heap
while((is_order || num_of_urls++ < url_count) && !url_heap.empty()){
make_heap(url_heap.begin(), url_heap.end(), cmp);//first one is the max
top_url = url_heap.front();
src_url_file<<top_url.url<<" "<<top_url.rate;
url_heap.pop_front();
if(!url_heap.empty()){
src_url_file<<endl;
}
}
src_url_file.close();
//close the tmp files
for(unsigned int i = 0; i < file_count; ++i){
if(infile_ptr[i]) fclose(infile_ptr[i]);
}
//free the resources
delete[] has_next;
delete[] infile_ptr;
system("rm ./tmp_url_file/*.txt");
system("rmdir tmp_url_file");
return true;
}
/*
Deal with the situation where the heap has run dry: refill it with urls read from the tmp files.
Returns false if every tmp file has been exhausted, true otherwise.
*/
bool DealBigData::reInsertUrlHeap(const unsigned int file_count,
FILE* *infile_ptr,
deque<Url>& url_heap,
unsigned int& empty_count,
bool *has_next)
{
char url[32]; char rate[16]; /*rate may hold accumulated counts, so leave room for extra digits*/
unsigned u_count = 0;
//note: the total number of urls left may be less than file_count
while(u_count < file_count && empty_count < file_count){
//scan the files in order, taking one url from each file that is not yet exhausted
for(unsigned int i_file_index = 0; i_file_index < file_count; ++i_file_index){
if(has_next[i_file_index]){
memset(url, 0, sizeof(url)); memset(rate, 0, sizeof(rate));
if(fscanf(infile_ptr[i_file_index], "%31s%15s", url, rate) == EOF){
has_next[i_file_index] = false;
++empty_count;
continue;
}
url_heap.push_back(Url(url, atoi(rate), i_file_index));
++u_count;
}
}//for
}
//if all tmp files are empty
if(empty_count >= file_count){
return false;//finished
}
return true;
}
/*
split src_url_file into tmp files of at most m_num_per_block urls each and sort every tmp file;
when is_merge is true, adjacent identical urls are merged and their rates accumulated.
note: a url whose occurrences straddle a block boundary is flushed at the boundary, so its
remaining occurrences start a separate (unmerged) entry in the next tmp file.
*/
bool DealBigData::splitFileWithMergedUrl(unsigned int& file_count, Compare cmp, bool is_merge)
{
file_count = 0;
ifstream data_file(SRC_URL_FILE, ifstream::in);
if(!data_file){
logger("open dataFile error!");
return false;
}
//create the tmp directory; if it already exists (mkdir fails), clear any stale tmp files
if(system("mkdir tmp_url_file") != 0){
system("rm ./tmp_url_file/*.txt");
}
char name_of_files[50];
string url;
unsigned int rate;
while(true){
unsigned int size_count = m_num_per_block;
memset(name_of_files, 0, sizeof(name_of_files));
sprintf(name_of_files, "./tmp_url_file/data%d.txt", ++file_count);
ofstream sub_file(name_of_files, ios::out|ofstream::app);
if(!sub_file){
logger("open tmp files error!");
return false;
}
if(is_merge){//this branch merges identical adjacent urls (the input is already sorted by url)
string pre_url="";
unsigned pre_rate=0;
while(size_count > 1 && data_file>>url>>rate){
if(pre_url != url){
if(pre_rate == 0){//not yet initialized: remember the first url of this block
pre_url = url;
pre_rate = rate;
continue;
}
else{//pre_url can't be merged any more
--size_count;
sub_file<<pre_url<<" "<<pre_rate<<endl;
pre_url = url;
pre_rate = rate;
}
}
else{//same url as pre_url: accumulate its rate
pre_rate += rate;
}
}
//deal with the tail of this block (skip if nothing was read)
if(!(pre_url.empty() && pre_rate == 0)){
sub_file<<pre_url<<" "<<pre_rate;
}
}
else{// do not merge the same urls, just split the src file
while(size_count-- > 0 && data_file>>url>>rate){
sub_file<<url<<" "<<rate;
if(size_count > 0){
sub_file<<endl;
}
}
}
sub_file.close();
//sort every block in decreasing order with randomized quicksort
ifstream infile(name_of_files, ifstream::in);
if(!infile){
logger("error!");
return false;
}
Url *urls = new (nothrow) Url[m_num_per_block];
if(!urls){
logger("memory allocation failed for urls!");
return false;
}
//count the real numbers of urls from every tmp file
unsigned int url_count = 0;
while(url_count < m_num_per_block && infile>>url>>rate){
urls[url_count].url = url;
urls[url_count].rate = rate;
++url_count;
}
infile.close();
::quickSort(urls, url_count, cmp);// order rule can be set
//write the sorted urls back into the tmp file
fstream outfile;
outfile.open(name_of_files, ios::out);//overwrite the existing file
if(!outfile){
logger("outfile open error!");
delete[] urls;
return false;
}
for(unsigned int i = 0; i < url_count; ++i){
outfile<<urls[i].url<<" "<<urls[i].rate;
if(i + 1 < url_count){
outfile<<endl;
}
}
outfile.close();
//free the resources
delete[] urls;
//reach the end of the src_url_file
if(data_file.eof()){
break;
}
}
return true;
}
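/*
A small walk-through of the merging branch (the getMostCountUrls path), assuming num_per_block = 3
and a url-sorted input:
    http://www.bbbbb.com 4
    http://www.bbbbb.com 2
    http://www.aaaaa.com 7
The two bbbbb lines are adjacent, so they merge into "http://www.bbbbb.com 6"; the tmp file ends
up holding bbbbb with rate 6 and aaaaa with rate 7, and the subsequent quickSort by rate then
orders them 7, 6.
*/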
/*
sort the urls in the src_url_file according to url
*/
bool DealBigData::SortByUrl(){
unsigned int file_count = 0;
bool flag1 = splitFileWithMergedUrl(file_count, ::compareByUrl, false);
bool flag2 = flag1 && mergeSort(file_count, ::compareByUrl);
return flag1 && flag2;
}
/*
get the num urls with the highest rates and store them in newUrls.txt in the current directory
*/
bool DealBigData::getMostCountUrls(const unsigned int num){
unsigned int file_count = 0;
bool flag1 = splitFileWithMergedUrl(file_count, ::compareByRate, true);
bool flag2 = flag1 && mergeSort(file_count, ::compareByRate, false, num);
return flag1 && flag2;
}
inline double getTime(clock_t start_time){
clock_t finish_time = clock();
double running_time = double(finish_time - start_time);
return running_time/CLOCKS_PER_SEC;
}
int main(int argc, char* argv[]){
unsigned long num_of_url, num_per_block, num_of_pre_big;
cout<<"Please input the total number of urls, number of urls per block, number of top urls to report:\n";
cin >> num_of_url >> num_per_block >> num_of_pre_big;
DealBigData dealBigData(num_of_url, num_per_block);
//generate the test data randomly
if(dealBigData.init_data()){
clock_t start_time = clock();
dealBigData.SortByUrl();
cout<<"running_time to sort by url: "<<getTime(start_time)<<" s"<<endl;
start_time = clock(); //restart the clock so the second measurement covers only the top-url phase
dealBigData.getMostCountUrls(num_of_pre_big);
cout<<"running_time to find the top urls: "<<getTime(start_time)<<" s"<<endl;
}
return 0;
}
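/*
Example session (assuming the source is saved as topk.cpp; the input values are arbitrary):
    $ g++ -o topk topk.cpp
    $ ./topk
    Please input the total number of urls, number of urls per block, number of top urls to report:
    100000 10000 10
srcUrls.txt is generated randomly and newUrls.txt ends up with the 10 urls of highest accumulated
rate.  The tmp files and the shell commands (mkdir, rm, rmdir) assume a Unix-like environment.
*/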