统计数集中出现最多的N个数（topK）

最新推荐文章于 2024-01-16 15:23:48 发布

jinnlxl

最新推荐文章于 2024-01-16 15:23:48 发布

阅读量841

点赞数

CC 4.0 BY-SA版权

分类专栏： algorithms

本文链接：https://blog.youkuaiyun.com/yejing_utopia/article/details/44159707

algorithms 专栏收录该内容

39 篇文章

订阅专栏

本文详细介绍了如何利用堆和哈希表技术来高效处理大量数据，找出出现频率最高的N个数据项。通过精心设计的数据结构和算法优化，实现大数据集的快速分析。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

/******************************************************************
 * file:topk.c
 * brief: use a heap and a hash table to find the N most frequent
 *        values in a very large data set
 * yejing@2015.3.1    1.0      create
 ******************************************************************/  
  
 #include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
 /*
  * Index arithmetic for a binary heap stored in an array with the root
  * at index 1 (parent = i/2, children = 2i and 2i+1).
  * Every expansion is fully parenthesized so the macros compose safely
  * inside larger expressions, e.g. right_child(i) * 2 or 10 / parent(i).
  */
 #define parent(i)           ((i) / 2)
 #define left_child(i)       ((i) << 1)
 #define right_child(i)      (((i) << 1) + 1)
 
 
 /* Number of buckets in the hash table (2^22 = 4194304). */
 #define HASH_TBL_SIZE       (1 << 22)
 /* Number of usable slots in the heap array. */
 #define HEAP_SIZE           (32)
 
 /* Type of the values being counted (used as the hash-table key). */
 typedef int hash_key_t;
 /* Type of the value stored in each heap entry. */
 typedef int heap_data_t;
 
 /* One entry in a bucket's singly linked collision chain. */
 typedef struct _hash_node_t{
	hash_key_t key;               /* the value being counted */
	int num;                      /* occurrence count for key */
	struct _hash_node_t* next;    /* next node in the same bucket, NULL at the end;
	                               * must name this struct's own tag so that
	                               * node->next is a usable hash_node_t pointer */
 }hash_node_t;
 /* Bucket heads. Static storage duration, so every bucket starts out NULL. */
 static hash_node_t* hash_tbl[HASH_TBL_SIZE];
 
 /* One heap slot: a value together with its occurrence count. */
 typedef struct _min_heap_t{
	heap_data_t    data;   /* the value */
	int            num;    /* its occurrence count; the heap is ordered on this */
 }min_heap_t;
 /*
  * Heap storage. Declared as an object (not an array typedef) because the
  * rest of the file indexes it directly. Slot 0 is left unused so that the
  * entries live at indices 1..HEAP_SIZE; static storage zero-initializes it.
  */
 static min_heap_t heap[HEAP_SIZE + 1];
 
 /*
  * Map a key to a bucket index in the range [0, HASH_TBL_SIZE).
  *
  * The key is converted to unsigned before the modulo so that negative
  * keys cannot produce a negative (out-of-bounds) index, and the table
  * size is the divisor so a key of 0 is handled like any other value.
  */
 int hash(hash_key_t key){
	return (int)((unsigned int)key % HASH_TBL_SIZE);
 }
 
 /*
  * Allocate a chain node for `key` with its count set to 1 and no successor.
  *
  * Returns the new node, or NULL if the allocation fails.
  * The caller owns the returned node.
  */
 hash_node_t* alloc_a_node(hash_key_t key){
	/* sizeof *node is the size of the struct; sizeof(node) would only be
	 * the size of a pointer and under-allocate. */
	hash_node_t* node = malloc(sizeof *node);
	if(!node)
		return NULL;

	node->key  = key;
	node->next = NULL;
	node->num  = 1;

	return node;
 }
 
 /*
  * Record one occurrence of `key` in the hash table.
  *
  * If the key is already present in its bucket, its count is incremented;
  * otherwise a new node with count 1 is pushed onto the front of the bucket.
  * Aborts the program if a new node cannot be allocated.
  */
 void node_insert(hash_key_t key){
	int bucket = hash(key);
	hash_node_t* node;

	for(node = hash_tbl[bucket]; node; node = node->next){
		if(node->key == key){
			node->num++;
			return;
		}
	}

	node = alloc_a_node(key);
	if(!node){
		/* Explicit abort rather than assert(0): assert compiles away under
		 * NDEBUG and would let the NULL pointer be dereferenced below. */
		fputs("node_insert: out of memory\n", stderr);
		abort();
	}

	/* Push onto the head of the chain. */
	node->next = hash_tbl[bucket];
	hash_tbl[bucket] = node;
 }
 
 /*
  * Restore the min-heap property (ordered on .num) for the subtree rooted
  * at `cursor` by sifting that entry down.
  *
  * min_heap: heap array with the root at index 1
  * size:     index of the last valid entry (entries occupy 1..size)
  * cursor:   index of the entry that may be larger than its children
  *
  * Both subtrees of `cursor` are expected to already be valid min-heaps.
  */
 void heap_min_heaprify(min_heap_t* min_heap, int size, int cursor){
	for(;;){
		int left    = left_child(cursor);
		int right   = right_child(cursor);
		int miniman = cursor;
		min_heap_t tmp;

		/* <= size: with a root at index 1 the last entry sits at index size. */
		if(left <= size && min_heap[left].num < min_heap[miniman].num)
			miniman = left;
		if(right <= size && min_heap[right].num < min_heap[miniman].num)
			miniman = right;

		if(miniman == cursor)
			return;

		/* Swap the entry with its smaller child and continue from there. */
		tmp               = min_heap[miniman];
		min_heap[miniman] = min_heap[cursor];
		min_heap[cursor]  = tmp;
		cursor = miniman;
	}
 }
 
 /*
  * Turn min_heap[1..size] into a min-heap by sifting down every
  * non-leaf entry, from the last parent (size/2) back to the root.
  *
  * The heap is taken by pointer so the caller's array is the one reordered.
  */
 void build_min_heap(min_heap_t* min_heap, int size){
	int i;

	for(i = size/2; i >= 1; --i){
		heap_min_heaprify(min_heap, size, i);
	}
 }
 
 void topk_main(){
	hash_node_t* tmp = NULL;
	int i = 0;
	
	for(i = 0; i < HASH_TBL_SIZE; ++i){
		tmp = hash_table[i];
		if(tmp > min_heap[i].num){
			heap[1].num  = tmp->num;
			heap[1].data = tmp->key;
			heap_min_heaprify(heap, HEAP_SIZE, 1);
		}
		tmp = tmp->next;
	}
	
	return;
 }