转载自:http://blog.csdn.net/v_JULY_v/article/details/6370650
查找最小的k个元素(数组)
题目:输入n个整数,输出其中最小的k个。例如输入1,2,3,4,5,6,7和8这8个数字,则最小的4个数字为1,2,3和4。
采用最大堆的方法虽然在时间上并不能达到最优(其复杂度为O(n*logk)),但是采用堆的方式存储对于海量
#include <assert.h>
#include <cstdio>
#include <iostream>
#include <vector>
using namespace std;
// Forward declaration: BuildHeap calls MaxHeap, which is defined further down.
void MaxHeap(int heap[], int i, int len);
/*-------------------
BUILD-MAX-HEAP(A)
1 heap-size[A] ← length[A]
2 for i ← |_length[A]/2_| downto 1
3 do MAX-HEAPIFY(A, i)
*/
// Build a max-heap in place over heap[1..len].
// Sifts down every position from len/2 back to the root at position 1;
// index 0 is never touched. Does nothing when heap is a null pointer.
void BuildHeap(int heap[], int len)
{
    if (!heap)
        return;

    int node = len / 2;
    while (node >= 1)
    {
        MaxHeap(heap, node, len);
        --node;
    }
}
/*----------------------------
PARENT(i)
return |_i/2_|
LEFT(i)
return 2i
RIGHT(i)
return 2i + 1
MAX-HEAPIFY(A, i)
1 l ← LEFT(i)
2 r ← RIGHT(i)
3 if l ≤ heap-size[A] and A[l] > A[i]
4 then largest ← l
5 else largest ← i
6 if r ≤ heap-size[A] and A[r] > A[largest]
7 then largest ← r
8 if largest ≠ i
9 then exchange A[i] <-> A[largest]
10 MAX-HEAPIFY(A, largest)
*/
// Sift-down step for a 1-based max-heap stored in heap[1..len].
//
// heap : array whose positions 1..len hold the heap (children of p are 2p, 2p+1)
// i    : position of the element to push down
// len  : index of the last valid heap position
//
// Repeatedly swaps the element with its larger child until neither child is
// greater than it. When both children are equal and greater, the left one wins.
void MaxHeap(int heap[], int i, int len)
{
    int node = i;
    for (;;)
    {
        const int leftChild = node * 2;
        const int rightChild = leftChild + 1;

        // Pick the largest among the node and whichever children exist.
        int top = node;
        if (leftChild <= len && heap[leftChild] > heap[top])
            top = leftChild;
        if (rightChild <= len && heap[rightChild] > heap[top])
            top = rightChild;

        if (top == node)
            return; // heap property holds here, nothing left to fix

        std::swap(heap[node], heap[top]);
        node = top; // continue from the child that received the element
    }
}
// Program entry point: prints the k smallest integers found in D:\data.txt.
//
// Reads k from standard input, loads the first k integers of the file into a
// 1-based max-heap, then streams the rest of the file: whenever a value is
// smaller than the heap top (the largest of the current k candidates) it
// replaces the top and the heap is repaired. Only k values are held at once,
// so the file itself never has to fit in memory.
//
// The result is printed in heap order (not sorted), separated by spaces.
// Reading stops at end of file or at the first token that is not an integer.
//
// Returns 0 on success; 1 if k is missing or not positive, if the file cannot
// be opened, or if the file contains fewer than k integers.
// argc and argv are not used.
int _tmain(int argc, _TCHAR* argv[])
{
    int k = 0;
    if (!(cin >> k) || k <= 0)
    {
        cerr << "k must be a positive integer" << endl;
        return 1;
    }

    // k + 1 slots because the heap is 1-based (slot 0 is unused). The vector
    // frees the storage on every return path; the size is widened before the
    // addition so that k + 1 cannot overflow int.
    vector<int> storage(static_cast<vector<int>::size_type>(k) + 1);
    int *heap = &storage[0];

    FILE *fp = fopen("D:\\data.txt", "r");
    if (fp == NULL)
    {
        // Checked explicitly: an assert would vanish in NDEBUG builds.
        cerr << "cannot open D:\\data.txt" << endl;
        return 1;
    }

    // Seed the heap with the first k values; a short file would otherwise
    // leave uninitialized slots in the heap.
    for (int i = 1; i <= k; i++)
    {
        if (fscanf(fp, "%d", &heap[i]) != 1)
        {
            cerr << "input holds fewer than k integers" << endl;
            fclose(fp);
            return 1;
        }
    }
    BuildHeap(heap, k);

    // Compare against 1 rather than EOF: on a non-numeric token fscanf
    // returns 0 without consuming it, which would otherwise loop forever.
    int newData;
    while (fscanf(fp, "%d", &newData) == 1)
    {
        if (newData < heap[1]) // smaller than the current k-th smallest
        {
            heap[1] = newData;
            MaxHeap(heap, 1, k);
        }
    }
    fclose(fp);

    for (int j = 1; j <= k; j++)
        cout << heap[j] << " ";
    cout << endl;
    return 0;
}
数据处理很有意义,转载自July:
用容量为k的最大堆存储最小的k个数,此时,k1≤k2≤...≤kmax(kmax设为大顶堆中最大元素,即堆顶)。遍历一次长度为n的数列,每次遍历一个元素x,与堆顶元素比较:若x<kmax,则用x替换堆顶并调整堆(用时O(logk)),否则不更新堆。这样下来,总费时O(n*logk)。
为什么?道理很简单,如果要处理的序列长度n比较小时,思路2(选择排序)的n*k的复杂度还能说得过去,但当n很大的时候呢?同时,别忘了,如果选择思路1(快速排序),还得在数组中存储n个数。当面对海量数据处理的时候呢?n个数还能全部存放于电脑内存中么?(或许可以,或许很难)。