海量数据处理之外排序

最新推荐文章于 2024-07-17 08:04:18 发布

转载最新推荐文章于 2024-07-17 08:04:18 发布 · 761 阅读

文章标签：

#file #fp #merge #磁盘 #html #编程

海量数据处理专栏收录该内容

12 篇文章

订阅专栏

前言：

本文是对July博文 http://blog.csdn.net/v_JULY_v/article/details/6451990 的一些总结。

现在先让我们来看一道有关外排序的题：

问题描述：
输入：一个最多含有n个不重复的正整数（也就是说可能含有少于n个不重复正整数）的文件，其中每个数都小于等于n，且n=10^7。
输出：得到按从小到大升序排列的包含所有输入的整数的列表。
条件：最多有大约1MB的内存空间可用，但磁盘空间足够。且要求运行时间在5分钟以下，10秒为最佳结果。

本题有很多种解法，列出以下几种：

分析：下面咱们来一步一步的解决这个问题，
1、归并排序。你可能会想到把磁盘文件整体读入内存后进行归并排序，但10^7个整数（按每个4字节计）约需40MB，而题目只允许使用约1MB的内存空间，所以，直接在内存中做归并排序这个方法不行。
2、位图方案。熟悉位图的朋友可能会想到用位图来表示这个文件集合。例如，正如《编程珠玑》一书上所述，用一个20位长的字符串来表示一个所有元素都小于20的简单的非负整数集合，可以用如下字符串来表示集合{1,2,3,5,8,13}：

0 1 1 1 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0

上述集合中各数对应的位置则置1，没有对应的数的位置则置0。

参考编程珠玑一书上的位图方案，针对我们的10^7个数据量的磁盘文件排序问题，我们可以这么考虑，由于每个7位十进制整数表示一个小于1000万的整数。我们可以使用一个具有1000万个位的字符串来表示这个文件，其中，当且仅当整数i在文件中存在时，第i位为1。采取这个位图的方案是因为我们面对的这个问题的特殊性：1、输入数据限制在相对较小的范围内，2、数据没有重复，3、其中的每条记录都是单一的整数，没有任何其它与之关联的数据。
所以，此问题用位图的方案分为以下三步进行解决：

第一步，将所有的位都置为0，从而将集合初始化为空。
第二步，通过读入文件中的每个整数来建立集合，将每个对应的位都置为1。
第三步，检验每一位，如果该位为1，就输出对应的整数。

经过以上三步后，产生有序的输出文件。令n为位图向量中的位数（本例中为1000 0000），程序可以用伪代码表示如下：

[cpp] view plain copy

//Pseudocode for the bitmap approach to sorting a disk file
//copyright@ Jon Bentley
//July, updated 2011.05.29.
//Step 1: initialize every bit to 0, so the set starts out empty
for i ={0,....n}
bit[i]=0;
//Step 2: read each integer from the input file and set its corresponding bit to 1
for each i in the input file
bit[i]=1;
//Step 3: scan the bits in index order; whenever a bit is 1, output that index
for i={0...n}
if bit[i]==1
write i on the output file

[cpp] view plain copy

//Pseudocode for the bitmap approach to sorting a disk file
//copyright@ Jon Bentley
//July, updated 2011.05.29.
//Step 1: initialize every bit to 0, so the set starts out empty
for i ={0,....n}
bit[i]=0;
//Step 2: read each integer from the input file and set its corresponding bit to 1
for each i in the input file
bit[i]=1;
//Step 3: scan the bits in index order; whenever a bit is 1, output that index
for i={0...n}
if bit[i]==1
write i on the output file

上面只是为了简单介绍下位图算法的伪代码之抽象级描述。显然，咱们面对的问题，可不是这么简单：1000万个位约需1.25MB，超过了题目给定的约1MB内存限制，因此需要分两趟处理。下面，我们试着针对这个要分两趟给磁盘文件排序的具体问题编写完整代码，如下。

3、多路归并。

1、内存排序
由于要求的可用内存为1MB，那么每次可以在内存中对250K个整数（每个4字节，共约1MB）进行排序，然后将有序的数写入硬盘。
那么10M个整数需要循环40次，最终产生40个有序的文件。
2、归并排序

将每个文件最开始的数读入(由于有序，所以为该文件最小数)，存放在一个大小为40的first_data数组中；
选择first_data数组中最小的数min_data，及其对应的文件索引index；
将first_data数组中最小的数写入文件result，然后更新数组first_data(根据index读取该文件下一个数代替min_data)；
判断是否所有数据都读取完毕，若尚未读完，则返回第2步（重新从first_data数组中选择最小的数）。

所以，本程序按顺序分两步，第一步、Memory Sort，第二步、Merge Sort。程序的流程图，如下图所示（感谢F的绘制）。

然后，编写的完整代码如下：

首先，生成大数据量（1000万个不重复整数）测试数据的程序如下：

[html] view plain copy

01.//purpose: 生成随机的不重复的测试数据
02.//copyright@ 2011.04.19 yansha
03.//1000w数据量，要保证生成不重复的数据量，一般的程序没有做到。
04.//但，本程序做到了。
05.//July、2010.05.30。
06.#include <iostream>
07.#include <time.h>
08.#include <assert.h>
09.using namespace std;
10.
// Number of integers to generate: the values 1..size, each exactly once.
const int size = 10000000;

// Storage for the permutation, kept at file scope (static storage).
// `::size` is written with the scope operator because, under
// `using namespace std;`, an unqualified `size` can be ambiguous with
// std::size on C++17 compilers.
int num[::size];

// Return a pseudo-random index in [0, size).
// Two rand() results are combined in unsigned 64-bit arithmetic: the plain
// int expression rand() * RAND_MAX + rand() overflows where RAND_MAX is
// 2^31-1, which is undefined behaviour and could yield a negative index.
static int random_index()
{
    const unsigned long long base = (unsigned long long)RAND_MAX + 1ULL;
    const unsigned long long wide =
        (unsigned long long)rand() * base + (unsigned long long)rand();
    return (int)(wide % (unsigned long long)::size);
}

// Generate a shuffled sequence of the distinct integers 1..size and write it
// to "data.txt" as space-separated decimal numbers.
// Returns 0 on success, 1 if the output file cannot be opened or closed.
int main()
{
    FILE *fp = fopen("data.txt", "w");
    if (fp == NULL)
    {
        fprintf(stderr, "Cannot open data.txt for writing\n");
        return 1;
    }

    // num[k] holds k + 1, so the data set is exactly 1..size and contains no 0.
    // (The previous loop ran n = 1..size, which wrote one element past the end
    // of num and left num[0] == 0.)
    for (int n = 0; n < ::size; n++)
        num[n] = n + 1;

    srand((unsigned)time(NULL));

    // Shuffle by performing `size` swaps of randomly chosen pairs.
    for (int n = 0; n < ::size; n++)
    {
        const int i = random_index();
        const int j = random_index();
        std::swap(num[i], num[j]);
    }

    for (int n = 0; n < ::size; n++)
        fprintf(fp, "%d ", num[n]);

    // fclose flushes buffered output; a failure here means data was lost.
    if (fclose(fp) != 0)
        return 1;
    return 0;
}

[html] view plain copy

01.//copyright@ yansha
02.//July、updated，2011.05.28。
03.#include <iostream>
04.#include <string>
05.#include <algorithm>
06.#include <time.h>
07.using namespace std;
08.
// Total number of integers in the data set.
int sort_num = 10000000;
// Maximum number of integers held in memory at once (one sorted chunk).
int memory_size = 250000;

// Read up to memory_size integers from fp into space.
//
// fp:    input stream positioned at the next unread number.
// space: destination buffer with room for at least memory_size ints.
// Returns the number of integers stored; 0 means no further integer could
// be read.
//
// The loop continues only while fscanf reports one successful conversion.
// Testing for "!= EOF" would treat a failed conversion (return value 0, e.g.
// on a non-numeric token) as success, filling the buffer with garbage and
// never making progress through the file.
int read_data(FILE *fp, int *space)
{
    int count = 0;
    while (count < memory_size && fscanf(fp, "%d ", &space[count]) == 1)
        count++;
    return count;
}
20.
// Write the first num integers of space to fp in order, each one followed
// by a single space. Writes nothing when num is zero or negative.
void write_data(FILE *fp, int *space, int num)
{
    for (int pos = 0; pos < num; ++pos)
        fprintf(fp, "%d ", space[pos]);
}
30.
// Verify that a file was opened successfully.
// Returns normally when fp is non-null; otherwise prints a message to
// standard output and terminates the program with exit status 1.
void check_fp(FILE *fp)
{
    if (fp != NULL)
        return;

    std::cout << "The file pointer is invalid!" << std::endl;
    exit(1);
}
40.
// qsort comparator for int values, ascending order.
// Returns a negative value, zero, or a positive value when the first int is
// respectively less than, equal to, or greater than the second.
//
// The result is built from comparisons instead of subtraction, because
// `a - b` overflows (undefined behaviour) when the operands are far apart,
// e.g. INT_MIN and 1.
int compare(const void *first_num, const void *second_num)
{
    const int lhs = *(const int *)first_num;
    const int rhs = *(const int *)second_num;
    return (lhs > rhs) - (lhs < rhs);
}
45.
// Build the name of the n-th auxiliary chunk file: "data<n>.txt".
// The 20-byte buffer is large enough for any int value: 4 + 11 + 4 characters
// plus the terminating NUL.
std::string new_file_name(int n)
{
    char buffer[20];
    sprintf(buffer, "data%d.txt", n);
    return std::string(buffer);
}
52.
53.int memory_sort()
54.{
55. // open the target file.
56. FILE *fp_in_file = fopen("data.txt", "r");
57. check_fp(fp_in_file);
58. int counter = 0;
59. while (true)
60. {
61. // allocate space to store data read from file.
62. int *space = new int[memory_size];
63. int num = read_data(fp_in_file, space);
64. // the memory sort have finished if not numbers any more.
65. if (num == 0)
66. break;
67.
68. // quick sort.
69. qsort(space, num, sizeof(int), compare);
70. // create a new auxiliary file name.
71. string file_name = new_file_name(++counter);
72. FILE *fp_aux_file = fopen(file_name.c_str(), "w");
73. check_fp(fp_aux_file);
74.
75. // write the orderly numbers into auxiliary file.
76. write_data(fp_aux_file, space, num);
77. fclose(fp_aux_file);
78. delete []space;
79. }
80. fclose(fp_in_file);
81.
82. // return the number of auxiliary files.
83. return counter;
84.}
85.
86.void merge_sort(int file_num)
87.{
88. if (file_num <= 0)
89. return;
90. // create a new file to store result.
91. FILE *fp_out_file = fopen("result.txt", "w");
92. check_fp(fp_out_file);
93.
94. // allocate a array to store the file pointer.
95. FILE **fp_array = new FILE *[file_num];
96. int i;
97. for (i = 0; i < file_num; i++)
98. {
99. string file_name = new_file_name(i + 1);
100. fp_array[i] = fopen(file_name.c_str(), "r");
101. check_fp(fp_array[i]);
102. }
103.
104. int *first_data = new int[file_num];
105. //new出个大小为0.1亿/250k数组，由指针first_data指示数组首地址
106. bool *finish = new bool[file_num];
107. memset(finish, false, sizeof(bool) * file_num);
108.
109. // read the first number of every auxiliary file.
110. for (i = 0; i < file_num; i++)
111. fscanf(fp_array[i], "%d ", &first_data[i]);
112. while (true)
113. {
114. int index = 0;
115. while (index < file_num && finish[index])
116. index++;
117.
118. // the finish condition of the merge sort.
119. if (index >= file_num)
120. break;
121. //主要的修改在上面两行代码，就是merge sort结束条件。
122. //要保证所有文件都读完，必须使得finish[0]...finish[40]都为真
123. //July、yansha，555，2011.05.29。
124.
125. int min_data = first_data[index];
126. // choose the relative minimum in the array of first_data.
127. for (i = index + 1; i < file_num; i++)
128. {
129. if (min_data > first_data[i] && !finish[i])
130. //一旦发现比min_data更小的数据first_data[i]
131. {
132. min_data = first_data[i];
133. //则置min_data<-first_data[i]index = i;
134. //把下标i 赋给index。
135. }
136. }
137.
138. // write the orderly result to file.
139. fprintf(fp_out_file, "%d ", min_data);
140. if (fscanf(fp_array[index], "%d ", &first_data[index]) == EOF)
141. finish[index] = true;
142. }
143.
144. fclose(fp_out_file);
145. delete []finish;
146. delete []first_data;
147. for (i = 0; i < file_num; i++)
148. fclose(fp_array[i]);
149. delete [] fp_array;
150.}
151.
152.int main()
153.{
154. clock_t start_memory_sort = clock();
155. int aux_file_num = memory_sort();
156. clock_t end_memory_sort = clock();
157. cout << "The time needs in memory sort: " << end_memory_sort - start_memory_sort << endl;
158. clock_t start_merge_sort = clock();
159. merge_sort(aux_file_num);
160. clock_t end_merge_sort = clock();
161. cout << "The time needs in merge sort: " << end_merge_sort - start_merge_sort << endl;
162. system("pause");
163. return 0;
164.}