外部排序--归并算法实现

前提:在你看这篇文章之前假设你已经了解过外部排序的思想,所以在这里外部排序的基本原理就不加赘述了。

基本要点:外部排序顾名思义是对外部存储空间中数据的排序,那为什么不能像选择排序、插入排序、快速排序那样也直接进行排序呢?原因是外部存储的数据量可能是非常大的,而计算机的内存大小要远远小于外存,计算机一下子读不了那么多数据,无法一次性对它们进行排序,这就是外部排序产生的原因。

基本思想

  1. 我们这里借助归并排序的思想(这也是外部排序中最基本的思想,假设大家已经对归并排序有一定了解),假设我的一个文件中有10000个数据,而我的内存每次只能读2000个数据,那我先对文件预处理一下,将原文件切割成5个小文件,每个文件中有2000个有序数据(在读入内存2000个数据后,对数据排序后再写入到新文件中)。

  2. 然后我们开始进行文件合并(这里采用2-路归并)。先分别打开两个文件,各读取第一行数据,比较后把较小的数据写入到一个新文件中,再从该较小数据所在的文件中往下读一行继续比较;当其中一个文件读完后,把另一个文件中剩余的数据依次写入新文件,直到两个文件中的数据全部有序地写到新文件中为止。这样原先5个各含2000个数据的文件变为2个含4000个数据的文件和1个含2000个数据的文件。

  3. 重复步骤2,再次合并变为1个8000个数据和1个2000个数据的文件,再次合并变为1个10000个数据的文件。排序完成,原先有10000个数据的文件变得有序。

下面是模拟外部排序的代码:

#include<stdio.h>
#include<stdlib.h>
#include<time.h>
#include<string.h>
#define MAXNUM 2000 

/*
 * Shared bookkeeping for the run files ("1.txt", "2.txt", ...).
 * All three counters rely on the zero-initialization of file-scope objects.
 */

/* Number of run files: incremented by SortFile for every chunk written,
 * used by CreatTempFile as the file-name index, and replaced by the output
 * count after each pass in MergeFile until it reaches 1. */
int filenum;
/* Index of the most recently opened input run in the current merge pass. */
int filenumtemp;
/* Number of output runs produced so far in the current merge pass. */
int filenumend;

/*
 * Generate the sample input for the external sort.
 *
 * Writes 10000 pseudo-random integers, one per line, to "test.txt" in the
 * current working directory (an existing file is truncated). The generator
 * is seeded from the wall clock, so each run produces different data.
 *
 * Terminates the program with a diagnostic if the file cannot be created
 * or if flushing it to disk fails.
 */
void CreatFile()
{
    FILE *f = fopen("test.txt", "w+");
    if (f == NULL)
    {
        perror("test.txt");
        exit(EXIT_FAILURE);
    }

    srand((unsigned)time(NULL));
    for (int i = 0; i < 10000; ++i)
    {
        int temp = rand() % 100; /* value in the range 0..99 inclusive */
        fprintf(f, "%d\n", temp);
    }

    /* fclose() flushes the stream buffer; a failure here means lost data. */
    if (fclose(f) != 0)
    {
        perror("test.txt");
        exit(EXIT_FAILURE);
    }
}

/*
 * Merge two adjacent ascending runs of num[] in place.
 *
 *   num   array holding both runs
 *   start index of the first element of the left run
 *   mid   index of the last element of the left run
 *   end   index of the last element of the right run (the right run
 *         starts at mid + 1)
 *
 * After the call num[start..end] is in ascending order. Equal elements
 * keep their relative order (left run first). If either run is empty the
 * range is already ordered and nothing is done.
 *
 * Terminates the program if the scratch buffer cannot be allocated.
 */
void merge(int num[], int start, int mid, int end)
{
    int n1 = mid - start + 1; /* length of the left run  */
    int n2 = end - mid;       /* length of the right run */

    if (n1 <= 0 || n2 <= 0)
        return;

    /* One scratch copy of both runs; the cast keeps this valid as C++ too. */
    size_t total = (size_t)n1 + (size_t)n2;
    int *scratch = (int *)malloc(total * sizeof *scratch);
    if (scratch == NULL)
    {
        perror("merge");
        exit(EXIT_FAILURE);
    }
    memcpy(scratch, num + start, total * sizeof *scratch);

    const int *left = scratch;
    const int *right = scratch + n1;
    int i = 0;
    int j = 0;
    int k = start;

    while (i < n1 && j < n2)
    {
        /* "<=" takes ties from the left run, which keeps the merge stable. */
        if (left[i] <= right[j])
            num[k++] = left[i++];
        else
            num[k++] = right[j++];
    }

    /* At most one of the two runs still has elements left. */
    while (i < n1)
        num[k++] = left[i++];
    while (j < n2)
        num[k++] = right[j++];

    free(scratch);
}

/*
 * Sort num[start..end] (both bounds inclusive) into ascending order using
 * top-down merge sort.
 *
 *   num   array to sort
 *   start index of the first element of the range
 *   end   index of the last element of the range
 *
 * A range with fewer than two elements (start >= end) is left untouched.
 */
void merge_sort(int num[], int start, int end)
{
    if (start >= end)
        return;

    /* Written as start + half-width so the sum cannot overflow int. */
    int mid = start + (end - start) / 2;

    merge_sort(num, start, mid);
    merge_sort(num, mid + 1, end);
    merge(num, start, mid, end);
}

void MergeFile()
{
    filenumtemp = 0;
    while (filenum != 1)
    {
        while (filenumtemp < filenum)
        {
            if ((filenum - filenumtemp) == 1)
            {
                FILE *f1, *f;

                char filename1[10] = { "" };
                filenumtemp++;
                filename1[0] = filenumtemp + 48;
                strcat(filename1, ".txt");
                f1 = fopen(filename1, "r");

                filenumend++;
                char filename[10] = { "" };
                filename[0] = filenumend + 48;
                strcat(filename, "temp.txt");
                f = fopen(filename, "w+");

                int num1;
                while (fscanf(f1, "%d", &num1) != EOF)
                {
                    fprintf(f, "%d\n", num1);
                }
                fclose(f1);
                fclose(f);
            }
            else
            {
                FILE *f1, *f2, *f;
                char filename1[10] = { "" };
                filenumtemp++;
                filename1[0] = filenumtemp + 48;
                strcat(filename1, ".txt");
                f1 = fopen(filename1, "r");

                char filename2[10] = { "" };
                filenumtemp++;
                filename2[0] = filenumtemp + 48;
                strcat(filename2, ".txt");
                f2 = fopen(filename2, "r");

                filenumend++;
                char filename[10] = { "" };
                filename[0] = filenumend + 48;
                strcat(filename, "temp.txt");
                f = fopen(filename, "w+");

                int temp;
                int count = 0;
                int num1, num2;
                fscanf(f1, "%d", &num1);
                fscanf(f2, "%d", &num2);
                while (true)
                {
                    if (num1 < num2)
                    {
                        fprintf(f, "%d\n", num1);
                        if (fscanf(f1, "%d", &num1) == EOF)
                        {
                            fprintf(f, "%d\n", num2);
                            while (fscanf(f2, "%d", &num2) != EOF)
                            {
                                fprintf(f, "%d\n", num2);
                            }
                            break;
                        }
                    }
                    else
                    {
                        fprintf(f, "%d\n", num2);
                        if (fscanf(f2, "%d", &num2) == EOF)
                        {
                            fprintf(f, "%d\n", num1);
                            while (fscanf(f1, "%d", &num1) != EOF)
                            {
                                fprintf(f, "%d\n", num1);
                            }
                            break;
                        }
                    }
                }
                fclose(f1);
                fclose(f2);
                fclose(f);
            }

            char filename1[10] = { "" };
            char filename2[10] = { "" };
            filename1[0] = filenumend + 48;
            filename2[0] = filenumend + 48;
            strcat(filename1, "temp.txt");
            strcat(filename2, ".txt");

            char filename3[10] = { "" };
            char filename4[10] = { "" };
            filename3[0] = filenumend * 2 - 1 + 48;
            filename4[0] = filenumend * 2 + 48;
            strcat(filename3, ".txt");
            strcat(filename4, ".txt");

            int r1 = remove(filename3);
            int r2 = remove(filename4);
            printf("r1=%d r2=%d\n", r1, r2);
            rename(filename1, filename2);

            //printf("filenum=%d filenumtemp=%d filenumend=%d\n", filenum, filenumtemp, filenumend);
            if (filenumtemp == filenum&&filenum != 1)
            {
                filenum = filenumend;
                filenumtemp = 0;
                filenumend = 0;
            }
        }
    }
    char filename1[20] = { "" };
    char filename2[20] = { "" };
    filename1[0] = 1 + 48;
    strcat(filename1, ".txt");
    strcat(filename2, "test_sort.txt");
    rename(filename1, filename2);
    printf("排序完成,有序序列保存在:test_sort.txt文件中\n");
}

void CreatTempFile(int temp[], int count)
{
    FILE *f;
    char filename[10] = { "" };
    filename[0] = filenum + 48;
    strcat(filename, ".txt");
    f = fopen(filename, "w+");
    for (int i = 0; i < count; ++i)
    {
        fprintf(f, "%d\n", temp[i]);
    }
    fclose(f);
}

/*
 * Split "test.txt" into sorted run files.
 *
 * Reads the integers of "test.txt" in chunks of at most MAXNUM values.
 * Each chunk is sorted in memory with merge_sort and written out through
 * CreatTempFile; the global filenum is incremented before every write, so
 * the runs are numbered from filenum + 1 upwards. The final chunk may hold
 * fewer than MAXNUM values and is sorted like any other.
 *
 * Reading stops at end of file or at the first token that is not an
 * integer. Terminates the program if the input cannot be opened or the
 * chunk buffer cannot be allocated.
 */
void SortFile()
{
    FILE *f = fopen("test.txt", "r");
    if (f == NULL)
    {
        perror("test.txt");
        exit(EXIT_FAILURE);
    }

    /* The cast keeps this valid when the file is built as C++. */
    int *temp = (int *)malloc(MAXNUM * sizeof(int));
    if (temp == NULL)
    {
        perror("SortFile");
        fclose(f);
        exit(EXIT_FAILURE);
    }

    int count = 0;
    /* "== 1" (not "!= EOF") so a malformed token ends the loop instead of
     * being retried forever. */
    while (fscanf(f, "%d", &temp[count]) == 1)
    {
        count++;
        if (count == MAXNUM)
        {
            filenum++;
            merge_sort(temp, 0, count - 1);
            CreatTempFile(temp, count);
            count = 0;
        }
    }

    /* Leftover partial chunk: it must be sorted too, because the merge
     * phase assumes every run is in ascending order. */
    if (count != 0)
    {
        filenum++;
        merge_sort(temp, 0, count - 1);
        CreatTempFile(temp, count);
    }

    fclose(f);
    free(temp);
}

/*
 * Entry point of the external-sort demo: runs the three phases in order
 * and returns 0.
 */
int main(void)
{
    /* Phase 1: store 10000 random numbers in test.txt. */
    CreatFile();

    /* Phase 2: cut test.txt into chunk files and sort them. */
    SortFile();

    /* Phase 3: merge the chunk files into one sorted file. */
    MergeFile();

    return 0;
}

注:使用败者树进行多路合并可加快文件的排序速度,这里只使用了2-路归并排序。对原文件的预处理排序可使用任何一种内部排序算法,当然效率越高、排序越稳定越好。

先让我们看看原题的三个任务介绍: Task 1: Sorting the LINEITEM table by External Merge Sort Consider two cases: 1) using 5 buffer pages in memory for the external merge sort; 2) using 129 buffer pages in memory for the external merge sort. In the implementation, each buffer page occupies 8K bytes. The ORDERKEY attribute of the LINEITEM table is assumed to be the sort key in the external merge sort. Please report the number of passes and also the running time of the external merge sort in each case. Task 2: Organizing the sorted LINEITEM table into disk pages Please use the page format for storing variable-length records to organize the LINEITEM table sorted in Task 1. In the implementation, each disk page occupies 1K bytes. For each page we maintain a directory of slots, with a (record offset, record length) pair per slot. Both “record offset” and “record length” are 4 bytes wide. Task 3: Building a B-Tree over LINEITEM disk pages by Bulk Loading. Please use bulk loading to build a B-Tree over the disk pages of the LINEITEM table, which are generated in Task 2. The ORDERKEY attribute of the LINEITEM table is used as the (search) key for building the B-Tree. In the B-Tree, each internal node corresponds to a page of 1K bytes, both key and pointer are 4 bytes wide. Please report the running time of the bulk loading. A query interface is required for checking the B-Tree. For a reasonable ORDERKEY value, please print out all the pages visited along the path to find the corresponding record. Please also report the running time of the search.
评论 7
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值