大文件排序
对于很大的文件中的内容进行排序,不能和普通的排序一样将所有的数据读取到内存,然后对数据进行排序,因此需要使用外部排序进行整个文件的排序。
基本思路
首先将文件分割成一个个小文件,对每个小文件的内容使用普通的排序方法进行排序;所有小文件排序完毕后,对这些文件两两合并(使用归并排序的思想进行合并),并重复这一过程直到只剩下一个文件,最终得到排好序的完整文件。
简单实现
package com.fengyangdi.sort;
import java.io.*;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Random;
/**
 * External merge sort for a text file that holds one integer per line.
 *
 * <p>The input is cut into sorted chunk files of at most {@code SPLIT_SIZE}
 * values each (stored in a {@code tmp0} directory next to the input). The
 * chunks are then merged pairwise, alternating between the {@code tmp0} and
 * {@code tmp1} directories, until the last merge writes the fully sorted
 * data back over the original file.
 *
 * <p>Note: files that already exist in {@code tmp0}/{@code tmp1} before the
 * sort starts are treated as chunks and merged into the result, so those
 * directories should not contain unrelated files.
 *
 * Created by GGM on 2016/9/22.
 */
public class BigFileSort {
    /** Maximum number of values sorted in memory and written to one chunk file. */
    private static final int SPLIT_SIZE = 10 * 10000;
    /** Names of the two working directories the merge rounds alternate between. */
    private static final String[] TMP_DIR_NAMES = {"tmp0", "tmp1"};
    /** File name prefix of every chunk file. */
    private static final String CHUNK_PREFIX = "tmp_";

    /** Directory that contains the file being sorted (and the working directories). */
    private String parentPath;
    /** Path of the file being sorted; also the destination of the final merge. */
    private String filename;

    /**
     * Sorts the integers stored in {@code fileName} (one per line) in ascending
     * order, replacing the file's content with the sorted result.
     *
     * <p>I/O failures are reported on standard error. The input file is only
     * overwritten by the final merge step, so a failure while splitting leaves
     * it untouched. An empty input is left as it is.
     *
     * @param fileName path of the file to sort; {@code null} or empty is rejected
     * @throws NumberFormatException if a non-blank line is not a valid {@code int}
     */
    public void sort(String fileName) {
        if (fileName == null || fileName.length() <= 0) {
            System.out.println("The filename is invalid");
            return;
        }
        this.filename = fileName;
        createDirSaveSplitFile(fileName);
        try {
            // Only merge (and thereby overwrite the input) when the split
            // succeeded and actually produced data.
            if (splitFile(fileName) > 0) {
                mergeFiles(0);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        deleteDirs();
    }

    /** Removes the working directories; non-empty directories are left in place. */
    private void deleteDirs() {
        for (int i = 0; i < TMP_DIR_NAMES.length; i++) {
            File dir = tmpDir(i);
            // File.delete() refuses to remove a non-empty directory, so leftover
            // chunks (e.g. after a failed merge) are never destroyed here.
            if (dir.exists() && !dir.delete()) {
                System.err.println("Could not remove temporary directory: " + dir);
            }
        }
    }

    /**
     * Repeatedly merges the chunk files pairwise until at most two remain, then
     * writes the final result to the file being sorted.
     *
     * @param level index (0 or 1) of the working directory holding the chunks
     * @throws IOException if a chunk cannot be read, written or deleted
     */
    private void mergeFiles(int level) throws IOException {
        int mergeIndex = level;
        while (true) {
            File[] files = tmpDir(mergeIndex).listFiles();
            if (files == null || files.length <= 0) {
                return;
            }
            if (files.length == 1) {
                copyFile(files[0], this.filename);
                deleteFile(files[0]);
                return;
            }
            if (files.length == 2) {
                megeTwoFiles(this.filename, files[0], files[1]);
                deleteFile(files[0]);
                deleteFile(files[1]);
                return;
            }

            File saveDir = tmpDir(1 - mergeIndex);
            int index = 0;
            int i;
            for (i = 0; i < files.length - 1; i += 2) {
                File merged = new File(saveDir, CHUNK_PREFIX + index++);
                megeTwoFiles(merged.getPath(), files[i], files[i + 1]);
                // A chunk that survives here would be merged again in a later
                // round and duplicate data, so a failed delete is an error.
                deleteFile(files[i]);
                deleteFile(files[i + 1]);
            }
            if (i == files.length - 1) {
                // Odd chunk without a partner: carry it over to the next round.
                moveFile(files[i], new File(saveDir, CHUNK_PREFIX + index));
            }
            mergeIndex = 1 - mergeIndex;
        }
    }

    /**
     * Copies {@code file} line by line to {@code fileName}, overwriting it.
     *
     * @throws IOException if reading or writing fails
     */
    private void copyFile(File file, String fileName) throws IOException {
        try (BufferedReader reader = new BufferedReader(new FileReader(file));
             BufferedWriter writer = new BufferedWriter(new FileWriter(fileName))) {
            copyRemaining(reader.readLine(), reader, writer);
        }
    }

    /**
     * Merges two ascending chunk files into one ascending file.
     *
     * @param filenamem path of the output file (overwritten)
     * @param file1 first sorted input
     * @param file2 second sorted input
     * @throws IOException if reading or writing fails
     */
    private void megeTwoFiles(String filenamem, File file1, File file2) throws IOException {
        try (BufferedReader reader1 = new BufferedReader(new FileReader(file1));
             BufferedReader reader2 = new BufferedReader(new FileReader(file2));
             BufferedWriter writer = new BufferedWriter(new FileWriter(filenamem))) {
            String line1 = reader1.readLine();
            String line2 = reader2.readLine();
            // Parsed values are cached so each line is parsed only once.
            int num1 = line1 == null ? 0 : Integer.parseInt(line1);
            int num2 = line2 == null ? 0 : Integer.parseInt(line2);
            while (line1 != null && line2 != null) {
                if (num1 <= num2) {
                    writeLine(writer, line1);
                    line1 = reader1.readLine();
                    if (line1 != null) {
                        num1 = Integer.parseInt(line1);
                    }
                } else {
                    writeLine(writer, line2);
                    line2 = reader2.readLine();
                    if (line2 != null) {
                        num2 = Integer.parseInt(line2);
                    }
                }
            }
            // At most one of the inputs still has data left.
            copyRemaining(line1, reader1, writer);
            copyRemaining(line2, reader2, writer);
        }
    }

    /**
     * Creates the two working directories next to the file being sorted and
     * remembers that parent directory in {@code parentPath}.
     *
     * @param fileName path of the file being sorted
     */
    private void createDirSaveSplitFile(String fileName) {
        parentPath = new File(fileName).getAbsoluteFile().getParent();
        for (int i = 0; i < TMP_DIR_NAMES.length; i++) {
            File dir = tmpDir(i);
            if (!dir.exists()) {
                dir.mkdir();
            }
        }
    }

    /**
     * Splits the input into sorted chunk files of at most {@code SPLIT_SIZE}
     * values inside the first working directory. Blank lines are skipped.
     * If splitting fails, the chunks written so far are removed again.
     *
     * @param fileName path of the file to split
     * @return number of chunk files written (0 for an input without values)
     * @throws IOException if the input cannot be read or a chunk cannot be written
     * @throws NumberFormatException if a non-blank line is not a valid {@code int}
     */
    private int splitFile(String fileName) throws IOException {
        File chunkDir = tmpDir(0);
        ArrayList<File> chunks = new ArrayList<>();
        ArrayList<Integer> buffer = new ArrayList<>(SPLIT_SIZE);
        try (BufferedReader reader = new BufferedReader(new FileReader(fileName))) {
            String line;
            while ((line = reader.readLine()) != null) {
                line = line.trim();
                if (line.isEmpty()) {
                    continue;
                }
                buffer.add(Integer.parseInt(line));
                if (buffer.size() == SPLIT_SIZE) {
                    writeChunk(buffer, chunkDir, chunks);
                }
            }
            // Flush the final, partially filled chunk; without this every
            // value after the last full chunk would be lost.
            if (!buffer.isEmpty()) {
                writeChunk(buffer, chunkDir, chunks);
            }
        } catch (IOException | RuntimeException e) {
            // Do not leave partial chunks behind: a later run would merge them.
            for (File chunk : chunks) {
                chunk.delete();
            }
            throw e;
        }
        return chunks.size();
    }

    /** Sorts {@code buffer}, writes it as the next chunk file and clears it. */
    private void writeChunk(ArrayList<Integer> buffer, File chunkDir, ArrayList<File> chunks)
            throws IOException {
        File chunk = new File(chunkDir, CHUNK_PREFIX + chunks.size());
        chunks.add(chunk);
        Collections.sort(buffer);
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(chunk))) {
            for (Integer value : buffer) {
                writeLine(writer, value.toString());
            }
        }
        buffer.clear();
    }

    /** Returns the working directory with the given index (0 or 1). */
    private File tmpDir(int index) {
        return new File(parentPath, TMP_DIR_NAMES[index]);
    }

    /** Moves {@code source} to {@code target}, falling back to copy-and-delete. */
    private void moveFile(File source, File target) throws IOException {
        if (!source.renameTo(target)) {
            copyFile(source, target.getPath());
            deleteFile(source);
        }
    }

    /** Deletes {@code file}, failing loudly if it cannot be removed. */
    private static void deleteFile(File file) throws IOException {
        if (!file.delete()) {
            throw new IOException("Could not delete temporary file: " + file);
        }
    }

    /** Writes {@code first} (if non-null) and all remaining lines of {@code reader}. */
    private static void copyRemaining(String first, BufferedReader reader, BufferedWriter writer)
            throws IOException {
        for (String line = first; line != null; line = reader.readLine()) {
            writeLine(writer, line);
        }
    }

    /** Writes one line followed by the platform line separator. */
    private static void writeLine(BufferedWriter writer, String line) throws IOException {
        writer.write(line);
        writer.newLine();
    }

    /**
     * Demo: fills a file with 10,000,000 random integers in [0, 20000), sorts it
     * and prints the result, 50 values per row.
     *
     * @param args optional: path of the data file (defaults to {@code D:\test\data.txt})
     * @throws IOException if the demo file cannot be written or read
     */
    public static void main(String[] args) throws IOException {
        String path = args.length > 0 ? args[0] : "D:\\test\\data.txt";
        Random random = new Random();
        try (BufferedWriter bw = new BufferedWriter(new FileWriter(path))) {
            for (int j = 0; j < 10000000; j++) {
                bw.write(Integer.toString(random.nextInt(20000)));
                bw.newLine();
            }
        }
        new BigFileSort().sort(path);
        try (BufferedReader br = new BufferedReader(new FileReader(path))) {
            String line;
            int count = 0;
            while ((line = br.readLine()) != null) {
                System.out.print(line + " ");
                count++;
                if (count == 50) {
                    System.out.println();
                    count = 0;
                }
            }
        }
    }
}