往事看到一道支付宝笔试题,自己做了一下,尽管效率不高,也是个人思考的结果。题目如下:
有一个100G大小的文件,里面存的全是数字,并且每个数字之间用逗号隔开。现在要从这一大堆数字中找出最大的100个数。
做法:
假设数字为4字节整数,逗号为2字节unicode字符,即每条记录占6字节。100G文件本人电脑无法容纳,所以取2亿个整数,文件大小约为 2亿 × 6字节 = 1.2G
1. 生成二进制文件(使用DataOutputStream,使用缓冲区,耗时79秒):
// Generates the binary test file: 200,000,000 records, each a random 4-byte int
// (writeInt) followed by a 2-byte ',' separator (writeChar), i.e. 6 bytes per record.
File file = new File("E:\\test.dat");
if (!file.exists()) {
    file.createNewFile();
}
long time = System.currentTimeMillis();
Random random = new Random();
long count = 200000000;
// NOTE(review): this message is misleading -- count is the number of records written,
// not the maximum long/int value in Java. The text is kept unchanged here.
System.out.println(count+"is max long int in java");
DataOutputStream stream = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(file)));
try {
    for (long i = 0; i < count; i++) {
        stream.writeInt(random.nextInt());
        stream.writeChar(',');
    }
    System.out.println("循环完成");
    stream.flush();
} finally {
    // Always release the file handle, even if a write fails part-way through.
    stream.close();
}
time = System.currentTimeMillis() - time;
System.out.println(time+"毫秒");
2. 分析文件(使用DataInputStream,使用缓冲区,耗时65秒)
a. 读取前100个整数
b. 排序,把排序后的数组看成堆,最小值在根节点
c. 遍历整个文件,把读到的数和堆顶的最小值比较:如果小于或等于最小值,则丢弃;如果比最小值大,则用它替换最小值并重新调整堆。
d. 文件读取完毕,堆中的元素就是要找的100个最大值,再执行一次排序。
TestRead.java
/**
 * Scans the hard-coded data file and prints the 100 largest ints it contains.
 *
 * <p>Each record is read as a 4-byte int followed by a 2-byte char separator.
 * The first {@code len} values seed a sorted array that is then treated as a
 * min-heap (smallest value at index 0). Every later value larger than the root
 * replaces it via {@code addToheap}. The array is printed once after seeding and
 * once, re-sorted, at the end together with the elapsed time and record count.
 *
 * @param args unused
 * @throws IOException if the file cannot be opened or read; this includes an
 *         {@code EOFException} when the file holds fewer than {@code len} records
 * @throws InterruptedException never thrown by this body; kept for signature compatibility
 */
public static void main(String[] args) throws IOException, InterruptedException {
    File file = new File("E:\\test.dat");
    long time = System.currentTimeMillis();
    int len = 100;
    long count = len;
    int arr[] = new int[len];
    DataInputStream stream = new DataInputStream(new BufferedInputStream(new FileInputStream(file)));
    try {
        for (int i = 0; i < len; i++) {
            arr[i] = stream.readInt();
            stream.readChar();
        }
        // An ascending array already satisfies the min-heap property.
        Arrays.sort(arr);
        print(arr);
        while (true) {
            int temp;
            try {
                temp = stream.readInt();
                stream.readChar();
            } catch (EOFException eof) {
                // End of file (or a truncated trailing record): stop scanning.
                break;
            }
            count++;
            if (temp > arr[0]) {
                addToheap(arr, temp);
            }
        }
    } finally {
        // Close the stream even when a read fails with something other than EOF.
        stream.close();
    }
    time = System.currentTimeMillis() - time;
    System.out.println(time+"毫秒"+":"+count+"个");
    Arrays.sort(arr);
    print(arr);
}
/**
 * Replaces the root of an array-backed min-heap with {@code temp} and sifts it
 * down until neither child is smaller.
 *
 * <p>Children of slot {@code i} live at {@code 2*i + 1} and {@code 2*i + 2}.
 *
 * @param arr  heap storage, modified in place; index 0 is overwritten
 * @param temp value that takes the place of the current root
 */
static void addToheap(int arr[], int temp){
    arr[0] = temp;
    final int size = arr.length;
    int parent = 0;
    for (int child = 1; child < size; child = 2 * parent + 1) {
        // Pick the smallest among the parent and its (one or two) children.
        int smallest = parent;
        if (arr[child] < arr[smallest]) {
            smallest = child;
        }
        int sibling = child + 1;
        if (sibling < size && arr[sibling] < arr[smallest]) {
            smallest = sibling;
        }
        if (smallest == parent) {
            return;
        }
        int swapped = arr[parent];
        arr[parent] = arr[smallest];
        arr[smallest] = swapped;
        parent = smallest;
    }
}
/**
 * Writes the values to standard output, each followed by a comma, starting a
 * new line after every tenth value.
 *
 * @param aa values to print
 */
static void print(int[] aa) {
    int printed = 0;
    for (int value : aa) {
        System.out.print(value + ",");
        printed++;
        if (printed % 10 != 0) {
            continue;
        }
        System.out.println();
    }
}
3. 使用内存映射(nio)代替DataInputStream,用时12秒。只能在MyEclipse6.5自带的jre下运行;使用jdk1.5、jdk1.6时
会报存储空间不足、堆空间不足的错误。
4. 对文件进行分段映射(暂时分为10段),每个线程负责读取一段,找出该段最大的100个,
再从找到的10×100个数中找出最大的100个,用时10秒,性能没有显著改善。(注意:下面贴出的代码中分段数 reads 取值为200。)
package test;
import java.io.RandomAccessFile;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedList;
/**
 * Finds the 100 largest ints in the data file by splitting it into equal
 * segments, scanning each segment on its own thread with a {@code RandomReader},
 * and merging the per-segment results into a single min-heap.
 */
public class MultiThreadReader {
    /**
     * Starts one reader thread per segment, waits for all of them to finish,
     * merges their results and prints the final 100 values plus the elapsed time.
     *
     * <p>Readers that did not finish successfully ({@code done} is false after the
     * thread ended) are reported on standard error and skipped, so the run always
     * terminates instead of waiting forever.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        long time = System.currentTimeMillis();
        // Total file size in bytes: record count times 6 bytes per record (long arithmetic).
        long len = 200000000L * 6;
        int reads = 200;
        RandomReader[] readers = new RandomReader[reads];
        Thread[] workers = new Thread[reads];
        for (int i = 0; i < reads; i++) {
            // Byte offset of the segment, and the number of 6-byte records it holds.
            readers[i] = new RandomReader(i*len/reads, len/reads/6);
            workers[i] = new Thread(readers[i]);
            workers[i].start();
        }
        int numberNeedFound = 100;
        int firstArr[] = null;
        for (int i = 0; i < reads; i++) {
            try {
                // join() also makes the reader's writes to done/arr visible to this thread.
                workers[i].join();
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                e.printStackTrace();
                return;
            }
            RandomReader finished = readers[i];
            if (!finished.done) {
                System.err.printf("reader %d did not finish; its segment is skipped\n", finished.sid);
                continue;
            }
            if (firstArr == null) {
                // The first successful result becomes the heap the others are merged into.
                firstArr = finished.arr;
                continue;
            }
            for (int j = 0; j < finished.arr.length; j++) {
                if (finished.arr[j] > firstArr[0]) {
                    TestRead.addToheap(firstArr, finished.arr[j]);
                }
            }
        }
        if (firstArr == null) {
            // No reader succeeded: fall back to an all-zero array, as the original initial value.
            firstArr = new int[numberNeedFound];
        }
        time = System.currentTimeMillis() - time;
        Arrays.sort(firstArr);
        TestRead.print(firstArr);
        // The elapsed value comes from currentTimeMillis, so it is labelled as milliseconds.
        System.out.printf("使用时间%d毫秒\n", time);
    }
}
/**
 * Scans one memory-mapped segment of the data file and keeps the
 * {@code numberNeedFound} largest ints seen in {@link #arr}.
 *
 * <p>Records are consumed as a 4-byte int followed by a 2-byte char, i.e. 6 bytes
 * each. {@link #done} is set only after the whole segment was scanned
 * successfully; it is volatile so that a thread polling it also sees the final
 * contents of {@link #arr}.
 */
class RandomReader implements Runnable {
    /** Byte offset of the segment within the file. */
    long offset = 0;
    /** Number of records in the segment. */
    long len = 10;
    /** File the segment was mapped from; already closed once the constructor returns. */
    RandomAccessFile file;
    /** True once run() has scanned the whole segment without an exception. */
    volatile boolean done = false;
    int numberNeedFound = 100;
    /** Min-heap of the largest values found so far (smallest at index 0). */
    int arr[] = new int[numberNeedFound];
    /** Read-only mapping of the segment; null if mapping failed. */
    MappedByteBuffer buffer;
    static int id = 0;
    /** Sequence number of this reader, used in progress messages. */
    int sid;

    /**
     * Maps {@code len} records starting at byte {@code offset} of the data file.
     * Failures are printed and leave {@link #buffer} null.
     *
     * @param offset byte offset of the segment
     * @param len    number of 6-byte records in the segment
     */
    public RandomReader(long offset, long len) {
        sid = id++;
        this.offset = offset;
        this.len = len;
        try {
            file = new RandomAccessFile("E:\\test.dat", "r");
            try {
                buffer = file.getChannel().map(FileChannel.MapMode.READ_ONLY, offset, len*6);
            } finally {
                // A mapping stays valid after its channel is closed, so release the
                // file descriptor right away instead of leaking one per reader.
                file.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Seeds the heap with the first {@code numberNeedFound} values, then scans the
     * rest of the segment, replacing the heap root whenever a larger value appears.
     * Any failure is reported as "reader ... is dead" and leaves {@link #done} false.
     */
    public void run() {
        int count = 0;
        if (buffer == null) {
            // Mapping failed in the constructor; there is nothing to scan.
            System.out.printf("reader %d is dead count = %d\n", sid,count);
            return;
        }
        try {
            for (int i = 0; i < numberNeedFound; i++) {
                arr[i] = buffer.getInt();
                buffer.getChar();
                count++;
            }
            // An ascending array already satisfies the min-heap property.
            Arrays.sort(arr);
            int temp = 0;
            while (count < len) {
                temp = buffer.getInt();
                buffer.getChar();
                count++;
                if (temp > arr[0]) {
                    TestRead.addToheap(arr, temp);
                }
                if(count == len/2) {
                    System.out.printf("reader %d completed 50 percent\n", sid);
                }
            }
            done = true;
            System.out.printf("reader %d completed 100 percent count = %d \n", sid,count);
        } catch (Exception e) {
            System.out.printf("reader %d is dead count = %d\n", sid,count);
            e.printStackTrace();
        }
    }
}