前段时间出去面试,被问到:一个文件中存储了十亿个单词,如何统计每个单词出现的次数?其实思想就是map-reduce,今天有空写个demo试一下。先抛开十亿个单词不说,假如现在只有100个单词,该如何统计?很简单:用HashMap,如果key不存在,则初始化value=1;如果key已存在,则value加1。具体看下面的demo:
package mapreducetest;
import java.io.*;
import java.util.HashMap;
import java.util.Map;
import java.util.Random;
/**
* map-reduce统计文件每个单词出现的次数
*/
public class MapReduceTest {
private String filePath;
/**
 * Creates an instance bound to a single data file.
 *
 * @param filePath path of the file that this instance writes test data to and reads from
 */
public MapReduceTest(final String filePath) {
    this.filePath = filePath;
}
/**
 * Generates the test data file at {@code filePath}, overwriting any existing content.
 *
 * <p>Writes ten million pseudo-random integers, each in the range 0 to 9999 inclusive,
 * 20 per line. Every number is followed by a single space, so each line ends with a
 * trailing space before the line separator.
 *
 * @throws IOException if the file cannot be opened or written
 */
public void produceFile() throws IOException {
    final int size = 10000000;
    final int numbersPerRow = 20;
    Random random = new Random();
    // try-with-resources closes (and therefore flushes) the writer on every path, and a
    // failure to open the file surfaces as the original IOException instead of a
    // NullPointerException raised while closing a writer that was never created.
    try (BufferedWriter bw =
            new BufferedWriter(new OutputStreamWriter(new FileOutputStream(filePath)))) {
        for (int written = 1; written <= size; written++) {
            bw.write(Integer.toString(random.nextInt(10000)));
            bw.write(' ');
            // Break the line after every full group of numbersPerRow values.
            if (written % numbersPerRow == 0) {
                bw.newLine();
            }
        }
    }
}
/**
* reduce文件计数
* 计算每个数字出现的次数
* @return
* @throws IOException
*/
public Map<String, Integer> count() throws IOException {
Map<String, Integer> countMap = new HashMap<>();
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(filePath)));
String line;
try{
while ((line = br.readLine()) != null) {
String[] values = line.split(" ");
for (String val : values) {
Integer oldVal = countMap.putIfAbsent