Main program:
package hdfs.wordcount;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map.Entry;
import java.util.Properties;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
public class HDFSWordCount {

    /**
     * Main wordcount flow: read the input data and hand each line to the
     * business-logic class for processing.
     */
    public static void main(String[] args) throws Exception {

        /*
         * Load the configuration file and obtain the business-logic object via reflection.
         */
        Properties prop = new Properties();
        // load job.properties from the classpath of HDFSWordCount
        prop.load(HDFSWordCount.class.getClassLoader().getResourceAsStream("job.properties"));
        // instantiate the business class by name, so another implementation of the
        // same interface can be plugged in simply by changing the property value
        Class<?> mapperClass = Class.forName(prop.getProperty("MAPPER_CLASS"));
        Mapper mapper = (Mapper) mapperClass.newInstance();
        Context context = new Context();

        FileSystem fs = FileSystem.get(new URI("hdfs://hdp-01:9000"), new Configuration(), "root");
        RemoteIterator<LocatedFileStatus> iter = fs.listFiles(new Path("/wordcount/input"), false);
        while (iter.hasNext()) {
            LocatedFileStatus file = iter.next();
            // open an input stream on the current file
            FSDataInputStream in = fs.open(file.getPath());
            // wrap the input stream in a buffered reader
            BufferedReader br = new BufferedReader(new InputStreamReader(in));
            String line = null;
            while ((line = br.readLine()) != null) {
                // Delegate to the business logic so this flow stays a reusable framework:
                // when the business rules change, the framework itself does not.
                // mapper is the object instantiated from the class named in the properties file.
                mapper.map(line, context);
            }
            br.close();
            in.close();
        }

        // make sure the output directory does not exist yet, then create it
        Path path = new Path("/wordcount/output/");
        if (fs.exists(path)) {
            throw new RuntimeException("Output path already exists!");
        } else {
            fs.mkdirs(path);
        }

        HashMap<Object, Object> contextMap = context.getContextMap();
        FSDataOutputStream out = fs.create(new Path("/wordcount/output", "res.dat"));
        Set<Entry<Object, Object>> entrySet = contextMap.entrySet();
        for (Entry<Object, Object> entry : entrySet) {
            out.write((entry.getKey().toString() + "\t" + entry.getValue().toString() + "\n").getBytes());
        }
        out.close();
        fs.close();
        System.out.println("wordcount finished!");
    }
}
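Once the job has run, the result can be read back from HDFS with the same FileSystem API used above. A small verification sketch, assuming the same NameNode address, user and output path as the main program (the class name ReadResult is made up for this example):
package hdfs.wordcount;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
// Verification sketch (not part of the original program): reads the result
// file back from HDFS and prints it. Cluster address, user and output path
// are assumed to match the main program above.
public class ReadResult {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new URI("hdfs://hdp-01:9000"), new Configuration(), "root");
        FSDataInputStream in = fs.open(new Path("/wordcount/output/res.dat"));
        BufferedReader br = new BufferedReader(new InputStreamReader(in));
        String line;
        while ((line = br.readLine()) != null) {
            System.out.println(line); // each line: word \t count
        }
        br.close();
        in.close();
        fs.close();
    }
}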
Business-logic interface:
package hdfs.wordcount;
public interface Mapper {
    public void map(String line, Context context);
}
Business-logic implementation:
package hdfs.wordcount;
public class WordCountMapper implements Mapper {

    /**
     * Word-count business logic: split each line into words and
     * increment the count kept in the Context for every word seen.
     */
    @Override
    public void map(String line, Context context) {
        String[] words = line.split(" ");
        for (String word : words) {
            Object value = context.get(word);
            if (value == null) {
                context.write(word, 1);
            } else {
                int v = (int) value;
                context.write(word, v + 1);
            }
        }
    }
}
The Context class uses a HashMap to store each word together with its count:
package hdfs.wordcount;
import java.util.HashMap;
/**
 * The map method of the Mapper interface receives a line and this Context.
 * The Context wraps a HashMap in key/value form: the word is the key and its count is the value.
 */
public class Context {

    private HashMap<Object, Object> contextMap = new HashMap<>();

    // write a key/value pair into the map
    public void write(Object key, Object value) {
        contextMap.put(key, value);
    }

    // read the value currently stored for a key
    public Object get(Object key) {
        return contextMap.get(key);
    }

    // expose the collected results to the caller
    public HashMap<Object, Object> getContextMap() {
        return contextMap;
    }
}
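Because Context is just an in-memory HashMap wrapper, a Mapper implementation can be exercised locally without touching HDFS at all. A minimal sketch (the class name LocalMapperTest is made up for this example):
package hdfs.wordcount;
// Local sketch: drives WordCountMapper directly with sample lines
// instead of lines read from HDFS, then prints the collected counts.
public class LocalMapperTest {
    public static void main(String[] args) {
        Mapper mapper = new WordCountMapper();
        Context context = new Context();
        mapper.map("hello world", context);
        mapper.map("hello hdfs", context);
        // expected output (in arbitrary order): hello=2, world=1, hdfs=1
        context.getContextMap().forEach((k, v) -> System.out.println(k + "\t" + v));
    }
}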
Case-insensitive wordcount implementation:
package hdfs.wordcount;
public class IgnWordCountMapper implements Mapper {

    @Override
    public void map(String line, Context context) {
        // upper-case the whole line first so that counting ignores case
        String[] words = line.toUpperCase().split(" ");
        for (String word : words) {
            Object value = context.get(word);
            if (value == null) {
                context.write(word, 1);
            } else {
                int v = (int) value;
                context.write(word, v + 1);
            }
        }
    }
}
The job.properties configuration file names the business-logic class that the main program loads via reflection:
MAPPER_CLASS=hdfs.wordcount.WordCountMapper
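To switch to the case-insensitive implementation shown above, only the property value has to change; the framework code itself stays untouched:
MAPPER_CLASS=hdfs.wordcount.IgnWordCountMapper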