Hadoop的PathFilter使用
源码接口定义:
/**
 * Filter that decides, one path at a time, whether a path belongs in a
 * pathname list.
 */
public interface PathFilter {
/**
 * Tests whether or not the specified abstract pathname should be
 * included in a pathname list.
 *
 * @param path the abstract pathname to be tested
 * @return {@code true} if and only if {@code path} should be included
 */
boolean accept(Path path);
}
用法:
/**
 * Path filter that accepts every directory and, among regular files, only
 * those whose immediate parent directory has one of a fixed set of names.
 *
 * <p>The configuration is obtained through {@code getConf()} inherited from
 * {@code Configured}; it must have been set before {@link #accept(Path)} is
 * called.
 */
static class TextPathFilter extends Configured implements PathFilter {

    /** Names of the parent directories whose files are accepted. */
    private static final List<String> ACCEPTED_PARENT_DIRS = new ArrayList<String>();

    static {
        // Built once instead of on every accept() call; never modified afterwards.
        ACCEPTED_PARENT_DIRS.add("input1");
        ACCEPTED_PARENT_DIRS.add("input2");
        ACCEPTED_PARENT_DIRS.add("input3");
        ACCEPTED_PARENT_DIRS.add("input4");
    }

    /**
     * Decides whether {@code path} passes the filter.
     *
     * @param path the path to test
     * @return {@code true} if {@code path} is a directory, or a regular file
     *         whose parent directory name is in the accepted set;
     *         {@code false} otherwise, including when the path's status
     *         cannot be read
     */
    @Override
    public boolean accept(Path path) {
        try {
            // Resolve the file system that owns this path. FileSystem.get(conf)
            // would return the default file system, which is the wrong one when
            // the input lives on a different scheme/authority.
            FileSystem fs = path.getFileSystem(getConf());
            FileStatus status = fs.getFileStatus(path);

            // Directories always pass (presumably so that recursive input
            // listing can descend into them -- confirm against the driver).
            if (status.isDirectory()) {
                return true;
            }
            if (!status.isFile()) {
                return false;
            }

            Path parent = status.getPath().getParent();
            return parent != null && ACCEPTED_PARENT_DIRS.contains(parent.getName());
        } catch (IOException e) {
            // accept() cannot throw a checked exception, so keep the original
            // best-effort behaviour (exclude the path) but say which path failed.
            System.err.println("TextPathFilter: cannot read status of " + path + "; excluding it");
            e.printStackTrace();
            return false;
        }
    }
}
Driver类中的作业配置:
FileInputFormat.addInputPath(job, new Path(otherArgs[0])); //输入路径
FileInputFormat.setInputDirRecursive(job, true);// 递归输入
FileInputFormat.setInputPathFilter(job, TextPathFilter.class); //指定pathfilter类