// Hadoop的FileUtil工具类中提供了copyMerge()方法,
// 它专门用来将一个HDFS目录下的所有文件合并成一个文件并输出,其源码如下
/**
 * Merges every regular file directly under {@code srcDir} on {@code srcFS} into the single
 * file {@code dstFile} on {@code dstFS}, concatenated in the sorted order of their
 * {@code FileStatus} entries.
 *
 * @param srcFS        file system containing the source directory
 * @param srcDir       directory whose files are concatenated; subdirectories are skipped
 * @param dstFS        file system that receives the merged output
 * @param dstFile      destination file (validated/adjusted via {@code checkDest})
 * @param deleteSource if {@code true}, recursively delete {@code srcDir} after the merge
 * @param conf         configuration handed to {@code IOUtils.copyBytes} (buffer size, etc.)
 * @param addString    optional separator written (UTF-8 encoded) after each file; may be null
 * @return {@code false} when {@code srcDir} is not a directory; otherwise the result of the
 *         delete, or {@code true} when {@code deleteSource} is false
 * @throws IOException if any underlying file-system operation fails
 */
public static boolean copyMerge(FileSystem srcFS, Path srcDir,
                                FileSystem dstFS, Path dstFile,
                                boolean deleteSource,
                                Configuration conf, String addString) throws IOException {
  dstFile = checkDest(srcDir.getName(), dstFS, dstFile, false);
  if (!srcFS.getFileStatus(srcDir).isDirectory()) {
    return false;
  }
  // try-with-resources guarantees the output stream is closed on every exit path.
  try (OutputStream out = dstFS.create(dstFile)) {
    FileStatus[] contents = srcFS.listStatus(srcDir);
    // Sort so the merged output has a deterministic, reproducible file order.
    Arrays.sort(contents);
    for (FileStatus status : contents) {
      if (!status.isFile()) {
        continue; // skip subdirectories and anything that is not a plain file
      }
      try (InputStream in = srcFS.open(status.getPath())) {
        // close=false: the streams' lifetimes are managed by the try-with-resources above.
        IOUtils.copyBytes(in, out, conf, false);
        if (addString != null) {
          // StandardCharsets.UTF_8 avoids the checked UnsupportedEncodingException
          // (and the runtime charset-name lookup) of getBytes("UTF-8").
          out.write(addString.getBytes(java.nio.charset.StandardCharsets.UTF_8));
        }
      }
    }
  }
  return deleteSource ? srcFS.delete(srcDir, true) : true;
}
方式二 : 通过命令 Usage: hadoop fs [generic options] -getmerge [-nl] [-skip-empty-file] <src> <localdst>
将hdfs小文件合并到本地,然后删除hdfs原文件,再将本地文件上传到hdfs
hadoop fs -getmerge -skip-empty-file /hdfspath/* /localpath/test.txt
hdfs dfs -rm -r /hdfspath/*
hdfs dfs -put /localpath/test.txt /hdfspath/