The original plan was simply to convert the Linux commands in classify-20newsgroups.sh into Java code; given the relevant Java tool classes, the implementation itself is straightforward. But the test dataset is large, and the very first step (serializing the data) failed: when hadoop-eclipse-plugin connects to Hadoop, reading the data fails because of the data volume. Embarrassingly, I still have not found a good fix for this bug, so for now it has to be set aside.
Sometimes, when no good solution comes to mind right away, it is best to skip the problem for the moment and move on. To exercise the other tool-class methods, the main method looks like this:
public static void main(String[] args) {
    // create SequenceFiles from the raw text
    mahout_seqdirectory();
    // vectorize (TF-IDF)
    mahout_seq2sparse();
    // split the data into training and test sets
    mahout_split();
    // train the model; the boolean selects complementary naive Bayes (the -c flag)
    mahout_trainnb(true);
    // evaluate on the test set; the boolean again selects complementary naive Bayes
    mahout_testnb(true);
}
Tip: the first step (serialization) may fail here. If it throws an error, you can first run the corresponding command on Linux to generate the data directory that the next step needs.
Package the Java code as a jar and run it on the server.
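One detail worth noting for the server run: every method below loads the cluster configuration from a hard-coded local path (/usr/local/hadoop/conf/core-site.xml). If the jar is launched with hadoop jar, the cluster's configuration directory is already on the classpath, so the explicit addResource call can most likely be dropped; a minimal sketch, assuming a standard hadoop jar launch:

// When launched via `hadoop jar`, core-site.xml and hdfs-site.xml are picked up
// from the classpath automatically, so a plain Configuration is sufficient.
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);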
The utility methods are implemented as follows:
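For completeness, the methods assume the following imports (package paths as of Mahout 0.x and Hadoop 1.x; worth verifying against the versions actually installed on the cluster):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.classifier.naivebayes.test.TestNaiveBayesDriver;
import org.apache.mahout.classifier.naivebayes.training.TrainNaiveBayesJob;
import org.apache.mahout.text.SequenceFilesFromDirectory;
import org.apache.mahout.utils.SplitInput;
import org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles;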
private static final String WORK_DIR = "hdfs://192.168.9.72:9000/tmp/mahout-work-java-sh";
/*
* echo "Converting sequence files to vectors"
./bin/mahout seq2sparse \
-i ${WORK_DIR}/20news-seq \
-o ${WORK_DIR}/20news-vectors -lnorm -nv -wt tfidf
*
*/
public static void mahout_seq2sparse(){
    try {
        Configuration conf = new Configuration();
        conf.addResource(new Path("/usr/local/hadoop/conf/core-site.xml"));
        String input = WORK_DIR + Path.SEPARATOR + "20news-seq";
        String output = WORK_DIR + Path.SEPARATOR + "20news-vectors";
        Path in = new Path(input);
        Path out = new Path(output);
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(in)) {
            if (fs.exists(out)) {
                // the boolean argument requests a recursive delete
                fs.delete(out, true);
            }
            SparseVectorsFromSequenceFiles svfsf = new SparseVectorsFromSequenceFiles();
            String[] params = new String[]{"-i", input, "-o", output, "-lnorm", "-nv", "-wt", "tfidf"};
            ToolRunner.run(svfsf, params);
        }
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println("Failed to convert sequence files to vectors!");
        System.exit(2);
    }
}
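A caveat on error handling: ToolRunner.run returns the tool's exit code rather than throwing when the job fails, and the methods here discard that value, so a failed MapReduce job can go unnoticed. A stricter variant (just a sketch) would check the return value:

// ToolRunner.run returns the underlying tool's exit code; non-zero means failure.
int exitCode = ToolRunner.run(svfsf, params);
if (exitCode != 0) {
    throw new IllegalStateException("seq2sparse exited with code " + exitCode);
}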
/*
* echo "Creating sequence files from 20newsgroups data"
./bin/mahout seqdirectory \
-i ${WORK_DIR}/20news-all \
-o ${WORK_DIR}/20news-seq -ow
*/
public static void mahout_seqdirectory(){
    try {
        Configuration conf = new Configuration();
        conf.addResource(new Path("/usr/local/hadoop/conf/core-site.xml"));
        String input = WORK_DIR + Path.SEPARATOR + "20news-all";
        String output = WORK_DIR + Path.SEPARATOR + "20news-seq";
        Path in = new Path(input);
        Path out = new Path(output);
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(in)) {
            if (fs.exists(out)) {
                // the boolean argument requests a recursive delete
                fs.delete(out, true);
            }
            SequenceFilesFromDirectory sffd = new SequenceFilesFromDirectory();
            String[] params = new String[]{"-i", input, "-o", output};
            ToolRunner.run(sffd, params);
        }
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println("Failed to create sequence files!");
        System.exit(1);
    }
}
/*
* echo "Creating training and holdout set with a random 80-20 split of the generated vector dataset"
./bin/mahout split \
-i ${WORK_DIR}/20news-vectors/tfidf-vectors \
--trainingOutput ${WORK_DIR}/20news-train-vectors \
--testOutput ${WORK_DIR}/20news-test-vectors \
--randomSelectionPct 40 --overwrite --sequenceFiles -xm sequential
*/
public static void mahout_split(){
    try {
        Configuration conf = new Configuration();
        conf.addResource(new Path("/usr/local/hadoop/conf/core-site.xml"));
        String input = WORK_DIR + Path.SEPARATOR + "20news-vectors" + Path.SEPARATOR + "tfidf-vectors";
        String trainingOutput = WORK_DIR + Path.SEPARATOR + "20news-train-vectors";
        String testOutput = WORK_DIR + Path.SEPARATOR + "20news-test-vectors";
        Path in = new Path(input);
        Path trainOut = new Path(trainingOutput);
        Path testOut = new Path(testOutput);
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(in)) {
            if (fs.exists(trainOut)) {
                // the boolean argument requests a recursive delete
                fs.delete(trainOut, true);
            }
            if (fs.exists(testOut)) {
                // the boolean argument requests a recursive delete
                fs.delete(testOut, true);
            }
            SplitInput si = new SplitInput();
            String[] params = new String[]{"-i", input, "--trainingOutput", trainingOutput, "--testOutput", testOutput,
                    "--randomSelectionPct", "40", "--overwrite", "--sequenceFiles", "-xm", "sequential"};
            ToolRunner.run(si, params);
        }
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println("Failed to split data into training and test sets!");
        System.exit(3);
    }
}
/*
* echo "Training Naive Bayes model"
./bin/mahout trainnb \
-i ${WORK_DIR}/20news-train-vectors -el \
-o ${WORK_DIR}/model \
-li ${WORK_DIR}/labelindex \
-ow $c
*/
public static void mahout_trainnb(boolean completelyNB){
    try {
        Configuration conf = new Configuration();
        conf.addResource(new Path("/usr/local/hadoop/conf/core-site.xml"));
        String input = WORK_DIR + Path.SEPARATOR + "20news-train-vectors";
        String model = WORK_DIR + Path.SEPARATOR + "model";
        String labelindex = WORK_DIR + Path.SEPARATOR + "labelindex";
        Path in = new Path(input);
        Path out = new Path(model);
        Path label = new Path(labelindex);
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(in)) {
            if (fs.exists(out)) {
                // the boolean argument requests a recursive delete
                fs.delete(out, true);
            }
            if (fs.exists(label)) {
                // the boolean argument requests a recursive delete
                fs.delete(label, true);
            }
            TrainNaiveBayesJob tnbj = new TrainNaiveBayesJob();
            String[] params = null;
            if (completelyNB) {
                // -c trains a complementary naive Bayes model
                params = new String[]{"-i", input, "-el", "-o", model, "-li", labelindex, "-ow", "-c"};
            } else {
                params = new String[]{"-i", input, "-el", "-o", model, "-li", labelindex, "-ow"};
            }
            ToolRunner.run(tnbj, params);
        }
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println("Failed to train the model!");
        System.exit(4);
    }
}
/*
* echo "Testing on holdout set"
./bin/mahout testnb \
-i ${WORK_DIR}/20news-test-vectors\
-m ${WORK_DIR}/model \
-l ${WORK_DIR}/labelindex \
-ow -o ${WORK_DIR}/20news-testing $c
*/
public static void mahout_testnb(boolean completelyNB){
    try {
        Configuration conf = new Configuration();
        conf.addResource(new Path("/usr/local/hadoop/conf/core-site.xml"));
        String input = WORK_DIR + Path.SEPARATOR + "20news-test-vectors";
        String model = WORK_DIR + Path.SEPARATOR + "model";
        String labelindex = WORK_DIR + Path.SEPARATOR + "labelindex";
        String output = WORK_DIR + Path.SEPARATOR + "20news-testing";
        Path in = new Path(input);
        Path modelIn = new Path(model);
        Path labelIn = new Path(labelindex);
        Path out = new Path(output);
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(in) && fs.exists(modelIn) && fs.exists(labelIn)) {
            if (fs.exists(out)) {
                // the boolean argument requests a recursive delete
                fs.delete(out, true);
            }
            TestNaiveBayesDriver tnbd = new TestNaiveBayesDriver();
            String[] params = null;
            if (completelyNB) {
                // -c tests with the complementary naive Bayes classifier
                params = new String[]{"-i", input, "-m", model, "-l", labelindex, "-o", output, "-ow", "-c"};
            } else {
                params = new String[]{"-i", input, "-m", model, "-l", labelindex, "-o", output, "-ow"};
            }
            ToolRunner.run(tnbd, params);
        }
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println("Failed to test the model!");
        System.exit(5);
    }
}
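As a closing note, the exists-then-delete boilerplate repeats in every method above and could be factored into a small helper, for example (a hypothetical helper, not part of the original code):

// Hypothetical helper: recursively delete an HDFS path if it exists, so the
// Mahout job that follows can write its output cleanly.
private static void deleteIfExists(FileSystem fs, Path path) throws java.io.IOException {
    if (fs.exists(path)) {
        // the boolean argument requests a recursive delete
        fs.delete(path, true);
    }
}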