Learning the Bayes Algorithm in Mahout (Part 3)

This post records an attempt to process a large dataset with Hadoop, covering sequence-file conversion, vectorization, splitting into training and test sets, model training, and testing. It focuses on a Hadoop connection failure hit while converting a large amount of data to sequence files, and the temporary workaround of running that step directly on the server.


Originally this was just a matter of converting the Linux commands in classify-20newsgroups.sh into Java code; once you know the relevant utility classes, the implementation is straightforward. However, because the test dataset is very large, the very first step, converting the raw files into sequence files, failed: when hadoop-eclipse-plugin connects to Hadoop, the data volume is too large and the read fails. Embarrassingly, I still have not found a good fix for this bug, so for now I have to set it aside.

Sometimes, when no good solution comes to mind right away, it is better to skip the problem for the moment and test the other utility methods. The main-function code is as follows:

// Convert the raw files into sequence files
mahout_seqdirectory();
// Vectorize (TF-IDF)
mahout_seq2sparse();

// Split the data into a training set and a test set
mahout_split();
// Train the model; the boolean flag selects complementary Naive Bayes (the -c option)
mahout_trainnb(true);
// Test on the holdout set; the boolean flag selects complementary Naive Bayes (the -c option)
mahout_testnb(true);
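
The snippet above is only the body of main. As a minimal sketch (not part of the original post), the enclosing class and the imports the utility methods below rely on might look like the following; the class name BayesDemo is made up, and the Mahout package paths are assumptions based on the Mahout 0.x layout, so verify them against the version on your classpath.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.text.SequenceFilesFromDirectory;
import org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles;
import org.apache.mahout.utils.SplitInput;
import org.apache.mahout.classifier.naivebayes.training.TrainNaiveBayesJob;
import org.apache.mahout.classifier.naivebayes.test.TestNaiveBayesDriver;

public class BayesDemo {

    public static void main(String[] args) {
        // step order mirrors classify-20newsgroups.sh
        mahout_seqdirectory();
        mahout_seq2sparse();
        mahout_split();
        mahout_trainnb(true);
        mahout_testnb(true);
    }

    // the WORK_DIR constant and the utility methods shown below belong in this class
}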

Note: the first step (sequence-file conversion) may fail. If it does, run the corresponding command on Linux instead (the shell command is quoted in the comment above mahout_seqdirectory()) to generate the directory the next step needs.

Package the Java code as a jar and run it on the server.

The utility methods are implemented as follows:

private static final String WORK_DIR = "hdfs://192.168.9.72:9000/tmp/mahout-work-java-sh";

/*
 * echo "Converting sequence files to vectors"
 * ./bin/mahout seq2sparse \
 *   -i ${WORK_DIR}/20news-seq \
 *   -o ${WORK_DIR}/20news-vectors -lnorm -nv -wt tfidf
 */
public static void mahout_seq2sparse() {
    try {
        Configuration conf = new Configuration();
        conf.addResource(new Path("/usr/local/hadoop/conf/core-site.xml"));

        String input = WORK_DIR + Path.SEPARATOR + "20news-seq";
        String output = WORK_DIR + Path.SEPARATOR + "20news-vectors";

        Path in = new Path(input);
        Path out = new Path(output);

        FileSystem fs = FileSystem.get(conf);

        if (fs.exists(in)) {
            if (fs.exists(out)) {
                // the boolean argument means delete recursively
                fs.delete(out, true);
            }

            SparseVectorsFromSequenceFiles svfsf = new SparseVectorsFromSequenceFiles();
            String[] params = new String[]{"-i", input, "-o", output, "-lnorm", "-nv", "-wt", "tfidf"};
            ToolRunner.run(svfsf, params);
        }
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println("Converting sequence files to vectors failed!");
        System.exit(2);
    }
}
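
As a quick sanity check on the vectorization step, one could read back a few of the TF-IDF vectors that seq2sparse wrote. The sketch below is an assumption, not part of the original post: the part file name "part-r-00000" may differ, so in practice list the tfidf-vectors directory first, and SequenceFile.Reader / VectorWritable are used here as in Hadoop 1.x and Mahout 0.x.

// Minimal sketch: print the first few TF-IDF vectors produced by mahout_seq2sparse()
public static void dumpSomeVectors() throws Exception {
    Configuration conf = new Configuration();
    conf.addResource(new Path("/usr/local/hadoop/conf/core-site.xml"));
    FileSystem fs = FileSystem.get(conf);

    // assumed part file name; list the directory to find the real one
    Path vectors = new Path(WORK_DIR + Path.SEPARATOR + "20news-vectors"
            + Path.SEPARATOR + "tfidf-vectors" + Path.SEPARATOR + "part-r-00000");

    org.apache.hadoop.io.SequenceFile.Reader reader =
            new org.apache.hadoop.io.SequenceFile.Reader(fs, vectors, conf);
    org.apache.hadoop.io.Text key = new org.apache.hadoop.io.Text();
    org.apache.mahout.math.VectorWritable value = new org.apache.mahout.math.VectorWritable();

    int shown = 0;
    while (reader.next(key, value) && shown < 5) {
        // key is the document id, value wraps the sparse TF-IDF vector
        System.out.println(key + " -> " + value.get().getNumNondefaultElements() + " terms");
        shown++;
    }
    reader.close();
}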



/*
 * echo "Creating sequence files from 20newsgroups data"
 * ./bin/mahout seqdirectory \
 *   -i ${WORK_DIR}/20news-all \
 *   -o ${WORK_DIR}/20news-seq -ow
 */
public static void mahout_seqdirectory() {
    try {
        Configuration conf = new Configuration();
        conf.addResource(new Path("/usr/local/hadoop/conf/core-site.xml"));

        String input = WORK_DIR + Path.SEPARATOR + "20news-all";
        String output = WORK_DIR + Path.SEPARATOR + "20news-seq";

        Path in = new Path(input);
        Path out = new Path(output);

        FileSystem fs = FileSystem.get(conf);

        if (fs.exists(in)) {
            if (fs.exists(out)) {
                // the boolean argument means delete recursively
                fs.delete(out, true);
            }
            SequenceFilesFromDirectory sffd = new SequenceFilesFromDirectory();
            String[] params = new String[]{"-i", input, "-o", output};
            ToolRunner.run(sffd, params);
        }
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println("Creating sequence files failed!");
        System.exit(1);
    }
}




/*
 * echo "Creating training and holdout set with a random 80-20 split of the generated vector dataset"
 * ./bin/mahout split \
 *   -i ${WORK_DIR}/20news-vectors/tfidf-vectors \
 *   --trainingOutput ${WORK_DIR}/20news-train-vectors \
 *   --testOutput ${WORK_DIR}/20news-test-vectors \
 *   --randomSelectionPct 40 --overwrite --sequenceFiles -xm sequential
 */
public static void mahout_split() {
    try {
        Configuration conf = new Configuration();
        conf.addResource(new Path("/usr/local/hadoop/conf/core-site.xml"));

        String input = WORK_DIR + Path.SEPARATOR + "20news-vectors" + Path.SEPARATOR + "tfidf-vectors";
        String trainingOutput = WORK_DIR + Path.SEPARATOR + "20news-train-vectors";
        String testOutput = WORK_DIR + Path.SEPARATOR + "20news-test-vectors";

        Path in = new Path(input);
        Path trainOut = new Path(trainingOutput);
        Path testOut = new Path(testOutput);

        FileSystem fs = FileSystem.get(conf);

        if (fs.exists(in)) {
            if (fs.exists(trainOut)) {
                // the boolean argument means delete recursively
                fs.delete(trainOut, true);
            }

            if (fs.exists(testOut)) {
                // the boolean argument means delete recursively
                fs.delete(testOut, true);
            }

            SplitInput si = new SplitInput();
            String[] params = new String[]{"-i", input, "--trainingOutput", trainingOutput, "--testOutput", testOutput,
                    "--randomSelectionPct", "40", "--overwrite", "--sequenceFiles", "-xm", "sequential"};
            ToolRunner.run(si, params);
        }
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println("Splitting the data into training and test sets failed!");
        System.exit(3);
    }
}


/*
 * echo "Training Naive Bayes model"
 * ./bin/mahout trainnb \
 *   -i ${WORK_DIR}/20news-train-vectors -el \
 *   -o ${WORK_DIR}/model \
 *   -li ${WORK_DIR}/labelindex \
 *   -ow $c
 */
public static void mahout_trainnb(boolean completelyNB) {
    try {
        Configuration conf = new Configuration();
        conf.addResource(new Path("/usr/local/hadoop/conf/core-site.xml"));

        String input = WORK_DIR + Path.SEPARATOR + "20news-train-vectors";
        String model = WORK_DIR + Path.SEPARATOR + "model";
        String labelindex = WORK_DIR + Path.SEPARATOR + "labelindex";

        Path in = new Path(input);
        Path out = new Path(model);
        Path label = new Path(labelindex);

        FileSystem fs = FileSystem.get(conf);

        if (fs.exists(in)) {
            if (fs.exists(out)) {
                // the boolean argument means delete recursively
                fs.delete(out, true);
            }

            if (fs.exists(label)) {
                // the boolean argument means delete recursively
                fs.delete(label, true);
            }
            TrainNaiveBayesJob tnbj = new TrainNaiveBayesJob();
            String[] params = null;
            if (completelyNB) {
                // -c trains a complementary Naive Bayes model
                params = new String[]{"-i", input, "-el", "-o", model, "-li", labelindex, "-ow", "-c"};
            } else {
                params = new String[]{"-i", input, "-el", "-o", model, "-li", labelindex, "-ow"};
            }
            ToolRunner.run(tnbj, params);
        }
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println("Training the Naive Bayes model failed!");
        System.exit(3);
    }
}
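
Once the model directory and labelindex exist, the trained model can also be loaded back and applied to a single TF-IDF vector. The sketch below is an illustration added here, not part of the original post: it assumes the Mahout 0.x classes NaiveBayesModel, StandardNaiveBayesClassifier and Vector.maxValueIndex(), so check them against your Mahout version before relying on it.

// Minimal sketch: load the model written by mahout_trainnb() and score one vector;
// the returned index maps back to a label name through the labelindex file.
public static int classifyOne(org.apache.mahout.math.Vector tfidfVector) throws Exception {
    Configuration conf = new Configuration();
    conf.addResource(new Path("/usr/local/hadoop/conf/core-site.xml"));

    // materialize() reads the model directory produced by TrainNaiveBayesJob
    org.apache.mahout.classifier.naivebayes.NaiveBayesModel model =
            org.apache.mahout.classifier.naivebayes.NaiveBayesModel.materialize(
                    new Path(WORK_DIR + Path.SEPARATOR + "model"), conf);

    org.apache.mahout.classifier.naivebayes.StandardNaiveBayesClassifier classifier =
            new org.apache.mahout.classifier.naivebayes.StandardNaiveBayesClassifier(model);

    // classifyFull() returns one score per label; the largest score wins
    org.apache.mahout.math.Vector scores = classifier.classifyFull(tfidfVector);
    return scores.maxValueIndex();
}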


/*
 * echo "Testing on holdout set"
 * ./bin/mahout testnb \
 *   -i ${WORK_DIR}/20news-test-vectors \
 *   -m ${WORK_DIR}/model \
 *   -l ${WORK_DIR}/labelindex \
 *   -ow -o ${WORK_DIR}/20news-testing $c
 */
public static void mahout_testnb(boolean completelyNB) {
    try {
        Configuration conf = new Configuration();
        conf.addResource(new Path("/usr/local/hadoop/conf/core-site.xml"));

        String input = WORK_DIR + Path.SEPARATOR + "20news-test-vectors";
        String model = WORK_DIR + Path.SEPARATOR + "model";
        String labelindex = WORK_DIR + Path.SEPARATOR + "labelindex";
        String output = WORK_DIR + Path.SEPARATOR + "20news-testing";

        Path in = new Path(input);
        Path modelIn = new Path(model);
        Path labelIn = new Path(labelindex);
        Path out = new Path(output);

        FileSystem fs = FileSystem.get(conf);

        if (fs.exists(in) && fs.exists(modelIn) && fs.exists(labelIn)) {
            if (fs.exists(out)) {
                // the boolean argument means delete recursively
                fs.delete(out, true);
            }
            TestNaiveBayesDriver tnbd = new TestNaiveBayesDriver();
            String[] params = null;
            if (completelyNB) {
                // -c tests with the complementary Naive Bayes classifier
                params = new String[]{"-i", input, "-m", model, "-l", labelindex, "-o", output, "-ow", "-c"};
            } else {
                params = new String[]{"-i", input, "-m", model, "-l", labelindex, "-o", output, "-ow"};
            }
            ToolRunner.run(tnbd, params);
        }
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println("Testing on the holdout set failed!");
        System.exit(3);
    }
}
