Learning the Bayes Algorithm in Mahout (Part 3)

This post records an attempt to process a large dataset with Hadoop, covering sequence-file conversion, vectorization, splitting into training and test sets, model training, and testing. It focuses on a Hadoop connection failure hit while converting a large amount of data to sequence files, and the temporary workaround of running that step directly on the server.


Originally this was just a matter of converting the Linux commands in classify-20newsgroups.sh into Java code; once you know the relevant utility classes, the implementation is straightforward. However, because the test dataset is very large, the very first step, converting the raw files into sequence files, failed: when hadoop-eclipse-plugin connects to Hadoop, the data volume is too large and the read fails. Embarrassingly, I still have not found a good fix for this bug, so for now I have to set it aside.

Sometimes, when no good solution comes to mind right away, it is better to skip the problem for the moment and test the other utility methods. The main-function code is as follows:

// Convert the raw files into sequence files
mahout_seqdirectory();
// Vectorize (TF-IDF)
mahout_seq2sparse();

// Split the data into a training set and a test set
mahout_split();
// Train the model; the boolean flag selects complementary Naive Bayes (the -c option)
mahout_trainnb(true);
// Test on the holdout set; the boolean flag selects complementary Naive Bayes (the -c option)
mahout_testnb(true);
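
The snippet above is only the body of main. As a minimal sketch (not part of the original post), the enclosing class and the imports the utility methods below rely on might look like the following; the class name BayesDemo is made up, and the Mahout package paths are assumptions based on the Mahout 0.x layout, so verify them against the version on your classpath.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.text.SequenceFilesFromDirectory;
import org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles;
import org.apache.mahout.utils.SplitInput;
import org.apache.mahout.classifier.naivebayes.training.TrainNaiveBayesJob;
import org.apache.mahout.classifier.naivebayes.test.TestNaiveBayesDriver;

public class BayesDemo {

    public static void main(String[] args) {
        // step order mirrors classify-20newsgroups.sh
        mahout_seqdirectory();
        mahout_seq2sparse();
        mahout_split();
        mahout_trainnb(true);
        mahout_testnb(true);
    }

    // the WORK_DIR constant and the utility methods shown below belong in this class
}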

Note: the first step (sequence-file conversion) may fail. If it does, run the corresponding command on Linux instead (the shell command is quoted in the comment above mahout_seqdirectory()) to generate the directory the next step needs.

Package the Java code as a jar and run it on the server.

The utility methods are implemented as follows:

private static final String WORK_DIR = "hdfs://192.168.9.72:9000/tmp/mahout-work-java-sh";

/*
 * echo "Converting sequence files to vectors"
 * ./bin/mahout seq2sparse \
 *   -i ${WORK_DIR}/20news-seq \
 *   -o ${WORK_DIR}/20news-vectors -lnorm -nv -wt tfidf
 */
public static void mahout_seq2sparse() {
    try {
        Configuration conf = new Configuration();
        conf.addResource(new Path("/usr/local/hadoop/conf/core-site.xml"));

        String input = WORK_DIR + Path.SEPARATOR + "20news-seq";
        String output = WORK_DIR + Path.SEPARATOR + "20news-vectors";

        Path in = new Path(input);
        Path out = new Path(output);

        FileSystem fs = FileSystem.get(conf);

        if (fs.exists(in)) {
            if (fs.exists(out)) {
                // the boolean argument means delete recursively
                fs.delete(out, true);
            }

            SparseVectorsFromSequenceFiles svfsf = new SparseVectorsFromSequenceFiles();
            String[] params = new String[]{"-i", input, "-o", output, "-lnorm", "-nv", "-wt", "tfidf"};
            ToolRunner.run(svfsf, params);
        }
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println("Converting sequence files to vectors failed!");
        System.exit(2);
    }
}
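
As a quick sanity check on the vectorization step, one could read back a few of the TF-IDF vectors that seq2sparse wrote. The sketch below is an assumption, not part of the original post: the part file name "part-r-00000" may differ, so in practice list the tfidf-vectors directory first, and SequenceFile.Reader / VectorWritable are used here as in Hadoop 1.x and Mahout 0.x.

// Minimal sketch: print the first few TF-IDF vectors produced by mahout_seq2sparse()
public static void dumpSomeVectors() throws Exception {
    Configuration conf = new Configuration();
    conf.addResource(new Path("/usr/local/hadoop/conf/core-site.xml"));
    FileSystem fs = FileSystem.get(conf);

    // assumed part file name; list the directory to find the real one
    Path vectors = new Path(WORK_DIR + Path.SEPARATOR + "20news-vectors"
            + Path.SEPARATOR + "tfidf-vectors" + Path.SEPARATOR + "part-r-00000");

    org.apache.hadoop.io.SequenceFile.Reader reader =
            new org.apache.hadoop.io.SequenceFile.Reader(fs, vectors, conf);
    org.apache.hadoop.io.Text key = new org.apache.hadoop.io.Text();
    org.apache.mahout.math.VectorWritable value = new org.apache.mahout.math.VectorWritable();

    int shown = 0;
    while (reader.next(key, value) && shown < 5) {
        // key is the document id, value wraps the sparse TF-IDF vector
        System.out.println(key + " -> " + value.get().getNumNondefaultElements() + " terms");
        shown++;
    }
    reader.close();
}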



/*
 * echo "Creating sequence files from 20newsgroups data"
 * ./bin/mahout seqdirectory \
 *   -i ${WORK_DIR}/20news-all \
 *   -o ${WORK_DIR}/20news-seq -ow
 */
public static void mahout_seqdirectory() {
    try {
        Configuration conf = new Configuration();
        conf.addResource(new Path("/usr/local/hadoop/conf/core-site.xml"));

        String input = WORK_DIR + Path.SEPARATOR + "20news-all";
        String output = WORK_DIR + Path.SEPARATOR + "20news-seq";

        Path in = new Path(input);
        Path out = new Path(output);

        FileSystem fs = FileSystem.get(conf);

        if (fs.exists(in)) {
            if (fs.exists(out)) {
                // the boolean argument means delete recursively
                fs.delete(out, true);
            }
            SequenceFilesFromDirectory sffd = new SequenceFilesFromDirectory();
            String[] params = new String[]{"-i", input, "-o", output};
            ToolRunner.run(sffd, params);
        }
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println("Creating sequence files failed!");
        System.exit(1);
    }
}




/*
 * echo "Creating training and holdout set with a random 80-20 split of the generated vector dataset"
 * ./bin/mahout split \
 *   -i ${WORK_DIR}/20news-vectors/tfidf-vectors \
 *   --trainingOutput ${WORK_DIR}/20news-train-vectors \
 *   --testOutput ${WORK_DIR}/20news-test-vectors \
 *   --randomSelectionPct 40 --overwrite --sequenceFiles -xm sequential
 */
public static void mahout_split() {
    try {
        Configuration conf = new Configuration();
        conf.addResource(new Path("/usr/local/hadoop/conf/core-site.xml"));

        String input = WORK_DIR + Path.SEPARATOR + "20news-vectors" + Path.SEPARATOR + "tfidf-vectors";
        String trainingOutput = WORK_DIR + Path.SEPARATOR + "20news-train-vectors";
        String testOutput = WORK_DIR + Path.SEPARATOR + "20news-test-vectors";

        Path in = new Path(input);
        Path trainOut = new Path(trainingOutput);
        Path testOut = new Path(testOutput);

        FileSystem fs = FileSystem.get(conf);

        if (fs.exists(in)) {
            if (fs.exists(trainOut)) {
                // the boolean argument means delete recursively
                fs.delete(trainOut, true);
            }

            if (fs.exists(testOut)) {
                // the boolean argument means delete recursively
                fs.delete(testOut, true);
            }

            SplitInput si = new SplitInput();
            String[] params = new String[]{"-i", input, "--trainingOutput", trainingOutput, "--testOutput", testOutput,
                    "--randomSelectionPct", "40", "--overwrite", "--sequenceFiles", "-xm", "sequential"};
            ToolRunner.run(si, params);
        }
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println("Splitting the data into training and test sets failed!");
        System.exit(3);
    }
}


/*
 * echo "Training Naive Bayes model"
 * ./bin/mahout trainnb \
 *   -i ${WORK_DIR}/20news-train-vectors -el \
 *   -o ${WORK_DIR}/model \
 *   -li ${WORK_DIR}/labelindex \
 *   -ow $c
 */
public static void mahout_trainnb(boolean completelyNB) {
    try {
        Configuration conf = new Configuration();
        conf.addResource(new Path("/usr/local/hadoop/conf/core-site.xml"));

        String input = WORK_DIR + Path.SEPARATOR + "20news-train-vectors";
        String model = WORK_DIR + Path.SEPARATOR + "model";
        String labelindex = WORK_DIR + Path.SEPARATOR + "labelindex";

        Path in = new Path(input);
        Path out = new Path(model);
        Path label = new Path(labelindex);

        FileSystem fs = FileSystem.get(conf);

        if (fs.exists(in)) {
            if (fs.exists(out)) {
                // the boolean argument means delete recursively
                fs.delete(out, true);
            }

            if (fs.exists(label)) {
                // the boolean argument means delete recursively
                fs.delete(label, true);
            }
            TrainNaiveBayesJob tnbj = new TrainNaiveBayesJob();
            String[] params = null;
            if (completelyNB) {
                // -c trains a complementary Naive Bayes model
                params = new String[]{"-i", input, "-el", "-o", model, "-li", labelindex, "-ow", "-c"};
            } else {
                params = new String[]{"-i", input, "-el", "-o", model, "-li", labelindex, "-ow"};
            }
            ToolRunner.run(tnbj, params);
        }
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println("Training the Naive Bayes model failed!");
        System.exit(3);
    }
}
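
Once the model directory and labelindex exist, the trained model can also be loaded back and applied to a single TF-IDF vector. The sketch below is an illustration added here, not part of the original post: it assumes the Mahout 0.x classes NaiveBayesModel, StandardNaiveBayesClassifier and Vector.maxValueIndex(), so check them against your Mahout version before relying on it.

// Minimal sketch: load the model written by mahout_trainnb() and score one vector;
// the returned index maps back to a label name through the labelindex file.
public static int classifyOne(org.apache.mahout.math.Vector tfidfVector) throws Exception {
    Configuration conf = new Configuration();
    conf.addResource(new Path("/usr/local/hadoop/conf/core-site.xml"));

    // materialize() reads the model directory produced by TrainNaiveBayesJob
    org.apache.mahout.classifier.naivebayes.NaiveBayesModel model =
            org.apache.mahout.classifier.naivebayes.NaiveBayesModel.materialize(
                    new Path(WORK_DIR + Path.SEPARATOR + "model"), conf);

    org.apache.mahout.classifier.naivebayes.StandardNaiveBayesClassifier classifier =
            new org.apache.mahout.classifier.naivebayes.StandardNaiveBayesClassifier(model);

    // classifyFull() returns one score per label; the largest score wins
    org.apache.mahout.math.Vector scores = classifier.classifyFull(tfidfVector);
    return scores.maxValueIndex();
}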


/*
 * echo "Testing on holdout set"
 * ./bin/mahout testnb \
 *   -i ${WORK_DIR}/20news-test-vectors \
 *   -m ${WORK_DIR}/model \
 *   -l ${WORK_DIR}/labelindex \
 *   -ow -o ${WORK_DIR}/20news-testing $c
 */
public static void mahout_testnb(boolean completelyNB) {
    try {
        Configuration conf = new Configuration();
        conf.addResource(new Path("/usr/local/hadoop/conf/core-site.xml"));

        String input = WORK_DIR + Path.SEPARATOR + "20news-test-vectors";
        String model = WORK_DIR + Path.SEPARATOR + "model";
        String labelindex = WORK_DIR + Path.SEPARATOR + "labelindex";
        String output = WORK_DIR + Path.SEPARATOR + "20news-testing";

        Path in = new Path(input);
        Path modelIn = new Path(model);
        Path labelIn = new Path(labelindex);
        Path out = new Path(output);

        FileSystem fs = FileSystem.get(conf);

        if (fs.exists(in) && fs.exists(modelIn) && fs.exists(labelIn)) {
            if (fs.exists(out)) {
                // the boolean argument means delete recursively
                fs.delete(out, true);
            }
            TestNaiveBayesDriver tnbd = new TestNaiveBayesDriver();
            String[] params = null;
            if (completelyNB) {
                // -c tests with the complementary Naive Bayes classifier
                params = new String[]{"-i", input, "-m", model, "-l", labelindex, "-o", output, "-ow", "-c"};
            } else {
                params = new String[]{"-i", input, "-m", model, "-l", labelindex, "-o", output, "-ow"};
            }
            ToolRunner.run(tnbd, params);
        }
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println("Testing on the holdout set failed!");
        System.exit(3);
    }
}
