Mahout文本聚类学习之DocumentProcessor类

最新推荐文章于 2025-09-29 11:32:24 发布

转载最新推荐文章于 2025-09-29 11:32:24 发布 · 737 阅读

mahout 专栏收录该内容

11 篇文章

订阅专栏

本文介绍了一种基于MapReduce框架的高效分词方法，适用于大规模文本数据集的预处理，通过DocumentProcessor类实现自动化的分布式分词过程。

作为文本聚类实现的第一步，对语料进行分词是必须的，而DocumentProcessor类提供了一个基于MapReduce、可对大规模数据集进行分词的高效而灵活的实现。高效是因为它基于MapReduce分布式计算框架；灵活是因为它提供了可扩展的分词接口，可以支持多种语言的分词。

　　下面深入类的内部流程进行学习：DocumentProcessor类只提供了一个静态方法tokenizeDocuments()：

tokenizeDocuments(Path, Class<? extends Analyzer>, Path, Configuration);

　　第一个参数是输入文件的路径，也就是前一步生成的文档集的序列文件；第二个是继承了Lucene Analyzer抽象类的一个子类，用于分词功能的扩展；第三个是分词结果的输出路径；最后一个是Job所使用的Configuration对象。

 1 public static void tokenizedocuments(path input,
 2                                        class<? extends analyzer> analyzerclass,
 3                                        path output,
 4                                        configuration baseconf)
 5     throws ioexception, interruptedexception, classnotfoundexception {
 6     configuration conf = new configuration(baseconf);
 7     // this conf parameter needs to be set enable serialisation of conf values
 8     conf.set("io.serializations", "org.apache.hadoop.io.serializer.javaserialization,"
 9                                   + "org.apache.hadoop.io.serializer.writableserialization"); 
10     //对分词的类进行设置，到时会直接实例化分词类的对象
11     conf.set(analyzer_class, analyzerclass.getname());
12     job job = new job(conf);
13     job.setjobname("documentprocessor::documenttokenizer: input-folder: " + input);
14     job.setjarbyclass(documentprocessor.class);
15     //输出键值为text做为文档的唯一标识
16     job.setoutputkeyclass(text.class);
17     //stringtuple对象中有一个list<string>对象，可以理解为分词后将文档存储为词组的序列
18     job.setoutputvalueclass(stringtuple.class);
19     fileinputformat.setinputpaths(job, input);
20     fileoutputformat.setoutputpath(job, output);
21     //sequencefiletokenizermapper是分词核心类
22     job.setmapperclass(sequencefiletokenizermapper.class);
23     job.setinputformatclass(sequencefileinputformat.class);
24     job.setnumreducetasks(0);
25     job.setoutputformatclass(sequencefileoutputformat.class);
26     //运行job前删除已经存在的目录
27     hadooputil.delete(conf, output);
28     //将job提交到hadoop集群并等待其结束 
29     boolean succeeded = job.waitforcompletion(true);
30     if (!succeeded) 
31       throw new illegalstateexception("job failed!");
32 
33   }

下面对SequenceFileTokenizerMapper进行分析：

 1 public class sequencefiletokenizermapper extends mapper<text, text, text, stringtuple> {
 2 
 3   private analyzer analyzer;
 4 
 5   @override
 6   protected void map(text key, text value, context context) throws ioexception, interruptedexception {
 7       //调用分词提供的方法对value也就是文本正文的内容进行分词处理
 8     tokenstream stream = analyzer.reusabletokenstream(key.tostring(), new stringreader(value.tostring()));
 9     chartermattribute termatt = stream.addattribute(chartermattribute.class);
10     stringtuple document = new stringtuple();
11     stream.reset();
12     while (stream.incrementtoken()) {
13       if (termatt.length() > 0) {
14          //把词组加入stringtuple
15         document.add(new string(termatt.buffer(), 0, termatt.length()));
16       }
17     }
18     context.write(key, document);
19   }
20 
21   @override
22   protected void setup(context context) throws ioexception, interruptedexception {
23     super.setup(context);
24     //map任务开始时会从context中取出configuration对象，解析出分词组件的名称并生成对象的实例
25     analyzer = classutils.instantiateas(context.getconfiguration().get(documentprocessor.analyzer_class,
26                                                                        defaultanalyzer.class.getname()),
27                                         analyzer.class);
28   }
29 }