Storm 1.2.1 WordCount: reliable word count (part 2)

Source code download:

https://download.youkuaiyun.com/download/adam_zs/10344447

In ReportBolt.java, the tuple is failed whenever word.equals("f"); the point is to test the reliability of message processing.

Because the failed word tuple is anchored back to its spout tuple, Storm invokes SentenceSpout.fail() and the sentence "e f" is re-emitted; as a result, in the reported counts the word e shows up noticeably more often than the other words.

Run output:


package com.book.v3;

import java.util.Map;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Values;
import org.apache.storm.utils.Utils;

/**
 * @title: Data source spout <br/>
 * @author: wangzs <br/>
 * @date: March 18, 2018
 */
public class SentenceSpout extends BaseRichSpout {

	private ConcurrentHashMap<UUID, Values> pending;
	private SpoutOutputCollector spoutOutputCollector;
	private String[] sentences = { "a b", "c d", "e f" };

	@Override
	public void open(Map map, TopologyContext topologycontext, SpoutOutputCollector spoutoutputcollector) {
		this.spoutOutputCollector = spoutoutputcollector;
		this.pending = new ConcurrentHashMap<UUID, Values>();
	}

	@Override
	public void nextTuple() {
		// Every call emits all three sentences again; each emit carries a fresh
		// message id and is cached in pending until it is acked.
		for (String sentence : sentences) {
			Values values = new Values(sentence);
			UUID msgId = UUID.randomUUID();
			this.spoutOutputCollector.emit(values, msgId);
			this.pending.put(msgId, values);
			System.out.println("SentenceSpout==> " + values + " msgId=" + msgId);
		}
		Utils.sleep(1000);
	}

	@Override
	public void declareOutputFields(OutputFieldsDeclarer outputfieldsdeclarer) {
		outputfieldsdeclarer.declare(new Fields("sentence"));
	}

	@Override
	public void ack(Object msgId) {
		// The tuple tree completed successfully; drop it from the pending cache.
		System.out.println("#####[ack]###### msgId=" + msgId + " values=" + this.pending.get(msgId));
		this.pending.remove(msgId);
	}

	@Override
	public void fail(Object msgId) {
		// The tuple tree failed somewhere downstream; re-emit the cached values with the same id.
		System.out.println("#####[fail]###### msgId=" + msgId + " values=" + this.pending.get(msgId));
		this.spoutOutputCollector.emit(this.pending.get(msgId), msgId);
	}

}
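
Note that, as written above, fail() re-emits the cached tuple unconditionally, so the deliberately failed "e f" sentence is replayed forever. A common variation is to cap the number of replays per message id. Below is a minimal sketch of such a fail(); the retryCounts field and MAX_RETRIES constant are hypothetical additions for illustration, not part of the original code.

	private ConcurrentHashMap<Object, Integer> retryCounts = new ConcurrentHashMap<Object, Integer>();
	private static final int MAX_RETRIES = 3; // illustrative limit

	@Override
	public void fail(Object msgId) {
		Integer tries = retryCounts.get(msgId);
		if (tries == null) {
			tries = 0;
		}
		if (tries < MAX_RETRIES) {
			// Replay the cached tuple with the same message id and record the attempt.
			retryCounts.put(msgId, tries + 1);
			this.spoutOutputCollector.emit(this.pending.get(msgId), msgId);
		} else {
			// Give up on this tuple instead of replaying it forever.
			this.pending.remove(msgId);
			retryCounts.remove(msgId);
		}
	}
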
package com.book.v3;

import java.util.Map;

import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;

/**
 * @title: Sentence-splitting bolt <br/>
 * @author: wangzs <br/>
 * @date: March 18, 2018
 */
public class SplitSentenceBolt extends BaseRichBolt {
	private OutputCollector outputCollector;

	@Override
	public void execute(Tuple tuple) {
		String sentence = tuple.getStringByField("sentence");
		String[] words = sentence.split(" ");
		for (String word : words) {
			// Anchor each word tuple to the input tuple so failures propagate back to the spout.
			this.outputCollector.emit(tuple, new Values(word));
		}
		this.outputCollector.ack(tuple);
		System.out.println("SplitSentenceBolt==> " + sentence + " msgId=" + tuple.getMessageId());
	}

	@Override
	public void prepare(Map map, TopologyContext topologycontext, OutputCollector outputcollector) {
		this.outputCollector = outputcollector;
	}

	@Override
	public void declareOutputFields(OutputFieldsDeclarer outputfieldsdeclarer) {
		outputfieldsdeclarer.declare(new Fields("word"));
	}

}

package com.book.v3;

import java.util.HashMap;
import java.util.Map;

import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;

/**
 * @title: Word count bolt <br/>
 * @author: wangzs <br/>
 * @date: March 18, 2018
 */
public class WordCountBolt extends BaseRichBolt {
	private OutputCollector outputCollector;
	private HashMap<String, Integer> counts = null;

	@Override
	public void prepare(Map map, TopologyContext topologycontext, OutputCollector outputcollector) {
		this.outputCollector = outputcollector;
		this.counts = new HashMap<String, Integer>();
	}

	@Override
	public void execute(Tuple tuple) {
		String word = tuple.getStringByField("word");
		Integer count = counts.get(word);
		if (count == null) {
			count = 0;
		}
		count++;
		this.counts.put(word, count);
		this.outputCollector.emit(tuple, new Values(word, count));
		this.outputCollector.ack(tuple);
		System.out.println("WordCountBolt==> " + word + " msgId=" + tuple.getMessageId());
	}

	@Override
	public void declareOutputFields(OutputFieldsDeclarer outputfieldsdeclarer) {
		outputfieldsdeclarer.declare(new Fields("word", "count"));
	}

}

package com.book.v3;

import java.util.HashMap;
import java.util.Map;
import java.util.Set;

import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Tuple;

/**
 * @title: Reporting bolt <br/>
 * @author: wangzs <br/>
 * @date: March 18, 2018
 */
public class ReportBolt extends BaseRichBolt {
	private OutputCollector outputcollector;
	private HashMap<String, Integer> reportCounts = null;

	@Override
	public void prepare(Map map, TopologyContext topologycontext, OutputCollector outputcollector) {
		this.reportCounts = new HashMap<String, Integer>();
		this.outputcollector = outputcollector;
	}

	@Override
	public void execute(Tuple tuple) {
		String word = tuple.getStringByField("word");
		int count = tuple.getIntegerByField("count");

		if (word.equals("f")) {
			// Deliberately fail every tuple carrying the word "f" so that the
			// spout's fail() callback fires and the "e f" sentence is replayed.
			this.outputcollector.fail(tuple);
		} else {
			this.reportCounts.put(word, count);
			this.outputcollector.ack(tuple);
			System.out.println("ReportBolt==> " + word + " msgId=" + tuple.getMessageId());
		}

		System.out.println("~~~~~~~~~~~execute~~~~~~~~~~~");
		Set<Map.Entry<String, Integer>> entrySet = reportCounts.entrySet();
		for (Map.Entry<String, Integer> entry : entrySet) {
			System.out.println(entry);
		}
	}

	@Override
	public void declareOutputFields(OutputFieldsDeclarer outputfieldsdeclarer) {

	}

	/**
	 * Called before the bolt is shut down.
	 */
	@Override
	public void cleanup() {
		Set<Map.Entry<String, Integer>> entrySet = reportCounts.entrySet();
		System.out.println("---------- FINAL COUNTS -----------");
		for (Map.Entry<String, Integer> entry : entrySet) {
			System.out.println(entry);
		}
		System.out.println("-----------------------------------");
	}

}

package com.book.v3;

import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.StormSubmitter;
import org.apache.storm.generated.AlreadyAliveException;
import org.apache.storm.generated.AuthorizationException;
import org.apache.storm.generated.InvalidTopologyException;
import org.apache.storm.generated.StormTopology;
import org.apache.storm.topology.TopologyBuilder;
import org.apache.storm.tuple.Fields;
import org.apache.storm.utils.Utils;

/**
 * @title: Reliable word count topology <br/>
 * @author: wangzs <br/>
 * @date: March 18, 2018
 */
public class WordCountTopology {
	public static void main(String[] args) throws AlreadyAliveException, InvalidTopologyException, AuthorizationException {
		SentenceSpout sentenceSpout = new SentenceSpout();
		SplitSentenceBolt splitSentenceBolt = new SplitSentenceBolt();
		WordCountBolt wordCountBolt = new WordCountBolt();
		ReportBolt reportBolt = new ReportBolt();

		TopologyBuilder builder = new TopologyBuilder();
		builder.setSpout("sentenceSpout-1", sentenceSpout);
		builder.setBolt("splitSentenceBolt-1", splitSentenceBolt).shuffleGrouping("sentenceSpout-1");
		builder.setBolt("wordCountBolt-1", wordCountBolt).fieldsGrouping("splitSentenceBolt-1", new Fields("word"));
		builder.setBolt("reportBolt-1", reportBolt).globalGrouping("wordCountBolt-1");

		Config conf = new Config();
		conf.setDebug(false);
		if (args != null && args.length > 0) {
			conf.setNumWorkers(1);
			StormSubmitter.submitTopologyWithProgressBar(args[0], conf, builder.createTopology());
		} else {
			LocalCluster cluster = new LocalCluster();
			StormTopology topology = builder.createTopology();
			cluster.submitTopology("wordCountTopology-1", conf, topology);
			Utils.sleep(40000);
			cluster.killTopology("wordCountTopology-1");
			cluster.shutdown();
		}
	}
}
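
Two reliability-related settings are worth keeping in mind when running this topology: anchored tuples are only tracked if acker executors are enabled, and a tuple tree that is not fully acked within the message timeout is failed and replayed just like an explicit fail(). A minimal sketch of the corresponding Config calls (the concrete values are illustrative, not taken from the original post):

		Config conf = new Config();
		conf.setDebug(false);
		conf.setNumAckers(1);            // executors that track each spout tuple's tree
		conf.setMessageTimeoutSecs(30);  // trees not fully acked within 30s are failed and replayed

To submit the packaged topology to a cluster, the usual storm CLI form would be something like the following (the jar file name is an assumption):

storm jar storm-wordcount.jar com.book.v3.WordCountTopology wordCountTopology-1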

