统计文本中的单词出现的频率,其中文本内容如下:
创建项目
项目结构如下:
创建pom.xml,代码如下:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>cn.toto.strom</groupId>
<artifactId>wordCountStromDemo</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<!-- Storm runtime. For cluster deployment, uncomment <scope>provided</scope>
     below, because the cluster already ships storm-core on its classpath
     (bundling it causes the "multiple defaults.yaml" error shown later). -->
<dependency>
<groupId>org.apache.storm</groupId>
<artifactId>storm-core</artifactId>
<!--<scope>provided</scope>-->
<version>1.1.0</version>
</dependency>
<!-- Storm's Kafka spout integration (not used by this word-count demo,
     but included for follow-up examples). -->
<dependency>
<groupId>org.apache.storm</groupId>
<artifactId>storm-kafka</artifactId>
<version>1.1.0</version>
</dependency>
<!-- Redis client (not used by this demo). -->
<dependency>
<groupId>redis.clients</groupId>
<artifactId>jedis</artifactId>
<version>2.7.3</version>
</dependency>
<!-- Kafka broker client; its bundled ZooKeeper is excluded to avoid
     clashing with the ZooKeeper version storm-core pulls in. -->
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka_2.8.2</artifactId>
<version>0.8.1</version>
<exclusions>
<exclusion>
<groupId>org.apache.zookeeper</groupId>
<artifactId>zookeeper</artifactId>
</exclusion>
</exclusions>
</dependency>
</dependencies>
<build>
<plugins>
<!-- Builds the fat jar ("-jar-with-dependencies") submitted via `storm jar`. -->
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
<archive>
<manifest>
<!-- Main class of the jar; adjust the package name to match your own project. -->
<mainClass>cn.toto.strom.wordcount.StormTopologyDriver</mainClass>
</manifest>
</archive>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<!-- Bind the assembly to `mvn package` so the fat jar is built automatically. -->
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- Compile for Java 7. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>1.7</source>
<target>1.7</target>
</configuration>
</plugin>
</plugins>
</build>
</project>
注意其中的mainClass配置,根据自己的项目情况,包名要做相应的变化
使用spout读取数据,其中MyLocalFileSpout的代码如下:
package cn.toto.strom.wordcount;
import org.apache.commons.lang.StringUtils;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
* Created by maoxiangyi on 2016/8/16.
*/
/**
 * Spout that reads a local text file line by line and emits each
 * non-blank line as a single-field tuple named "juzi" (sentence).
 */
public class MyLocalFileSpout extends BaseRichSpout {
    private SpoutOutputCollector collector;
    private BufferedReader bufferedReader;

    /**
     * Lifecycle init, called once when the spout task starts.
     * Opens the input file and fails fast if it is missing, so the worker
     * is restarted by Storm instead of hitting a NullPointerException on
     * the first nextTuple() call (the original swallowed the exception and
     * left bufferedReader null).
     */
    public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
        this.collector = collector;
        try {
            this.bufferedReader = new BufferedReader(new FileReader(new File("/home/tuzq/software/stormInstallPath/workdir/aaa.txt")));
        } catch (FileNotFoundException e) {
            throw new RuntimeException("word-count input file not found", e);
        }
    }

    // Storm drives this method in a loop, conceptually:
    //   while (true) { this.nextTuple(); }
    // so each invocation emits at most one line.
    public void nextTuple() {
        try {
            String line = bufferedReader.readLine();
            // isNotBlank(null) is false, so EOF and blank lines are skipped.
            if (StringUtils.isNotBlank(line)) {
                List<Object> arrayList = new ArrayList<Object>();
                arrayList.add(line);
                collector.emit(arrayList);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        // Each tuple carries a single field named "juzi" (the raw line).
        declarer.declare(new Fields("juzi"));
    }
}
使用bolt对单词进行分割,MySplitBolt的代码如下:
package cn.toto.strom.wordcount;
import org.apache.storm.topology.BasicOutputCollector;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseBasicBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
/**
 * Bolt that splits each incoming sentence on single spaces and emits
 * one (word, 1) pair per token.
 */
public class MySplitBolt extends BaseBasicBolt {
    public void execute(Tuple input, BasicOutputCollector collector) {
        // Read the sentence out of the tuple by its declared field name.
        String sentence = (String) input.getValueByField("juzi");
        // Tokenize on a single space, exactly as declared upstream.
        String[] tokens = sentence.split(" ");
        for (int i = 0; i < tokens.length; i++) {
            // Values builds the (word, count) output list for us.
            collector.emit(new Values(tokens[i], 1));
        }
    }

    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        // Downstream bolts look these two fields up by name.
        declarer.declare(new Fields("word","num"));
    }
}
使用Bolt对单词进行统计,MyWordCountAndPrintBolt的代码如下:
package cn.toto.strom.wordcount;
import org.apache.storm.topology.BasicOutputCollector;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseBasicBolt;
import org.apache.storm.tuple.Tuple;
import java.util.HashMap;
import java.util.Map;
/**
* 代码说明
*
* @author tuzq
* @create 2017-06-20 16:50
*/
/**
 * Terminal bolt: keeps a running word count in task-local memory and
 * prints the whole tally after every update.
 */
public class MyWordCountAndPrintBolt extends BaseBasicBolt {
    // In-memory tally, local to this bolt task (not shared across tasks).
    private Map<String, Integer> wordCountMap = new HashMap<String, Integer>();

    public void execute(Tuple input, BasicOutputCollector collector) {
        String word = (String) input.getValueByField("word");
        Integer num = (Integer) input.getValueByField("num");
        // Treat an absent entry as zero, then fold in the incoming count.
        Integer current = wordCountMap.get(word);
        int base = (current == null) ? 0 : current.intValue();
        wordCountMap.put(word, base + num);
        // Print the full map so progress is visible in the worker log.
        System.out.println(wordCountMap);
    }

    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        // Terminal bolt: emits nothing, so there are no fields to declare.
    }
}
使用TopologyDriver串联spout和bolt进行运行,代码如下:
package cn.toto.strom.wordcount;/**
* Created by toto on 2017/6/20.
*/
import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.StormSubmitter;
import org.apache.storm.generated.StormTopology;
import org.apache.storm.topology.TopologyBuilder;
/**
* 代码说明
*
* @author tuzq
* @create 2017-06-20 16:57
*/
/**
 * Wires spout -> split bolt -> count bolt and runs the topology in
 * local (in-process) mode for development.
 */
public class StormTopologyDriver {
    /**
     * Builds and submits the word-count topology to an in-process LocalCluster.
     *
     * Note: the original declared "throws AlreadyAliveException,
     * InvalidTopologyException" — neither type was imported, so the
     * listing did not compile. "throws Exception" on main covers
     * everything Storm may raise and is harmless for an entry point.
     */
    public static void main(String[] args) throws Exception {
        //1、准备任务信息 — declare components and their parallelism hints.
        TopologyBuilder topologyBuilder = new TopologyBuilder();
        topologyBuilder.setSpout("mySpout", new MyLocalFileSpout(), 1);
        topologyBuilder.setBolt("bolt1", new MySplitBolt(), 4).shuffleGrouping("mySpout");
        topologyBuilder.setBolt("bolt2", new MyWordCountAndPrintBolt(), 2).shuffleGrouping("bolt1");

        //2、任务提交 — who runs it, and with what configuration.
        Config config = new Config();
        config.setNumWorkers(2);
        StormTopology stormTopology = topologyBuilder.createTopology();

        // 本地模式: runs inside this JVM, no cluster required.
        LocalCluster localCluster = new LocalCluster();
        localCluster.submitTopology("wordcount", config, stormTopology);

        // 集群模式 (disabled in this variant):
        //StormSubmitter.submitTopology("wordcount1", config, stormTopology);
    }
}
如果是集群模式运行,StormTopologyDriver的代码是:
package cn.toto.strom.wordcount;
import org.apache.storm.Config;
import org.apache.storm.StormSubmitter;
import org.apache.storm.generated.StormTopology;
import org.apache.storm.topology.TopologyBuilder;
/*
* @author tuzq
* @create 2017-06-20 16:57
*/
/**
 * Cluster-mode variant: builds the same word-count topology and submits
 * it to a running Storm cluster via StormSubmitter.
 */
public class StormTopologyDriver {
    /**
     * Builds and submits the topology under the name "wordcount1".
     *
     * Note: the original declared "throws AlreadyAliveException,
     * InvalidTopologyException" — neither type was imported, and in Storm
     * 1.1.0 StormSubmitter.submitTopology additionally throws
     * AuthorizationException, so the listing did not compile.
     * "throws Exception" covers all of them.
     */
    public static void main(String[] args) throws Exception {
        //1、准备任务信息
        TopologyBuilder topologyBuilder = new TopologyBuilder();
        //使用2个线程来运行 the spout.
        topologyBuilder.setSpout("mySpout", new MyLocalFileSpout(), 2);
        //使用4个线程来运行 the split bolt.
        topologyBuilder.setBolt("bolt1", new MySplitBolt(), 4).shuffleGrouping("mySpout");
        //使用2个线程来运行 the count bolt.
        topologyBuilder.setBolt("bolt2", new MyWordCountAndPrintBolt(), 2).shuffleGrouping("bolt1");

        //2、任务提交 — 2 worker processes across the cluster.
        Config config = new Config();
        config.setNumWorkers(2);
        StormTopology stormTopology = topologyBuilder.createTopology();

        // 本地模式 (disabled in this variant):
        //LocalCluster localCluster = new LocalCluster();
        //localCluster.submitTopology("wordcount", config, stormTopology);

        // 集群模式: submit to the running cluster.
        StormSubmitter.submitTopology("wordcount1", config, stormTopology);
    }
}
StormTopologyDriver 的代码说明:
1.上面有2个worker
2.spout的两个并行度平均分配在两个worker上。每个组件的task数量会被平均分配到worker
3.bolt1的4个并行度平均分配在两个worker上。
4.bolt2的2个并行度平均分配在两个worker上。
一般将并行度对应的每个运行实例叫做task。例如上面bolt1的并行度是4,就代表4个task;如果不显式指定并行度,Storm中组件的默认并行度是1,即1个task。
本地模式运行
可以直接右键Run运行,最终运行的结果如下:
集群模式运行
在IDEA中对maven项目打包:
由于集群环境下已经自带了storm-core-1.1.0.jar,所以在package之前,要修改pom文件中storm-core的依赖为如下形式(也就是说加上provided;如果是本地模式则需要注释掉这个scope):
<dependency>
<groupId>org.apache.storm</groupId>
<artifactId>storm-core</artifactId>
<scope>provided</scope>
<version>1.1.0</version>
</dependency>
如果不修改,将会报如下的错误:
Exception in thread "main" java.lang.ExceptionInInitializerError
at org.apache.storm.config$read_storm_config.invoke(config.clj:78)
at org.apache.storm.config$fn__908.invoke(config.clj:100)
at org.apache.storm.config__init.load(Unknown Source)
at org.apache.storm.config__init.<clinit>(Unknown Source)
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:348)
at clojure.lang.RT.classForName(RT.java:2154)
at clojure.lang.RT.classForName(RT.java:2163)
at clojure.lang.RT.loadClassForName(RT.java:2182)
at clojure.lang.RT.load(RT.java:436)
at clojure.lang.RT.load(RT.java:412)
at clojure.core$load$fn__5448.invoke(core.clj:5866)
at clojure.core$load.doInvoke(core.clj:5865)
at clojure.lang.RestFn.invoke(RestFn.java:408)
at clojure.core$load_one.invoke(core.clj:5671)
at clojure.core$load_lib$fn__5397.invoke(core.clj:5711)
at clojure.core$load_lib.doInvoke(core.clj:5710)
at clojure.lang.RestFn.applyTo(RestFn.java:142)
at clojure.core$apply.invoke(core.clj:632)
at clojure.core$load_libs.doInvoke(core.clj:5753)
at clojure.lang.RestFn.applyTo(RestFn.java:137)
at clojure.core$apply.invoke(core.clj:634)
at clojure.core$use.doInvoke(core.clj:5843)
at clojure.lang.RestFn.invoke(RestFn.java:408)
at org.apache.storm.command.config_value$loading__5340__auto____12276.invoke(config_value.clj:16)
at org.apache.storm.command.config_value__init.load(Unknown Source)
at org.apache.storm.command.config_value__init.<clinit>(Unknown Source)
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:348)
at clojure.lang.RT.classForName(RT.java:2154)
at clojure.lang.RT.classForName(RT.java:2163)
at clojure.lang.RT.loadClassForName(RT.java:2182)
at clojure.lang.RT.load(RT.java:436)
at clojure.lang.RT.load(RT.java:412)
at clojure.core$load$fn__5448.invoke(core.clj:5866)
at clojure.core$load.doInvoke(core.clj:5865)
at clojure.lang.RestFn.invoke(RestFn.java:408)
at clojure.lang.Var.invoke(Var.java:379)
at org.apache.storm.command.config_value.<clinit>(Unknown Source)
Caused by: java.lang.RuntimeException: java.io.IOException: Found multiple defaults.yaml resources. You're probably bundling the Storm jars with your topology jar. [jar:file:/home/tuzq/software/stormInstallPath/servers/apache-storm-1.1.0/lib/storm-core-1.1.0.jar!/defaults.yaml, jar:file:/home/tuzq/software/stormInstallPath/workdir/wordCountStromDemo-1.0-SNAPSHOT-jar-with-dependencies.jar!/defaults.yaml]
at org.apache.storm.utils.Utils.findAndReadConfigFile(Utils.java:383)
at org.apache.storm.utils.Utils.readDefaultConfig(Utils.java:427)
at org.apache.storm.utils.Utils.readStormConfig(Utils.java:463)
at org.apache.storm.utils.Utils.<clinit>(Utils.java:177)
... 39 more
Caused by: java.io.IOException: Found multiple defaults.yaml resources. You're probably bundling the Storm jars with your topology jar. [jar:file:/home/tuzq/software/stormInstallPath/servers/apache-storm-1.1.0/lib/storm-core-1.1.0.jar!/defaults.yaml, jar:file:/home/tuzq/software/stormInstallPath/workdir/wordCountStromDemo-1.0-SNAPSHOT-jar-with-dependencies.jar!/defaults.yaml]
at org.apache.storm.utils.Utils.getConfigFileInputStream(Utils.java:409)
at org.apache.storm.utils.Utils.findAndReadConfigFile(Utils.java:362)
... 42 more
注意:项目中引入的storm-core的jar包版本要和集群中的jar包版本保持一致。
如果本地部署和集群部署的storm-core版本不一样,还需要修改代码中的包名结构,否则将会报错
接着执行如下:
接着执行下图的:
进入项目目录,比如我的:
进入target目录:
红框中的jar是带有其它jar包依赖的jar,上面一个jar是不带依赖的jar,集群模式运行的时候使用wordCountStromDemo-1.0-SNAPSHOT-jar-with-dependencies.jar来运行
将wordCountStromDemo-1.0-SNAPSHOT-jar-with-dependencies.jar上传到:/home/tuzq/software/stormInstallPath/workdir,执行以下命令:
[root@hadoop1 workdir]# storm jar wordCountStromDemo-1.0-SNAPSHOT-jar-with-dependencies.jar cn.toto.strom.wordcount.StormTopologyDriver
命令说明:
表示通过storm运行wordCountStromDemo-1.0-SNAPSHOT-jar-with-dependencies.jar中的cn.toto.strom.wordcount.StormTopologyDriver
运行效果:
通过UI界面查看一下程序在哪儿运行:浏览器地址是http://hadoop1:8080/
点击进入,查看效果:
查看最后结果打印位置
点击UI界面中的Bolt2
进入日志目录,查看日志结果: