Last time, 小厨 wrote about reading data from HDFS with Storm's built-in support, but it worked quite poorly: a file could only be read once, and there was no way to control how many times it was read. (The original post illustrated this with a screenshot; figure omitted.)
So this time, 小厨 reads HDFS data with a second approach: a custom spout.
Requirements: 1. continuously read the contents of every file under a directory; 2. once the last file has been read, shut the whole topology down.
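Before wiring anything into Storm, it helps to confirm that the directory is actually reachable and listable from the client machine. Below is a minimal standalone sketch, assuming the same namenode address (hdfs://192.168.83.131:9000) and directory (/phoneStorm) that the spout further down uses; the class name ListHdfsDir is just illustrative.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
/**
 * Standalone sanity check: list the directory the spout will consume.
 */
public class ListHdfsDir {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Same namenode address as in GetHdfsSpout below
        conf.set("fs.defaultFS", "hdfs://192.168.83.131:9000");
        FileSystem hdfs = FileSystem.get(conf);
        // Print every entry under the directory the spout will read
        for (FileStatus s : hdfs.listStatus(new Path("/phoneStorm"))) {
            System.out.println(s.getPath() + " , length: " + s.getLen());
        }
    }
}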
Complete code 1: the program entry point
package com.bigdata.storm;
import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.StormSubmitter;
import org.apache.storm.generated.AuthorizationException;
import org.apache.storm.generated.InvalidTopologyException;
import org.apache.storm.generated.StormTopology;
import org.apache.storm.thrift.TException;
import org.apache.storm.topology.TopologyBuilder;
/**
 * Describes a topology object and submits it to the Storm cluster.
 * @author xiaxing
 *
 */
public class TopoSubmitterClient {
    public static void main(String[] args) throws TException, InvalidTopologyException, AuthorizationException {
        // First get a topology builder
        TopologyBuilder builder = new TopologyBuilder();
        // Register the topology's spout component:
        // arg 1 is the spout's id, arg 2 is the spout instance
        builder.setSpout("hdfs-spout", new GetHdfsSpout());
        // Register the topology's first bolt, and declare which component its stream comes from
        builder.setBolt("countBolt", new CountBolt()).shuffleGrouping("hdfs-spout");
        // Use the builder to generate a Topology object
        StormTopology phoneTopo = builder.createTopology();
        // Submit phoneTopo to run
        Config config = new Config();
        // Ask the Storm cluster to allocate 6 workers to this topology
        config.setNumWorkers(6);
        if (args.length > 0) {
            // Run on the cluster
            StormSubmitter.submitTopology("phonecount-topo", config, phoneTopo);
        } else {
            // Storm also supports a local simulated run for testing
            LocalCluster localCluster = new LocalCluster();
            localCluster.submitTopology("phonecount-topo", config, phoneTopo);
        }
    }
}
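To run this on a real cluster, package the classes into a jar and submit it with the storm client, along the lines of: storm jar phonecount.jar com.bigdata.storm.TopoSubmitterClient remote (the jar name and the trailing "remote" are placeholders; any argument at all selects the StormSubmitter branch above). Without an argument, the LocalCluster branch runs the topology in-process for local testing.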
Complete code 2: the data source, a spout that reads the HDFS files: GetHdfsSpout
package com.bigdata.storm;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Values;
import org.apache.storm.utils.Utils;
/**
 * A custom spout that reads files from HDFS.
 * @author xiaxing
 *
 */
public class GetHdfsSpout extends BaseRichSpout {
    private static final long serialVersionUID = 1L;
    // Keep the SpoutOutputCollector handed to open() in a field so nextTuple() can emit through it
    private SpoutOutputCollector collector;
    private Map conf;
    private TopologyContext context;
    // This path must match the location the earlier HdfsBolt wrote to
    private String filePath = "/phoneStorm";

    public void nextTuple() {
        readHdfsData(filePath);
    }

    public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
        this.collector = collector;
        this.conf = conf;
        this.context = context;
    }

    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        // One field per emitted line; CountBolt reads it back with getStringByField("line-word")
        declarer.declare(new Fields("line-word"));
    }
    private void readHdfsData(String dirPath) {
        System.out.println("Start fetching data");
        // Build the HDFS client configuration
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.83.131:9000");
        try {
            // Get a FileSystem handle and list everything under the directory
            FileSystem hdfs = FileSystem.get(conf);
            FileStatus[] status = hdfs.listStatus(new Path(dirPath));
            System.out.println("Number of entries under the directory: " + status.length);
            for (int i = 0; i < status.length; i++) {
                if (status[i].isFile()) {
                    // Test output, to tell the read rounds apart
                    System.out.println(">>>>>" + status[i].getPath()
                            + " , length: " + status[i].getLen()
                            + " , dir owner: " + status[i].getOwner());
                    System.out.println("*****************************");
                    Utils.sleep(5000);
                    // Open the current file; try-with-resources closes both streams
                    // even if reading fails partway through
                    try (FSDataInputStream dis = hdfs.open(status[i].getPath());
                         BufferedReader in = new BufferedReader(new InputStreamReader(dis, "UTF-8"))) {
                        String line;
                        while ((line = in.readLine()) != null) {
                            System.out.println("Fetched line: " + line);
                            this.collector.emit(new Values(line));
                            Utils.sleep(100);
                        }
                    }
                }
                if (i == status.length - 1) {
                    System.out.println("<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<");
                    System.out.println("********** Finished reading all " + status.length + " files under " + filePath + "; time to stop the whole topology ************");
                    System.out.println(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>");
                    // Crude but effective for this demo: kill the JVM to end the topology
                    System.exit(0);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
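One design note: System.exit(0) inside the spout kills the whole worker JVM. That is acceptable for this local demo, but heavy-handed on a real cluster. A softer variant (a sketch, not the code above) is to remember that the directory has been read in full and make nextTuple a no-op afterwards:
    // Hypothetical variant of GetHdfsSpout: set a flag instead of calling System.exit(0)
    private volatile boolean finished = false;

    public void nextTuple() {
        if (finished) {
            Utils.sleep(1000); // nothing left to read; don't busy-spin
            return;
        }
        readHdfsData(filePath); // would no longer call System.exit at the end
        finished = true;        // the whole directory has been read once
    }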
Complete code 3: counting how many times each phone number appears: CountBolt
package com.bigdata.storm;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
import java.util.HashMap;
import java.util.Map;
/**
 * The final bolt: keeps the running counts.
 */
public class CountBolt extends BaseRichBolt {
private Map<String, Integer> map = new HashMap<>();
private OutputCollector collector;
@Override
public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
this.collector = collector;
}
@Override
    public void execute(Tuple tuple) {
        String word = tuple.getStringByField("line-word");
        // Bump the running count for this phone number
        Integer num = map.get(word);
        if (num == null) {
            num = 1;
        } else {
            num++;
        }
        map.put(word, num);
        System.out.println("**************************");
        // Dump the full counts after every tuple (test output)
        for (Map.Entry<String, Integer> entry : map.entrySet()) {
            System.out.println(entry.getKey() + " appeared " + entry.getValue() + " time(s)");
        }
        collector.emit(new Values(word, num));
        // Ack the tuple; a no-op for untracked tuples, but good BaseRichBolt hygiene
        collector.ack(tuple);
    }
@Override
public void declareOutputFields(OutputFieldsDeclarer declarer) {
declarer.declare(new Fields("word", "num"));
}
}
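Note that CountBolt is the last component in this topology, so nothing consumes the (word, num) stream it emits; the emit and declareOutputFields are kept so that a further bolt (one that writes the results back out, say) could be attached later without touching this class.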
That's all of the code. Looking back on it, this controllable approach really is better than the built-in one. Think more, and you gain a lot.