模拟Storm词频统计程序

本文先用线程池和阻塞队列模拟Storm中Spout发送句子、Bolt切割单词并计数的处理流程,再给出一个基于Apache Storm的实时词频统计示例:通过自定义Spout读取本地文件,由Bolt完成单词切割和计数,从而实现对大量文本数据的实时处理。此方案适用于实时分析大量流式数据,如社交媒体监控、日志分析等场景。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

package storm;

import java.util.HashMap;
import java.util.Map;
import java.util.Random;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

/**
 * Single-JVM imitation of a Storm word-count pipeline.
 *
 * <p>A "spout" step puts random sentences on {@code sentenceQueue}, a "split" step
 * turns each sentence into lower-cased words on {@code wordQueue}, and a "count"
 * step tallies every word in {@link #counters}. The steps are driven by the worker
 * classes that {@link #main(String[])} submits to a thread pool.
 *
 * <p>{@link #counters} is a plain {@link HashMap}; it is only safe as long as a
 * single thread calls {@link #wordcounter(String)}, which is how {@code main}
 * wires things up.
 */
public class MyStorm {
    /** Sentences the simulated spout chooses from at random. */
    private static final String[] SENTENCES = new String[]{"the cow jumped over the moon",
            "an apple a day keeps the doctor away",
            "four score and seven years ago",
            "snow white and the seven dwarfs", "i am at two with nature"};

    /** Capacity of each hand-off queue between two pipeline steps. */
    private static final int QUEUE_CAPACITY = 50000;

    private final Random random = new Random();

    private BlockingQueue<String> sentenceQueue = new ArrayBlockingQueue<String>(QUEUE_CAPACITY);
    private BlockingQueue<String> wordQueue = new ArrayBlockingQueue<String>(QUEUE_CAPACITY);
    // Final result of the computation: key = word, value = number of occurrences.
    Map<String, Integer> counters = new HashMap<String, Integer>();

    /**
     * Emits one randomly chosen sentence onto the sentence queue, blocking while
     * the queue is full. If the calling thread is interrupted while waiting, the
     * sentence is dropped and the interrupt flag is restored so the caller can stop.
     */
    public void nextTuple() {
        String sentence = SENTENCES[random.nextInt(SENTENCES.length)];
        try {
            sentenceQueue.put(sentence);
            System.out.println("send sentence:" + sentence);
        } catch (InterruptedException e) {
            // Keep the interrupt visible to the driving loop instead of swallowing it.
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Splits a sentence on single spaces and forwards every non-empty, lower-cased
     * word to the word queue.
     *
     * <p>Words are enqueued with the blocking {@code put}: when the bounded queue is
     * full the splitter waits for the counter to catch up rather than failing with
     * {@link IllegalStateException} as {@code add} would. If the thread is
     * interrupted while waiting, the remaining words of this sentence are skipped
     * and the interrupt flag is restored.
     *
     * @param sentence the sentence to split; must not be {@code null}
     */
    public void split(String sentence) {
        System.out.println("resv sentence" + sentence);
        String[] words = sentence.split(" ");
        for (String word : words) {
            word = word.trim();
            if (word.isEmpty()) {
                continue;
            }
            word = word.toLowerCase();
            try {
                // Plays the role of collector.emit() in a real bolt.
                wordQueue.put(word);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                return;
            }
            System.out.println("split word:" + word);
        }
    }

    /**
     * Increments the counter of the given word and prints the whole counter map.
     *
     * @param word the word to count
     */
    public void wordcounter(String word) {
        Integer current = counters.get(word);
        counters.put(word, current == null ? 1 : current + 1);
        System.out.println("print map:" + counters);
    }

    /**
     * Starts the simulated topology: one sentence producer, one splitter and one
     * counter, each running on its own pool thread.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Thread pool that hosts the three pipeline steps.
        ExecutorService executorService = Executors.newFixedThreadPool(10);
        MyStorm myStorm = new MyStorm();
        // Sends sentences to sentenceQueue.
        executorService.submit(new MySpout(myStorm));
        // Takes a sentence and splits it into words.
        executorService.submit(new MyBoltSplit(myStorm));
        // Takes a word and counts it.
        executorService.submit(new MyBoltWordCount(myStorm));
        // No further tasks will be submitted; the running ones are not affected.
        executorService.shutdown();
    }

    /** @return the queue that carries sentences from the spout to the splitter */
    public BlockingQueue<String> getSentenceQueue() {
        return sentenceQueue;
    }

    /** @param sentenceQueue the queue to use for sentences from now on */
    public void setSentenceQueue(BlockingQueue<String> sentenceQueue) {
        this.sentenceQueue = sentenceQueue;
    }

    /** @return the queue that carries words from the splitter to the counter */
    public BlockingQueue<String> getWordQueue() {
        return wordQueue;
    }

    /** @param wordQueue the queue to use for words from now on */
    public void setWordQueue(BlockingQueue<String> wordQueue) {
        this.wordQueue = wordQueue;
    }
}

/**
 * Driver for the simulated spout: repeatedly asks {@link MyStorm#nextTuple()} for
 * a new sentence, pausing briefly between calls, the way the Storm framework
 * calls a spout's nextTuple method in a loop.
 *
 * <p>The loop ends when the executing thread is interrupted, so the task can be
 * cancelled.
 */
class MySpout extends Thread {

    /** Pause between two emitted sentences, in milliseconds. */
    private static final long EMIT_INTERVAL_MILLIS = 100L;

    private final MyStorm myStorm;

    /**
     * @param myStorm the pipeline whose sentence queue is fed by this driver
     */
    public MySpout(MyStorm myStorm) {
        this.myStorm = myStorm;
    }

    @Override
    public void run() {
        while (!Thread.currentThread().isInterrupted()) {
            myStorm.nextTuple();
            try {
                // sleep is static and always pauses the *executing* thread, so it is
                // called on the class rather than through "this".
                Thread.sleep(EMIT_INTERVAL_MILLIS);
            } catch (InterruptedException e) {
                // Restore the flag and stop emitting instead of looping forever.
                Thread.currentThread().interrupt();
                return;
            }
        }
    }
}

/**
 * Driver for the simulated counting bolt: takes words off the word queue one at
 * a time and hands each to {@link MyStorm#wordcounter(String)}.
 *
 * <p>The loop ends when the executing thread is interrupted; any other failure
 * while handling a word is printed and the loop continues with the next word.
 */
class MyBoltWordCount extends Thread {

    private final MyStorm myStorm;

    /**
     * @param myStorm the pipeline whose word queue is consumed by this driver
     */
    public MyBoltWordCount(MyStorm myStorm) {
        this.myStorm = myStorm;
    }

    @Override
    public void run() {
        while (!Thread.currentThread().isInterrupted()) {
            try {
                String word = (String) myStorm.getWordQueue().take();
                myStorm.wordcounter(word);
            } catch (InterruptedException e) {
                // Interruption is a request to stop, not an error to log and ignore.
                Thread.currentThread().interrupt();
                return;
            } catch (Exception e) {
                // Best effort: one bad element must not kill the worker.
                System.out.println(e);
            }
        }
    }
}

/**
 * Driver for the simulated split bolt: takes sentences off the sentence queue one
 * at a time and hands each to {@link MyStorm#split(String)}.
 *
 * <p>The loop ends when the executing thread is interrupted; any other failure
 * while handling a sentence is printed and the loop continues with the next one.
 */
class MyBoltSplit extends Thread {

    private final MyStorm myStorm;

    /**
     * @param myStorm the pipeline whose sentence queue is consumed by this driver
     */
    public MyBoltSplit(MyStorm myStorm) {
        this.myStorm = myStorm;
    }

    @Override
    public void run() {
        while (!Thread.currentThread().isInterrupted()) {
            try {
                String sentence = (String) myStorm.getSentenceQueue().take();
                myStorm.split(sentence);
            } catch (InterruptedException e) {
                // Interruption is a request to stop, not an error to log and ignore.
                Thread.currentThread().interrupt();
                return;
            } catch (Exception e) {
                // Best effort: one bad element must not kill the worker.
                System.out.println(e);
            }
        }
    }
}

package com.ljq.bigdata;

import org.apache.commons.io.FileUtils;
import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.TopologyBuilder;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;

import java.io.File;
import java.io.IOException;
import java.util.*;

/**
 * @author :ljq
 * @date :Created in 2019/12/24 15:15
 * @description :使用Storm完成词频统计功能
 * @creed :Talk is cheap,show me the code
 */
public class LocalWordCountStormTopology {
    public static class DataSourceSpout extends BaseRichSpout {
        private SpoutOutputCollector collector;

        public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
            this.collector = collector;
        }

        public void nextTuple() {
            Collection<File> files = FileUtils.listFiles(new File("C:\\Users\\asus\\Desktop\\test"), new String[]{"txt"}, true);
            for (File file : files) {
                try {
                    List<String> lines = FileUtils.readLines(file);
                    for (String line : lines) {
                        this.collector.emit(new Values(line));
                    }
                    FileUtils.moveFile(file, new File(file.getAbsolutePath() + System.currentTimeMillis()));
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }

        public void declareOutputFields(OutputFieldsDeclarer declarer) {
            declarer.declare(new Fields("line"));
        }
    }

    public static class SplitBolt extends BaseRichBolt {

        private OutputCollector collector;

        public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
            this.collector = collector;
        }

        public void execute(Tuple input) {
            String line = input.getStringByField("line");
            String[] words = line.split(" ");
            for (String word : words) {
                this.collector.emit(new Values(word));
            }
        }

        public void declareOutputFields(OutputFieldsDeclarer declarer) {
            declarer.declare(new Fields("word"));
        }
    }

    public static class CountBolt extends BaseRichBolt {

        public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
        }

        Map<String, Integer> map = new HashMap<String, Integer>();

        public void execute(Tuple input) {
            String word = input.getStringByField("word");
            Integer count = map.get(word);
            if (count == null) {
                count = 0;
            }
            count++;
            map.put(word, count);
            Set<Map.Entry<String, Integer>> entrySet = map.entrySet();
            for (Map.Entry<String, Integer> entry : entrySet) {
                System.out.println(entry);
            }
        }

        public void declareOutputFields(OutputFieldsDeclarer declarer) {
        }
    }

    public static void main(String[] args) {
        TopologyBuilder builder = new TopologyBuilder();
        builder.setSpout("DataSourceSpout", new DataSourceSpout());
        builder.setBolt("SplitBolt", new SplitBolt()).shuffleGrouping("DataSourceSpout");
        builder.setBolt("CountBolt", new CountBolt()).shuffleGrouping("SplitBolt");
        LocalCluster cluster = new LocalCluster();
        cluster.submitTopology("LocalWordCountStormTopology", new Config(), builder.createTopology());
    }
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值