JAVA--词频统计wordcount的实现

最新推荐文章于 2022-10-03 00:29:46 发布

原创最新推荐文章于 2022-10-03 00:29:46 发布 · 3.3k 阅读

9 ·

CC 4.0 BY-SA版权

JAVA整理专栏收录该内容

19 篇文章

订阅专栏

该博客介绍了如何使用Java实现词频统计功能，包括从文件中读取内容，以空格分隔英文词组，统计结果并存入MySQL和Redis数据库，存储时忽略大小写并按字典顺序排序。

词频统计需求：

1.要求统计出一个文件中的所有英文词组，以非英文字母为分隔符（这里以空格为例）。

2.要求统计结果在控制台输出，并将统计的结果存入mysql数据库和redis数据库中。

3.要求以Map键值对的方式进行存储，不区分大小写（可以先将内容全部转为大写，或者全部转为小写实现不区分大小写）

4.统计结果按字典序（单词的字母顺序）进行排序

词频统计：

package wordcount;

import redis.clients.jedis.Jedis;
import util.JDBCUnit;
import util.JedisPoolUtil;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;

/**
 * Counts case-insensitive word frequencies in a text file and publishes the
 * result to the console, to the Redis hash {@code wordcount} and to the MySQL
 * table {@code wordcount(words, counts)}.
 */
public class WordCount {

    /**
     * Reads {@code fileName}, counts every whitespace-separated token
     * (lower-cased), then prints the counts in dictionary order and stores
     * each one in Redis and MySQL.
     *
     * <p>I/O and SQL failures are reported with {@code printStackTrace()} and
     * do not propagate to the caller.
     *
     * @param fileName path of the text file to analyse
     */
    public void displayWordCount(String fileName) {
        try {
            TreeMap<String, Integer> counts = countWords(fileName);
            System.out.println("按字典序输出为：");
            storeAndPrint(counts);
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /**
     * Builds the word-to-count map for one file; the reader is always closed.
     *
     * @param fileName path of the text file to read
     * @return counts keyed by lower-cased word, sorted by key (TreeMap order)
     * @throws IOException if the file cannot be opened or read
     */
    private TreeMap<String, Integer> countWords(String fileName) throws IOException {
        TreeMap<String, Integer> counts = new TreeMap<String, Integer>();
        try (BufferedReader reader = new BufferedReader(new FileReader(fileName))) {
            String line;
            while ((line = reader.readLine()) != null) {
                // Locale.ROOT keeps the folding stable regardless of the JVM default
                // locale (e.g. Turkish would map "I" to a dotless i).
                String lowered = line.toLowerCase(java.util.Locale.ROOT);
                for (String token : lowered.split("[\\s]+")) {
                    String word = token.trim();
                    // Blank lines and leading whitespace make split() yield "",
                    // which is not a word and must not be counted.
                    if (word.isEmpty()) {
                        continue;
                    }
                    Integer seen = counts.get(word);
                    counts.put(word, seen == null ? 1 : seen + 1);
                }
            }
        }
        return counts;
    }

    /**
     * Writes every entry to Redis and MySQL and echoes it to the console.
     * One Jedis resource, one connection and one prepared statement are shared
     * by all entries and released in {@code finally}.
     *
     * @param counts word counts to publish, iterated in the map's own order
     * @throws SQLException if no connection is available or an insert fails
     */
    private void storeAndPrint(Map<String, Integer> counts) throws SQLException {
        if (counts.isEmpty()) {
            // Nothing to store: do not open any connection at all.
            return;
        }
        Jedis jedis = null;
        Connection conn = null;
        PreparedStatement pst = null;
        try {
            jedis = JedisPoolUtil.getJedis();
            conn = JDBCUnit.getConnection();
            if (conn == null) {
                // getConnection() reports its own failure and returns null.
                throw new SQLException("Could not obtain a MySQL connection");
            }
            pst = conn.prepareStatement("INSERT INTO wordcount(words,counts) VALUES(?,?)");
            for (Map.Entry<String, Integer> entry : counts.entrySet()) {
                String word = entry.getKey();
                String count = String.valueOf(entry.getValue());
                // Store in Redis.
                jedis.hset("wordcount", word, count);
                // Store in MySQL.
                pst.setString(1, word);
                pst.setString(2, count);
                pst.executeUpdate();
                // Echo to the console.
                System.out.println(word + "\t" + entry.getValue());
            }
        } finally {
            JDBCUnit.release(conn, pst);
            JedisPoolUtil.release(jedis);
        }
    }
}

JDBC连接工具类：

package util;

import java.sql.*;

/**
 * JDBC helper: loads the MySQL driver once, hands out connections and closes
 * JDBC resources.
 */
public class JDBCUnit {
    private static final String NAME = "root";
    private static final String PASSWORD = "zxc";
    private static final String URL = "jdbc:mysql://localhost:3306/maven-wordcount?useUnicode=true&characterEncoding=UTF-8";

    static {
        try {
            Class.forName("com.mysql.jdbc.Driver");
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }
    }

    /**
     * Opens a new connection using the configured URL, user and password.
     *
     * @return the connection, or {@code null} if it could not be opened
     *         (the failure is printed, not thrown)
     */
    public static Connection getConnection() {
        Connection connection = null;
        try {
            connection = DriverManager.getConnection(URL, NAME, PASSWORD);
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return connection;
    }

    /**
     * Closes a statement and its connection. Either argument may be {@code null}.
     *
     * @param connection database connection to close
     * @param statement  SQL statement to close
     */
    public static void release(Connection connection, Statement statement) {
        release(connection, statement, null);
    }

    /**
     * Closes a result set, its statement and its connection, in that order.
     * Any argument may be {@code null}. Each resource is closed independently,
     * so a failure while closing one of them never prevents the remaining
     * ones from being closed.
     *
     * @param connection database connection to close
     * @param statement  SQL statement to close
     * @param resultSet  result set to close
     */
    public static void release(Connection connection, Statement statement, ResultSet resultSet) {
        try {
            if (resultSet != null) {
                resultSet.close();
            }
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            try {
                if (statement != null) {
                    statement.close();
                }
            } catch (SQLException e) {
                e.printStackTrace();
            } finally {
                try {
                    if (connection != null) {
                        connection.close();
                    }
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}

Jedis连接工具类：

package util;

import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig;

import java.io.InputStream;
import java.util.Properties;

/**
 * Owns the shared {@link JedisPool}, configured from the classpath resource
 * {@code redisDB.properties} (keys {@code redis.host}, {@code redis.port},
 * {@code redis.MaxTotal}, {@code redis.MaxIdle}, {@code redis.MinIdle}).
 */
public class JedisPoolUtil {
    private static final JedisPool jedisPool;

    static {
        Properties properties = new Properties();
        // The class loader is only used to locate the file on the classpath;
        // the stream is closed as soon as the properties have been read.
        try (InputStream inputStream =
                     JedisPoolUtil.class.getClassLoader().getResourceAsStream("redisDB.properties")) {
            if (inputStream == null) {
                throw new IllegalStateException("配置文件读取失败！ redisDB.properties not found on the classpath");
            }
            properties.load(inputStream);
        } catch (java.io.IOException e) {
            // Fail here with the real cause instead of continuing with empty
            // properties and failing later while parsing a null value.
            throw new IllegalStateException("配置文件读取失败！", e);
        }

        String host = properties.getProperty("redis.host");
        int port = intProperty(properties, "redis.port");
        int maxTotal = intProperty(properties, "redis.MaxTotal");
        int maxIdle = intProperty(properties, "redis.MaxIdle");
        int minIdle = intProperty(properties, "redis.MinIdle");

        JedisPoolConfig jedisPoolConfig = new JedisPoolConfig();
        jedisPoolConfig.setMaxTotal(maxTotal);
        jedisPoolConfig.setMaxIdle(maxIdle);
        jedisPoolConfig.setMinIdle(minIdle);
        jedisPool = new JedisPool(jedisPoolConfig, host, port);
    }

    /**
     * Reads a mandatory integer property.
     *
     * @param properties loaded configuration
     * @param key        property name
     * @return the parsed value
     * @throws IllegalStateException if the key is missing or not an integer
     */
    private static int intProperty(Properties properties, String key) {
        String raw = properties.getProperty(key);
        if (raw == null) {
            throw new IllegalStateException("Missing property '" + key + "' in redisDB.properties");
        }
        try {
            return Integer.parseInt(raw.trim());
        } catch (NumberFormatException e) {
            throw new IllegalStateException(
                    "Property '" + key + "' in redisDB.properties is not an integer: " + raw, e);
        }
    }

    /**
     * Borrows a Jedis instance from the pool. Callers should hand it back
     * with {@link #release(Jedis)} when done.
     *
     * @return a pooled Jedis instance
     */
    public static Jedis getJedis() {
        return jedisPool.getResource();
    }

    /**
     * Closes the given Jedis instance; {@code null} is ignored.
     *
     * @param jedis instance obtained from {@link #getJedis()}, may be {@code null}
     */
    public static void release(Jedis jedis) {
        if (jedis != null) {
            jedis.close();
        }
    }
}

redisDB配置文件（properties文件）：

redis.port=6379
redis.host=192.168.59.150
redis.MaxTotal=10
redis.MaxIdle=8
redis.MinIdle=8

测试类：

package wordcount;

public class Main {
    public static void main(String[] args) {
        String line = "C:\\Users\\Super\\Desktop\\123.txt";
        //过滤掉首尾空字符串
        String fileName = line.trim();
        WordCount wc = new WordCount();
        wc.displayWordCount(fileName);
    }
}