Multiple Files: Word Frequency

本文展示了如何用 Java 遍历目录下的多个文本文件,过滤指定的单词,统计其余单词的出现频率,并将结果按频率降序写入结果文件。

Directory and File:

data2(dir)
    111(dir)
        file1.txt(file)
        file2.txt(file)
    222(dir)
    file3.txt(file)
filter(dir)
    filter.txt(file)
result(dir)
    result.txt(file)

File contents:

file1.txt
Free Shipping  Mini Car Auto12v  Fresh Air Purifier Oxygen Bar
file2.txt
freeshipping new  Electromagnetic parking sensor no holes need to be drilled
file3.txt
DC 12V 1 to 3 Car Cigarette Lighter Socket Power Adapter Splitter with 1 USB Port  free shipping  #9622 [aaa bbb] ccc{ ddd}

Filter contents:

filter.txt
sensor            
bbb            
lighter              
auto12v              
usb             
oxygen             
ddd             
parking              
cigarette             
port 
1
free

Java Code:

WordProcess.java
/**
 * Created with IntelliJ IDEA.
 * User: 1O1O
 * Date: 2015-04-01
 * Time: 14:31 PM
 * :)~
 * MULTIPLE-FILES-WORD-FREQUENCY:WORD-FREQUENCY
 */
package com.algorithms.frequency.word;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

/**
 * Counts how often each word occurs across all files below a data folder,
 * ignoring the words listed in a filter file, and writes the result sorted by
 * descending frequency.
 *
 * <p>All state is kept in static fields, so the counts accumulate over every
 * call to {@link #pruneFileText(String)} / {@link #setFrequency(String[])}
 * made during the lifetime of the JVM.
 */
public class WordProcess {

    // NOTE: these locations are specific to the author's machine; adjust them
    // before running (e.g. "D://robot//TEMP//testData//..." on Windows).
    private static final String DATA_FOLDER_PATH = "//Users//robot//TEMP//testData//data2";
    private static final String FILTER_WORDS_FILE_PATH = "//Users//robot//TEMP//testData//filter//filter.txt";
    public static final String RESULTS_FILE_PATH = "//Users//robot//TEMP//testData//result//result.txt";

    /** Word -> number of occurrences seen so far. */
    private static final Map<String, Integer> dataHash = new HashMap<String, Integer>();
    /** Lower-cased words that must never be counted. */
    private static final Set<String> filterWordsSet = new HashSet<String>();

    /**
     * Loads the filter words, counts the words of every file below the data
     * folder and writes the sorted frequencies to the result file.
     *
     * @param args unused
     * @throws Exception declared for compatibility; the visible helpers report
     *                   I/O problems themselves instead of throwing
     */
    public static void main(String[] args) throws Exception {
        loadFilterWords();
        FileProcess.readFolder(DATA_FOLDER_PATH);
        List<Map.Entry<String, Integer>> dataList = hashSort();
        FileProcess.writeFile(dataList);
    }

    /**
     * Reads one text file, lower-cases it, replaces punctuation with blanks,
     * splits it into words and adds those words to the frequency table.
     *
     * @param textPath path of the text file to process
     */
    public static void pruneFileText(String textPath) {
        // Lower-case with a fixed locale so the result does not depend on the
        // JVM's default locale (e.g. Turkish dotless i).
        String text = FileProcess.readFile(textPath).toLowerCase(Locale.ROOT);
        // Note: '[' and ']' have to be escaped with \\ inside the character class.
        text = text.replaceAll("[`~!@#$%^&*()+=|{}':;',//\\[//\\].<>/?~!@#¥%……&*()——+|{}【】‘;:”“’。,、?]|\\s+|\t|\r", " ");
        // Split on whitespace to obtain the individual words.
        String[] words = text.split("\\s+");
        setFrequency(words);
    }

    /**
     * Adds the given words to the frequency table.
     *
     * <p>Null and empty tokens (such as the leading empty string produced by
     * splitting text that starts with whitespace) and words contained in the
     * filter set are skipped.
     *
     * @param words tokens to count; {@code null} is treated as no tokens
     */
    public static void setFrequency(String[] words) {
        if (words == null) {
            return;
        }
        for (String word : words) {
            if (word == null || word.isEmpty() || filterWordsSet.contains(word)) {
                continue;
            }
            // First occurrence starts at 1, later occurrences increment.
            Integer previous = dataHash.get(word);
            int updated = (previous == null) ? 1 : previous.intValue() + 1;
            dataHash.put(word, Integer.valueOf(updated));
        }
    }

    /**
     * Returns the counted words ordered by descending frequency.
     *
     * <p>Words with the same frequency are ordered alphabetically so that the
     * result is deterministic and the comparator honours the
     * {@link Comparator} contract (it returns 0 only for the same word).
     *
     * @return a new list holding the entries of the frequency table
     */
    public static List<Map.Entry<String, Integer>> hashSort() {
        List<Map.Entry<String, Integer>> entries =
                new ArrayList<Map.Entry<String, Integer>>(dataHash.entrySet());
        Collections.sort(entries, new Comparator<Map.Entry<String, Integer>>() {
            @Override
            public int compare(Map.Entry<String, Integer> left, Map.Entry<String, Integer> right) {
                // Higher frequency first.
                int byFrequency = right.getValue().compareTo(left.getValue());
                if (byFrequency != 0) {
                    return byFrequency;
                }
                // Tie-break on the word itself.
                return left.getKey().compareTo(right.getKey());
            }
        });
        return entries;
    }

    /**
     * Reads the filter file and stores its words, lower-cased, in the filter
     * set. Prints the number of words that were read; empty tokens are not
     * counted.
     */
    public static void loadFilterWords() {
        String filterWordsText = FileProcess.readFile(FILTER_WORDS_FILE_PATH);
        // Split on any whitespace to obtain the individual filter words.
        String[] tokens = filterWordsText.split("\\s+|\\t|\\r|\\n");
        int loaded = 0;
        for (String token : tokens) {
            if (token.isEmpty()) {
                // readFile() prefixes every line with a blank, so the first
                // token of the split is always empty.
                continue;
            }
            // The data text is lower-cased before counting, so the filter
            // words must be lower-cased as well to match.
            filterWordsSet.add(token.toLowerCase(Locale.ROOT));
            loaded++;
        }
        System.out.println("Number of filter words: " + loaded);
    }
}
FileProcess.java
/**
 * Created with IntelliJ IDEA.
 * User: 1O1O
 * Date: 2015-04-01
 * Time: 14:31 PM
 * :)~
 * MULTIPLE-FILES-WORD-FREQUENCY:WORD-FREQUENCY
 */
package com.algorithms.frequency.word;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Date;
import java.text.SimpleDateFormat;

/**
 * File helpers for the word-frequency program: walks the data folder, reads
 * text files and writes the frequency report.
 */
public class FileProcess {

    /**
     * Visits every file below {@code path} (breadth-first over the
     * sub-directories) and passes each one to
     * {@link WordProcess#pruneFileText(String)}. Prints each file that is
     * processed and, at the end, the number of sub-directories and files found.
     *
     * <p>A directory whose content cannot be listed ({@link File#listFiles()}
     * returns {@code null}, which also happens when {@code path} names a
     * regular file) is reported on standard error and skipped.
     *
     * @param path root folder to scan
     */
    public static void readFolder(String path) {
        int fileNum = 0, folderNum = 0;
        File root = new File(path);
        if (root.exists()) {
            // Directories still to be listed; the root itself is not counted
            // as a folder.
            LinkedList<File> pending = new LinkedList<File>();
            pending.add(root);
            while (!pending.isEmpty()) {
                File directory = pending.removeFirst();
                File[] children = directory.listFiles();
                if (children == null) {
                    System.err.println("Cannot list directory: " + directory.getAbsolutePath());
                    continue;
                }
                for (File child : children) {
                    if (child.isDirectory()) {
                        pending.add(child);
                        folderNum++;
                    } else {
                        System.out.println("FILE: " + child.getAbsolutePath());
                        WordProcess.pruneFileText(child.getAbsolutePath());
                        fileNum++;
                    }
                }
            }
        } else {
            System.out.println("The path of DATA FOLDER is not exist!");
        }
        System.out.println("Number of dir is: " + folderNum + "\nNumber of file is: "+ fileNum);
    }

    /**
     * Reads a text file and returns its lines joined into one string, each
     * line preceded by a single blank.
     *
     * <p>The file is decoded with the platform default charset
     * ({@link FileReader}). If reading fails, the stack trace is printed and
     * whatever was read up to that point (possibly the empty string) is
     * returned.
     *
     * @param filePath path of the file to read
     * @return the file content with line breaks replaced by blanks
     */
    public static String readFile(String filePath) {
        StringBuilder result = new StringBuilder();
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(new File(filePath)));
            String line;
            while ((line = reader.readLine()) != null) {
                result.append(" ");
                result.append(line);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Single place where the reader is closed.
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException closeFailure) {
                    System.err.println("Failed to close " + filePath + ": " + closeFailure.getMessage());
                }
            }
        }
        return result.toString();
    }

    /**
     * Writes the word/frequency table to
     * {@link WordProcess#RESULTS_FILE_PATH}, replacing any existing file.
     * A timestamp banner is printed to the console and written as the first
     * line of the file.
     *
     * @param dataList entries to write, in the order they should appear
     */
    public static void writeFile(List<Map.Entry<String, Integer>> dataList) {
        System.out.println("Start: write word and frequency");
        PrintWriter pw = null;
        try {
            // Second FileWriter argument: true appends, false overwrites.
            File file = new File(WordProcess.RESULTS_FILE_PATH);
            pw = new PrintWriter(new FileWriter(file, false));

            // Format the current time once so console and file show the same banner.
            SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
            String banner = "========================" + df.format(new Date()) + "=========================";
            System.out.println(banner);
            pw.println(banner);

            pw.println(String.format("%-15s", "word number") + String.format("%-30s", "word") + String.format("%-15s","frequency"));
            int number = 1;
            for (Map.Entry<String, Integer> entry : dataList) {
                pw.print(String.format("%-15d", number++));
                pw.print(String.format("%-30s", entry.getKey()));
                pw.print(String.format("%-15d", entry.getValue()));
                pw.println();
            }
            // PrintWriter never throws on write errors; checkError() flushes
            // and reports whether one occurred.
            if (pw.checkError()) {
                System.err.println("Failed to write " + WordProcess.RESULTS_FILE_PATH);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // pw stays null when the file could not be opened; closing pw also
            // closes the underlying FileWriter.
            if (pw != null) {
                pw.close();
            }
            System.out.println("End: write word and frequency");
        }
    }
}

Outputs (from a sample run; words with equal frequency may appear in a different order):

Number of filter words: 13
FILE: /Users/robot/TEMP/testData/data2/file3.txt
FILE: /Users/robot/TEMP/testData/data2/111/file1.txt
FILE: /Users/robot/TEMP/testData/data2/111/file2.txt
Number of dir is: 2
Number of file is: 3
Start: write word and frequency
========================2015-04-01 16:05:19=========================
End: write word and frequency
Output in result.txt:
========================2015-04-01 16:05:19=========================
word number    word                          frequency      
1              shipping                      2              
2              to                            2              
3              car                           2              
4              freeshipping                  1              
5              bar                           1              
6              9622                          1              
7              air                           1              
8              new                           1              
9              3                             1              
10             splitter                      1              
11             purifier                      1              
12             adapter                       1              
13             12v                           1              
14             aaa                           1              
15             with                          1              
16             dc                            1              
17             be                            1              
18             holes                         1              
19             mini                          1              
20             ccc                           1              
21             socket                        1              
22             power                         1              
23             drilled                       1              
24             electromagnetic               1              
25             fresh                         1              
26             no                            1              
27             need                          1  

Add three more filter words (shipping, no, need) to filter.txt:

filter.txt
sensor            
bbb            
lighter              
auto12v              
usb             
oxygen             
ddd             
parking              
cigarette             
port 
1
free

shipping
no
need
Then the output changes to:
Number of filter words: 16
FILE: /Users/robot/TEMP/testData/data2/file3.txt
FILE: /Users/robot/TEMP/testData/data2/111/file1.txt
FILE: /Users/robot/TEMP/testData/data2/111/file2.txt
Number of dir is: 2
Number of file is: 3
Start: write word and frequency
========================2015-04-01 16:21:30=========================
End: write word and frequency
Output in result.txt:
========================2015-04-01 16:21:30=========================
word number    word                          frequency      
1              car                           2              
2              to                            2              
3              freeshipping                  1              
4              be                            1              
5              mini                          1              
6              holes                         1              
7              bar                           1              
8              9622                          1              
9              ccc                           1              
10             air                           1              
11             power                         1              
12             socket                        1              
13             drilled                       1              
14             fresh                         1              
15             electromagnetic               1              
16             new                           1              
17             3                             1              
18             splitter                      1              
19             adapter                       1              
20             purifier                      1              
21             12v                           1              
22             aaa                           1              
23             with                          1              
24             dc                            1              
In this task, you will design and implement a user-driven vocabulary management system that supports interaction from different types of users (e.g., readers and admins). This task simulates a real-world collaborative environment, where access control and data integrity are critical. 🧩 Task Objective You are required to implement the following two classes: Role: defines the role of a user, including their name, access level, and identity information. RoleBasedVocabSys: manages user login, menu display, command execution, and interaction with the TextProcessor object. The program should simulate a terminal-like experience for different types of users, controlling what they can see and what actions they can perform. 👥 User Roles The system supports two user roles: Reader: Can log in and view the vocabulary. Can view the 10 most frequent and 10 least frequent words. Cannot update or modify any part of the vocabulary. Check the examples below for reference. Admin: Has all the permissions of the reader. Can update the vocabulary by adding new files or removing existing ones. Has full access to the vocabulary update methods from TextProcessor. User credentials and access roles are provided as the variable users_info from the util.py module in the scaffold. 📋 Task Requirements You must: Implement the Role class, which should: Store and return the user’s username, display name, and access level (e.g., "reader", "admin"). Provide getter methods: get_user_name(), get_access(), get_name(). Implement the RoleBasedVocabSys class, which should: Handle login and logout. Display different menus depending on whether a user is logged in and their access level. Call TextProcessor functions (from Task 7) to manage the vocabulary. Enforce role-based access control (e.g., only admins can update vocabularies). Use the provided attributes and method names in the scaffold. Do not rename or remove any predefined code blocks. 
Implement menu-based navigation where users can choose options via standard input: Exit the system. Login or Logout. View the top 10 or bottom 10 frequent words. Update vocabulary by adding/removing files (admin only). You may write additional helper functions or methods as needed. 🧠 Additional Notes The vocabulary is loaded and managed via the TextProcessor object created in the constructor. The files to be added/removed are fixed as data/for_admin/excluded.csv data/add.csv and data/delete.csv for this exercise, but you may generalize it in future tasks. All user input should be validated using verify_user_choice. The system should loop until the user chooses to exit. Examples Example 1a: interface when starting the program Welcome to the Mark system v0.0! Please Login: 1.Exit 2.Login Enter your choice: Example 1b: unlimited attempts for invalid user inputs Welcome to the Mark system v0.0! Please Login: 1.Exit 2.Login Enter your choice: ewfwef Enter your choice: edf Enter your choice: 3 Enter your choice: 4 ... Enter your choice: Example 2a: correct login credential for reader Welcome to the Mark system v0.0! Please Login: 1.Exit 2.Login Please key your account name: Jueqing Please key your password: jueqing123 Welcome Jueqing Lu Please choose one option below: 1.Exit 2.Logout/Re-Login 3.Show top 10 frequency vocabularies 4.Show last 10 frequency vocabularies Enter your choice: Example 2b: correct login credential for admin Welcome to the Mark system v0.0! Please Login: 1.Exit 2.Login Please key your account name: Trang Please key your password: trang123 Welcome Trang Vu Please choose one option below: 1.Exit 2.Logout/Re-Login 3.Show top 10 frequency vocabularies 4.Show last 10 frequency vocabularies 5.Updating Vobulary for adding 6.Updating Vobulary for excluding Enter your choice:
09-24
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值