《数据结构与算法》课程设计报告：基于不同策略的英文单词词频统计与检索系统

1. 实验目的

本课程设计旨在通过实践验证数据结构与算法课程所学理论，加深对理论知识的理解，并通过实验提高实际动手能力。具体目标包括：

掌握数据分析方法：确立合适的数据结构，设计算法解决问题
培养程序设计能力：提高数据组织和使用的实践能力
培养数据结构思维：养成良好的程序开发习惯和技巧

2. 实验内容与核心类设计详解

实现一个英文单词词频统计系统，具体要求：

从文本文件(InFile.txt)读取英文文章，过滤标点符号
采用5种不同检索策略构建存储结构：
- 基于顺序表的顺序查找
- 基于链表的顺序查找
- 折半查找
- 二叉排序树查找
- 开放地址法哈希查找
实现功能：
- 词频统计：首次出现的单词词频设为1，已存在的单词词频+1
- 结果输出：按字典序写入不同文件(OutFile1-4.txt)
- 单词检索：输入单词查找其词频，并输出平均查找长度(ASL)和查找时间

2.1 word_count类 - 单词与词频存储

// A single dictionary entry: one word together with the number of times it has been counted.
class word_count {
public:
    // Creates an entry for `w`. The frequency always starts at 1, including for the
    // default empty word.
    word_count(const string& w = "") : word(w), count(1) {}

    // Returns a copy of the stored word.
    string get() const {
        return word;
    }

    // Returns the current frequency.
    int getCount() const {
        return count;
    }

    // Stores `w` as the new word and restarts its frequency at 1.
    void setcount(const string& w) {
        word = w;
        count = 1;
    }

    // Records one more occurrence of the word.
    void numadd() {
        count += 1;
    }

    // Ordering and equality look at the word only; the frequency is ignored.
    bool operator<(const word_count& other) const {
        return word.compare(other.word) < 0;
    }

    bool operator==(const word_count& other) const {
        return word.compare(other.word) == 0;
    }

private:
    string word;
    int count;
};

2.2 BSTree类 - 二叉搜索树实现

// One node of the binary search tree: a word/frequency entry plus links to both subtrees.
// All state is private; BSTree reaches it through the friend declaration.
class TreeNode {
    friend class BSTree;

public:
    // Creates a leaf (no children) whose entry is constructed from `w`.
    TreeNode(const string& w) : word(w) {}

private:
    word_count word;
    TreeNode* left = nullptr;
    TreeNode* right = nullptr;
};

class BSTree {
private:
    TreeNode* root;
    int nodeCount;
    
    // 中序遍历辅助函数
    void inorderTraversal(TreeNode* node, string& result) const {
        if (node != nullptr) {
            inorderTraversal(node->left, result);
            result += node->word.get() + " : " + 
                     to_string(node->word.getCount()) + "\n";
            inorderTraversal(node->right, result);
        }
    }
    
    // 销毁树
    void destroyTree(TreeNode* node) {
        if (node != nullptr) {
            destroyTree(node->left);
            destroyTree(node->right);
            delete node;
        }
    }
    
public:
    BSTree() : root(nullptr), nodeCount(0) {}
    
    ~BSTree() {
        destroyTree(root);
    }
    
    // 插入单词
    int inword(const string& word) {
        if (root == nullptr) {
            root = new TreeNode(word);
            nodeCount++;
            return 1;
        }

        TreeNode* current = root;
        TreeNode* parent = nullptr;
        int comparisons = 0;

        while (current != nullptr) {
            comparisons++;
            if (current->word.get() == word) {
                current->word.numadd();
                return comparisons;
            }

            parent = current;
            if (word < current->word.get()) {
                current = current->left;
            } else {
                current = current->right;
            }
        }

        if (nodeCount < 5000) {
            TreeNode* newNode = new TreeNode(word);
            if (word < parent->word.get()) {
                parent->left = newNode;
            } else {
                parent->right = newNode;
            }
            nodeCount++;
            return comparisons;
        }

        return -1;
    }
    
    // 获取所有单词(按字典序)
    string getall() const {
        string result;
        inorderTraversal(root, result);
        return result;
    }
    
    // 查找单词
    int search(const string& word, int& comparisons) const {
        comparisons = 0;
        TreeNode* current = root;
        
        while (current != nullptr) {
            comparisons++;
            if (current->word.get() == word) {
                return current->word.getCount();
            }
            
            if (word < current->word.get()) {
                current = current->left;
            } else {
                current = current->right;
            }
        }
        
        return -1;
    }
};

2.3 HashTable类 - 开放地址法哈希表

class HashTable {
private:
    static const int TABLE_SIZE = 10007;  // 质数表大小
    vector<word_count> table;
    int count;
    
    // DJB2哈希函数
    int hash(const string& word) const {
        unsigned long hash = 5381;
        for (char c : word) {
            hash = ((hash << 5) + hash) + c;
        }
        return hash % TABLE_SIZE;
    }
    
    // 二次探测
    int quadraticProbe(int hash, int i) const {
        return (hash + i * i) % TABLE_SIZE;
    }

public:
    HashTable() : table(TABLE_SIZE), count(0) {}
    
    // 插入单词
    int inword(const string& word) {
        if (count >= TABLE_SIZE * 0.75) return -1;  // 负载因子超过0.75时不再插入

        int hashValue = hash(word);
        int i = 0;
        int pos;

        do {
            pos = quadraticProbe(hashValue, i);
            
            if (table[pos].get().empty()) {
                table[pos].setcount(word);
                count++;
                return i + 1;
            }
            
            if (table[pos].get() == word) {
                table[pos].numadd();
                return i + 1;
            }
            
            i++;
        } while (i < TABLE_SIZE);

        return -1;
    }
    
    // 获取所有单词
    string getall() const {
        stringstream result;
        for (const auto& item : table) {
            if (!item.get().empty()) {
                result << item.get() << " : " << item.getCount() << "\n";
            }
        }
        return result.str();
    }
    
    // 查找单词
    int search(const string& word, int& comparisons) const {
        comparisons = 0;
        int hashValue = hash(word);
        int i = 0;
        int pos;

        do {
            comparisons++;
            pos = quadraticProbe(hashValue, i);
            
            if (table[pos].get() == word) {
                return table[pos].getCount();
            }
            
            if (table[pos].get().empty()) {
                return -1;
            }
            
            i++;
        } while (i < TABLE_SIZE);

        return -1;
    }
};

3. 文件处理与工具类

3.1 FileUtils类 - 文件读写与单词处理

// Stateless helpers for reading words, writing results, normalizing words and timing.
class FileUtils {
public:
    // Reads whitespace-separated tokens from `filename`, normalizes each one with
    // processWord() and returns the non-empty results in file order.
    //
    // Throws runtime_error when the file cannot be opened.
    static vector<string> readWords(const string& filename) {
        ifstream file(filename);
        if (!file.is_open()) {
            throw runtime_error("Unable to open file: " + filename);
        }

        vector<string> words;
        string token;
        while (file >> token) {
            const string cleaned = processWord(token);
            if (!cleaned.empty()) {
                words.push_back(cleaned);
            }
        }

        return words;  // the stream closes itself when it goes out of scope
    }

    // Writes `content` to `filename`, replacing any previous content.
    //
    // Returns false when the file cannot be opened or the write fails.
    static bool writeResults(const string& filename, const string& content) {
        ofstream file(filename);
        if (!file.is_open()) {
            return false;
        }

        file << content;
        file.flush();  // surface buffered write errors before reporting success
        return !file.fail();
    }

    // Normalizes a token: keeps alphabetic characters only and lower-cases them.
    // Punctuation, digits and any other bytes are dropped.
    static string processWord(const string& word) {
        string processed;
        for (char c : word) {
            // <cctype> functions require a value representable as unsigned char;
            // passing a negative char (non-ASCII byte) is undefined behaviour.
            const unsigned char uc = static_cast<unsigned char>(c);
            if (isalpha(uc)) {
                processed += static_cast<char>(tolower(uc));
            }
        }
        return processed;
    }

    // Runs `func` once and returns the elapsed wall time in milliseconds.
    static double getExecutionTime(const function<void()>& func) {
        // steady_clock is monotonic, so the interval cannot be distorted by system
        // clock adjustments.
        const auto start = chrono::steady_clock::now();
        func();
        const auto end = chrono::steady_clock::now();

        const chrono::duration<double, milli> duration = end - start;
        return duration.count();
    }
};

4. 主程序与测试框架

4.1 主测试函数

// Interactive driver for the binary-search-tree strategy: builds the tree from
// InFile.txt, writes the word list to OutFile3.txt, then looks up one word typed by
// the user and reports its frequency, the number of comparisons and the search time.
void testBST() {
    cout << "\n=== Testing Binary Search Tree ===" << endl;
    BSTree tree;
    const auto words = FileUtils::readWords("InFile.txt");

    // Insert every word; inword() returns -1 when a new word is rejected.
    int rejected = 0;
    for (const auto& word : words) {
        if (tree.inword(word) == -1) {
            ++rejected;
        }
    }
    if (rejected > 0) {
        cerr << "Warning: " << rejected
             << " word occurrence(s) skipped because the tree is full" << endl;
    }

    // Write the result file and report a failure instead of ignoring it.
    if (!FileUtils::writeResults("OutFile3.txt", tree.getall())) {
        cerr << "Warning: unable to write OutFile3.txt" << endl;
    }

    // Interactive search.
    string searchWord;
    cout << "Enter a word to search: ";
    if (!(cin >> searchWord)) {
        cerr << "Error: no search word entered" << endl;
        return;
    }
    // Stored words were normalized by processWord() when read, so normalize the query
    // the same way; otherwise "The" or "word," could never match.
    searchWord = FileUtils::processWord(searchWord);

    int comparisons = 0;
    int freq = -1;
    // Time the lookup only; console output is kept out of the measured section.
    const double time = FileUtils::getExecutionTime([&]() {
        freq = tree.search(searchWord, comparisons);
    });

    if (freq != -1) {
        cout << "Word found with frequency: " << freq << endl;
    } else {
        cout << "Word not found" << endl;
    }
    cout << "Comparisons: " << comparisons << endl;
    cout << "Time taken: " << time << " ms" << endl;
}

// 其他测试函数(testSequentialSearch, testBinarySearch, testHashTable等类似)

4.2 主菜单

// Menu loop: repeatedly asks for a strategy and runs the matching test driver until
// the user chooses 0 or the input stream ends. Any exception escaping a driver is
// reported on stderr and turns into exit code 1.
int main() {
    try {
        int choice = -1;
        do {
            cout << "\nWord Frequency Analysis" << endl;
            cout << "1. Sequential Search" << endl;
            cout << "2. Binary Search" << endl;
            cout << "3. Binary Search Tree" << endl;
            cout << "4. Hash Table" << endl;
            cout << "0. Exit" << endl;
            cout << "Enter your choice: ";

            if (!(cin >> choice)) {
                // End of input or an unrecoverable stream error: stop instead of
                // looping forever on a stream that can no longer deliver a choice.
                if (cin.eof() || cin.bad()) {
                    break;
                }
                // Non-numeric input: reset the stream, drop the rest of the line and
                // ask again.
                cin.clear();
                string discarded;
                getline(cin, discarded);
                choice = -1;
                cout << "Invalid choice!" << endl;
                continue;
            }

            switch (choice) {
                case 1:
                    testSequentialSearch();
                    break;
                case 2:
                    testBinarySearch();
                    break;
                case 3:
                    testBST();
                    break;
                case 4:
                    testHashTable();
                    break;
                case 0:
                    cout << "Goodbye!" << endl;
                    break;
                default:
                    cout << "Invalid choice!" << endl;
            }
        } while (choice != 0);

    } catch (const exception& e) {
        cerr << "Error: " << e.what() << endl;
        return 1;
    }

    return 0;
}

5. 关键算法实现细节

5.1 堆排序实现(顺序表)

// Sorts data[0 .. length-1] in place with heap sort; heapify() orders elements by
// their word, largest at the top, so the final order is ascending.
void shunxulist::heapSort() {
    // Phase 1: build the max-heap by sifting down every parent, last parent first.
    int parent = length / 2;
    while (parent-- > 0) {
        heapify(length, parent);
    }

    // Phase 2: move the current maximum behind the shrinking heap, then repair the
    // heap that remains in front of it.
    int last = length - 1;
    while (last > 0) {
        swap(data[0], data[last]);
        heapify(last, 0);
        --last;
    }
}

void shunxulist::heapify(int n, int i) {
    int largest = i;
    int left = 2 * i + 1;
    int right = 2 * i + 2;

    if (left < n && data[left].get() > data[largest].get())
        largest = left;

    if (right < n && data[right].get() > data[largest].get())
        largest = right;

    if (largest != i) {
        swap(data[i], data[largest]);
        heapify(n, largest);
    }
}

5.2 归并排序实现(链表)

// Merge-sorts the list starting at `head` and returns the head of the sorted list.
// Nodes are relinked, not copied.
LinkNode* LinkList::mergeSort(LinkNode* head) {
    // Lists of zero or one node are already sorted.
    const bool hasAtLeastTwo = (head != nullptr) && (head->next != nullptr);
    if (!hasAtLeastTwo) {
        return head;
    }

    // Cut the list right after the node returned by getMiddle()
    // (presumably the last node of the first half; verify against getMiddle).
    LinkNode* const mid = getMiddle(head);
    LinkNode* const secondHalf = mid->next;
    mid->next = nullptr;

    // Sort the first half, then the second, then merge them.
    LinkNode* const sortedFirst = mergeSort(head);
    LinkNode* const sortedSecond = mergeSort(secondHalf);
    return merge(sortedFirst, sortedSecond);
}

// Merges two lists that are each sorted by word into one sorted list and returns its
// head. Nodes are relinked, not copied; when two words are equal the node from
// `first` comes first. Either argument may be nullptr.
//
// Implemented iteratively: a recursive merge uses one stack frame per node and can
// overflow the stack on long lists.
LinkNode* LinkList::merge(LinkNode* first, LinkNode* second) {
    LinkNode* head = nullptr;
    // `tail` points at the pointer that must receive the next node: initially the
    // head pointer, afterwards the `next` field of the last merged node.
    LinkNode** tail = &head;

    while (first != nullptr && second != nullptr) {
        if (first->getWord().get() <= second->getWord().get()) {
            *tail = first;
            first = first->next;
        } else {
            *tail = second;
            second = second->next;
        }
        tail = &(*tail)->next;
    }

    // At most one list still has nodes; append it unchanged.
    *tail = (first != nullptr) ? first : second;
    return head;
}

6. 性能分析与优化

6.1 时间复杂度比较

数据结构	插入操作	查找操作	排序操作
顺序表(顺序查找)	O(1)	O(n)	O(nlogn)
顺序表(二分查找)	O(n)	O(logn)	O(nlogn)
链表	O(1)	O(n)	O(nlogn)
二叉搜索树	O(logn)~O(n)	O(logn)~O(n)	无需排序
哈希表	平均O(1)，最坏O(n)	平均O(1)，最坏O(n)	O(nlogn)