【PTA数据结构 | C语言版】迷你搜索引擎_迷你搜索引擎c语言-CSDN博客

本专栏持续输出数据结构题目集，欢迎订阅。

文章目录

- 题目
- 代码

题目

实现一种简单的搜索引擎功能，快速满足多达 10^5 条关键字查询请求。

输入格式：
输入首先给出正整数 n（≤ 100），为文件总数。随后按以下格式给出每个文件的内容：第一行给出文件的标题，随后给出不超过 100 行的文件正文，最后在一行中只给出一个字符 #，表示文件结束。每行不超过 50 个字符。在 n 个文件内容结束之后，给出查询总数 m（≤10^5），随后 m 行，每行给出不超过 10 个英文单词，其间以空格分隔，每个单词不超过 10 个英文字母，不区分大小写。

输出格式：
针对每一条查询，首先在一行中输出包含全部该查询单词的文件总数；如果总数为 0，则输出 Not Found。如果有找到符合条件的文件，则按输入的先后顺序输出这些文件，格式为：第1行输出文件标题；随后顺序输出包含查询单词的那些行内容。注意不能把相同的一行重复输出。

输入样例：

4
A00
Gold
silver truck
#
A01
Shipment of gold
damaged
in a fire
#
A02
Delivery
of silver
arrived in
a silver
truck
#
A03
Shipment of gold
arrived in
a truck
#
2
what ever
silver truck

输出样例：

0
Not Found
2
A00
silver truck
A02
of silver
a silver
truck

代码

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <stdbool.h>

// Capacity limits used to size the static buffers and tables in this file.
// The string-length limits are buffer sizes, so they include the terminating NUL.
#define MAX_FILES 1000               // capacity of the files[] array
#define MAX_LINES_PER_FILE 1000      // max body lines stored per file
#define MAX_LINE_LENGTH 510          // buffer size for one body line
#define MAX_TITLE_LENGTH 510         // buffer size for a file title
#define MAX_WORD_LENGTH 110          // buffer size for one word
#define MAX_WORDS_PER_QUERY 100      // max words parsed from a single query
#define HASH_TABLE_SIZE 100007       // number of hash buckets (NOTE: 100007 = 97 * 1031, so it is not prime)

// One input file: its title plus the body lines in the order they were read.
typedef struct {
    char title[MAX_TITLE_LENGTH];   // file title (NUL-terminated)
    char lines[MAX_LINES_PER_FILE][MAX_LINE_LENGTH];  // body lines (NUL-terminated)
    int line_count;                 // number of valid entries in lines[]
} FileInfo;

// Inverted-index posting: one (file, line) location where a word occurs.
// Postings for the same word are chained into a singly linked list.
typedef struct IndexNode {
    int file_id;                    // index into files[]
    int line_num;                   // 0-based line number within that file
    struct IndexNode* next;         // next posting for the same word, or NULL
} IndexNode;

// Hash-table entry: a word together with the list of places it occurs.
typedef struct HashNode {
    char word[MAX_WORD_LENGTH];     // the word, stored lowercased
    IndexNode* index_list;          // head of this word's posting list
    struct HashNode* next;          // next entry in the same bucket (collision chain)
} HashNode;

// Global program state: the stored files and the word -> postings hash table.
// NOTE(review): with the current limits sizeof(files) is roughly
// 1000 * 1000 * 510 bytes (about 510 MB) of static storage — confirm this is
// acceptable for the target environment.
FileInfo files[MAX_FILES];          // every file read from the input
int file_count = 0;                 // number of files actually read
HashNode* hash_table[HASH_TABLE_SIZE] = {NULL};  // bucket heads, all initially empty

// Map an ASCII uppercase letter to its lowercase counterpart.
// Any other character is returned unchanged.
char to_lower(char c) {
    return (c >= 'A' && c <= 'Z') ? (char)(c + ('a' - 'A')) : c;
}

// Hash a NUL-terminated string to a bucket index in [0, HASH_TABLE_SIZE).
// Each character is folded in as acc = acc * 32 + ch, using unsigned
// (wrapping) arithmetic, and the result is reduced modulo the table size.
unsigned int hash(const char* str) {
    unsigned int acc = 0;
    for (const char* p = str; *p != '\0'; p++) {
        acc = acc * 32u + (unsigned int)*p;
    }
    return acc % HASH_TABLE_SIZE;
}

// Record that `word` occurs on line `line_num` of file `file_id`.
//
// The word is lowercased (and truncated to MAX_WORD_LENGTH - 1 characters)
// before being looked up in hash_table. If the word already has an entry, a
// new posting is prepended to its list unless that exact (file_id, line_num)
// pair is already recorded. If the word has no entry yet, one is created at
// the head of its bucket with a single posting.
//
// A NULL or empty `word` is ignored. If memory cannot be allocated, a message
// is written to stderr and the program exits with EXIT_FAILURE.
void insert_word(const char* word, int file_id, int line_num) {
    if (word == NULL || word[0] == '\0') return;

    // Build the lowercase lookup key.
    char lower_word[MAX_WORD_LENGTH];
    int len = 0;
    while (word[len] && len < MAX_WORD_LENGTH - 1) {
        lower_word[len] = to_lower(word[len]);
        len++;
    }
    lower_word[len] = '\0';

    unsigned int h = hash(lower_word);

    // Look for an existing entry for this word in its bucket.
    HashNode* entry = hash_table[h];
    while (entry != NULL && strcmp(entry->word, lower_word) != 0) {
        entry = entry->next;
    }

    // Known word: nothing to do if this exact location is already recorded.
    if (entry != NULL) {
        for (const IndexNode* p = entry->index_list; p != NULL; p = p->next) {
            if (p->file_id == file_id && p->line_num == line_num) {
                return;
            }
        }
    }

    IndexNode* posting = malloc(sizeof *posting);
    if (posting == NULL) {
        fprintf(stderr, "insert_word: out of memory\n");
        exit(EXIT_FAILURE);
    }
    posting->file_id = file_id;
    posting->line_num = line_num;

    // New word: create its entry and link it at the head of the bucket.
    if (entry == NULL) {
        entry = malloc(sizeof *entry);
        if (entry == NULL) {
            free(posting);
            fprintf(stderr, "insert_word: out of memory\n");
            exit(EXIT_FAILURE);
        }
        memcpy(entry->word, lower_word, (size_t)len + 1);
        entry->index_list = NULL;
        entry->next = hash_table[h];
        hash_table[h] = entry;
    }

    // Prepend the posting to the word's list.
    posting->next = entry->index_list;
    entry->index_list = posting;
}

// Split `line` into words and add each one to the index for
// (file_id, line_num).
//
// A word is a maximal run of alphabetic characters; every other character
// acts as a separator. Letters beyond MAX_WORD_LENGTH - 1 in a single run are
// dropped, so an over-long word is indexed in truncated form.
void index_line(const char* line, int file_id, int line_num) {
    char word[MAX_WORD_LENGTH];
    int word_len = 0;

    for (const char* p = line; ; p++) {
        // <ctype.h> functions require a value representable as unsigned char
        // (or EOF); passing a negative plain char is undefined behaviour.
        if (*p != '\0' && isalpha((unsigned char)*p)) {
            if (word_len < MAX_WORD_LENGTH - 1) {
                word[word_len++] = *p;
            }
            continue;
        }

        // Separator or end of line: flush the word collected so far.
        if (word_len > 0) {
            word[word_len] = '\0';
            insert_word(word, file_id, line_num);
            word_len = 0;
        }
        if (*p == '\0') {
            break;
        }
    }
}

// 查找单词在哈希表中的索引链表
IndexNode* find_word(const char* word) {
    char lower_word[MAX_WORD_LENGTH];
    int len = 0;
    while (word[len] && len < MAX_WORD_LENGTH - 1) {
        lower_word[len] = to_lower(word[len]);
        len++;
    }
    lower_word[len] = '\0';
    
    unsigned int h = hash(lower_word);
    HashNode* current = hash_table[h];
    
    while (current) {
        if (strcmp(current->word, lower_word) == 0) {
            return current->index_list;
        }
        current = current->next;
    }
    
    return NULL;  // 未找到
}

// Split a query string into words.
//
// query: NUL-terminated query text.
// words: output array with room for at least MAX_WORDS_PER_QUERY rows; each
//        parsed word is stored NUL-terminated in the next row.
// Returns the number of words stored (at most MAX_WORDS_PER_QUERY).
//
// A word is a maximal run of alphabetic characters and any other character
// separates words. This is the same tokenisation index_line applies to file
// lines, so a query word always has the form under which text was indexed.
// Letters beyond MAX_WORD_LENGTH - 1 in a single word are dropped.
int parse_query(const char* query, char words[][MAX_WORD_LENGTH]) {
    int word_count = 0;
    int word_len = 0;

    for (const char* p = query; word_count < MAX_WORDS_PER_QUERY; p++) {
        // Cast first: isalpha on a negative plain char is undefined behaviour.
        if (*p != '\0' && isalpha((unsigned char)*p)) {
            if (word_len < MAX_WORD_LENGTH - 1) {
                words[word_count][word_len++] = *p;
            }
            continue;
        }

        // Separator or end of query: finish the current word, if any.
        if (word_len > 0) {
            words[word_count][word_len] = '\0';
            word_count++;
            word_len = 0;
        }
        if (*p == '\0') {
            break;
        }
    }

    return word_count;
}

// 查找包含所有查询单词的文件
int find_matching_files(char query_words[][MAX_WORD_LENGTH], int word_count, 
                       int* result_files, int* line_masks[]) {
    if (word_count == 0) return 0;
    
    // 为每个查询词找到对应的文件集
    IndexNode* index_lists[MAX_WORDS_PER_QUERY];
    for (int i = 0; i < word_count; i++) {
        index_lists[i] = find_word(query_words[i]);
        if (index_lists[i] == NULL) {
            // 有一个词不存在，直接返回0
            return 0;
        }
    }
    
    // 收集第一个词出现的所有文件
    bool in_first_set[MAX_FILES] = {false};
    IndexNode* current = index_lists[0];
    while (current) {
        if (current->file_id < file_count) {
            in_first_set[current->file_id] = true;
        }
        current = current->next;
    }
    
    // 找出所有词都出现的文件（交集）
    int match_count = 0;
    for (int file_id = 0; file_id < file_count; file_id++) {
        if (!in_first_set[file_id]) continue;
        
        // 检查该文件是否包含所有查询词
        bool all_match = true;
        for (int i = 1; i < word_count; i++) {
            bool found = false;
            current = index_lists[i];
            while (current) {
                if (current->file_id == file_id) {
                    found = true;
                    break;
                }
                current = current->next;
            }
            if (!found) {
                all_match = false;
                break;
            }
        }
        
        if (all_match) {
            // 记录匹配的文件ID
            result_files[match_count] = file_id;
            
            // 创建行掩码，记录包含至少一个查询词的行
            line_masks[match_count] = (int*)calloc(files[file_id].line_count, sizeof(int));
            for (int i = 0; i < word_count; i++) {
                current = index_lists[i];
                while (current) {
                    if (current->file_id == file_id && 
                        current->line_num < files[file_id].line_count) {
                        line_masks[match_count][current->line_num] = 1;
                    }
                    current = current->next;
                }
            }
            
            match_count++;
        }
    }
    
    return match_count;
}

int main() {
    // 读取文件
    int n;
    scanf("%d", &n);
    getchar();  // 消耗换行符
    
    for (int i = 0; i < n && i < MAX_FILES; i++) {
        // 读取标题
        if (fgets(files[i].title, MAX_TITLE_LENGTH, stdin) == NULL) break;
        // 去除标题末尾的换行符
        files[i].title[strcspn(files[i].title, "\n")] = '\0';
        
        // 读取文件内容行
        files[i].line_count = 0;
        char line[MAX_LINE_LENGTH];
        while (1) {
            if (fgets(line, MAX_LINE_LENGTH, stdin) == NULL) break;
            // 去除行末尾的换行符
            line[strcspn(line, "\n")] = '\0';
            
            // 检查是否是结束标记
            if (strcmp(line, "#") == 0) break;
            
            // 保存行内容
            if (files[i].line_count < MAX_LINES_PER_FILE) {
                strcpy(files[i].lines[files[i].line_count], line);
                // 索引此行
                index_line(line, i, files[i].line_count);
                files[i].line_count++;
            }
        }
        
        file_count++;
    }
    
    // 处理查询
    int m;
    scanf("%d", &m);
    getchar();  // 消耗换行符
    
    for (int i = 0; i < m; i++) {
        char query[MAX_WORDS_PER_QUERY * (MAX_WORD_LENGTH + 1)];
        if (fgets(query, sizeof(query), stdin) == NULL) break;
        // 去除查询末尾的换行符
        query[strcspn(query, "\n")] = '\0';
        
        // 解析查询为单词数组
        char query_words[MAX_WORDS_PER_QUERY][MAX_WORD_LENGTH];
        int word_count = parse_query(query, query_words);
        
        // 查找匹配的文件
        int result_files[MAX_FILES];
        int* line_masks[MAX_FILES] = {NULL};
        int match_count = find_matching_files(query_words, word_count, result_files, line_masks);
        
        // 输出结果
        printf("%d\n", match_count);
        if (match_count == 0) {
            printf("Not Found\n");
        } else {
            for (int j = 0; j < match_count; j++) {
                int file_id = result_files[j];
                // 输出文件标题
                printf("%s\n", files[file_id].title);
                
                // 输出包含查询词的行
                for (int line_num = 0; line_num < files[file_id].line_count; line_num++) {
                    if (line_masks[j][line_num]) {
                        printf("%s\n", files[file_id].lines[line_num]);
                    }
                }
                
                // 释放行掩码内存
                free(line_masks[j]);
            }
        }
    }
    
    // 释放哈希表内存（实际应用中需要，但OJ环境通常可以省略）
    for (int i = 0; i < HASH_TABLE_SIZE; i++) {
        HashNode* current_hash = hash_table[i];
        while (current_hash) {
            HashNode* temp_hash = current_hash;
            IndexNode* current_index = current_hash->index_list;
            while (current_index) {
                IndexNode* temp_index = current_index;
                current_index = current_index->next;
                free(temp_index);
            }
            current_hash = current_hash->next;
            free(temp_hash);
        }
    }
    
    return 0;
}