KMP算法

原创已于 2025-03-30 14:04:14 修改 · 154 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#学习 #算法

于 2025-03-16 00:11:10 首次发布

算法专栏收录该内容

16 篇文章

订阅专栏

1 前言

很久之前，大概毕业一两年的时候，当时对写算法有点兴趣，于是自己写了一个strstr，用C写的，想破头也没什么特别的算法，最后也差不多是遍历，找到第一个字符后依次匹配。写了两天差不多完工，但是效率比当时MFC自带的函数差了非常远，大概好几倍吧。

所以后面也放弃了这种自己写库函数的想法了。

最近看leetcode，看到KMP算法，所以也想学一下。找了一个普通的txt文件，大概是2M多。找寻里面的NFC关键字。运行设备是树莓派5（4GB版本）。

2 grep

tom@raspberrypi:~/test/strstr $ time grep NFC t.txt
#      source; NFC; NFD; NFKC; NFKD
#    NFC
#      c2 ==  toNFC(c1) ==  toNFC(c2) ==  toNFC(c3)
#      c4 ==  toNFC(c4) ==  toNFC(c5)
#      X == toNFC(X) == toNFD(X) == toNFKC(X) == toNFKD(X)
# All characters not explicitly occurring in c1 of Part 1 have identical NFC, D, KC, KD forms.

real    0m0.005s
user    0m0.005s
sys     0m0.000s

使用最常见的grep，耗时大概是5ms。

3 常规的遍历

代码如下：

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Naive substring search.
 *
 * Returns a pointer to the first occurrence of needle inside haystack, or
 * NULL when needle does not occur. An empty needle matches at the start of
 * haystack. Both arguments must be NUL-terminated strings.
 */
char *my_strstr(const char *haystack, const char *needle) {
    if (!*needle) {
        return (char *)haystack;  // empty needle: match at the very beginning
    }

    while (*haystack) {
        const char *h = haystack;
        const char *n = needle;

        // Walk both cursors forward for as long as the characters agree.
        while (*h && *n && *h == *n) {
            h++;
            n++;
        }

        if (!*n) {
            return (char *)haystack;  // needle fully consumed: match starts here
        }

        haystack++;  // try the next starting position
    }

    return NULL;  // ran off the end of haystack without a match
}

/*
 * Counts how many times word occurs in text, using my_strstr.
 *
 * Overlapping occurrences are counted, because the search resumes one
 * character after the start of each match. An empty word yields 0.
 */
int count_occurrences(const char *text, const char *word) {
    // my_strstr never returns NULL for an empty needle, so without this
    // guard the loop below would never terminate and ptr would run past
    // the end of text.
    if (!*word) {
        return 0;
    }

    int count = 0;
    const char *ptr = text;
    while ((ptr = my_strstr(ptr, word)) != NULL) {
        count++;
        ptr++;  // step past the match start and keep searching
    }
    return count;
}

/*
 * Usage: <program> <file> <string>
 *
 * Reads the whole file into a NUL-terminated heap buffer and prints how many
 * times <string> occurs in it. Returns 0 on success and 1 on a usage, I/O or
 * allocation error.
 */
int main(int argc, char *argv[]) {
    if (argc != 3) {
        fprintf(stderr, "Usage: %s <file> <string>\n", argv[0]);
        return 1;
    }

    const char *filename = argv[1];
    const char *word = argv[2];

    FILE *file = fopen(filename, "r");
    if (!file) {
        perror("Error opening file");
        return 1;
    }

    // Determine the file size. Both calls can fail (for example on a pipe),
    // and ftell reports failure as -1, which must not reach malloc or be
    // used as an index.
    if (fseek(file, 0, SEEK_END) != 0) {
        perror("Error seeking file");
        fclose(file);
        return 1;
    }
    long file_size = ftell(file);
    if (file_size < 0) {
        perror("Error getting file size");
        fclose(file);
        return 1;
    }
    rewind(file);

    char *buffer = malloc((size_t)file_size + 1);
    if (!buffer) {
        perror("Memory allocation failed");
        fclose(file);
        return 1;
    }

    size_t bytes_read = fread(buffer, 1, (size_t)file_size, file);
    if (bytes_read != (size_t)file_size && ferror(file)) {
        perror("Error reading file");
        free(buffer);
        fclose(file);
        return 1;
    }
    // Terminate after the bytes actually read, so a short read never leaves
    // uninitialised bytes inside the searched string.
    buffer[bytes_read] = '\0';
    fclose(file);

    int occurrences = count_occurrences(buffer, word);
    printf("Occurrences of '%s': %d\n", word, occurrences);

    free(buffer);
    return 0;
}

耗时如下：

tom@raspberrypi:~/test/strstr $ time ./a.out t.txt NFC
Occurrences of 'NFC': 9

real    0m0.022s
user    0m0.022s
sys     0m0.000s

大概是22毫秒。比grep慢了4倍多，而且还没有输出匹配到的行。

4 KMP算法

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Builds the KMP prefix table for pattern.
 *
 * After the call, lps[k] holds the length of the longest proper prefix of
 * pattern[0..k] that is also a suffix of it. pattern must provide at least
 * M characters and lps must have room for M ints; lps[0] is always written.
 */
void computeLPSArray(const char *pattern, int M, int *lps) {
    int border = 0;  // length of the border carried over from the previous position
    lps[0] = 0;

    for (int pos = 1; pos < M; pos++) {
        // On a mismatch, fall back to successively shorter borders.
        while (border > 0 && pattern[pos] != pattern[border]) {
            border = lps[border - 1];
        }
        if (pattern[pos] == pattern[border]) {
            border++;
        }
        lps[pos] = border;
    }
}

/*
 * Counts the occurrences of pattern in text with the KMP algorithm.
 *
 * Overlapping occurrences are counted. Returns the number of matches, 0 for
 * an empty pattern, or -1 when the prefix table cannot be allocated (a
 * message is printed with perror in that case).
 */
int KMP_search(const char *text, const char *pattern) {
    // The text length stays a size_t: narrowing strlen's result to int would
    // break the scan for inputs of 2 GiB or more.
    const size_t N = strlen(text);
    // The pattern length is an int because computeLPSArray takes it as one.
    const int M = (int)strlen(pattern);
    if (M == 0) return 0;

    int *lps = malloc((size_t)M * sizeof *lps);
    if (!lps) {
        perror("Memory allocation failed");
        return -1;
    }

    computeLPSArray(pattern, M, lps);

    size_t i = 0;   // position in text
    int j = 0;      // number of pattern characters currently matched
    int count = 0;
    while (i < N) {
        if (pattern[j] == text[i]) {
            i++;
            j++;
        }
        if (j == M) {
            // Full match: record it and reuse the longest border so that
            // overlapping matches are found without rescanning text.
            count++;
            j = lps[j - 1];
        } else if (i < N && pattern[j] != text[i]) {
            if (j != 0) {
                j = lps[j - 1];  // retry with a shorter matched prefix
            } else {
                i++;             // nothing matched: advance in text
            }
        }
    }

    free(lps);
    return count;
}

/*
 * Usage: <program> <file> <string>
 *
 * Reads the whole file into a NUL-terminated heap buffer and prints how many
 * times <string> occurs in it according to KMP_search. Returns 0 on success
 * and 1 on a usage, I/O, allocation or search error.
 */
int main(int argc, char *argv[]) {
    if (argc != 3) {
        fprintf(stderr, "Usage: %s <file> <string>\n", argv[0]);
        return 1;
    }

    const char *filename = argv[1];
    const char *word = argv[2];

    FILE *file = fopen(filename, "r");
    if (!file) {
        perror("Error opening file");
        return 1;
    }

    // Determine the file size. Both calls can fail (for example on a pipe),
    // and ftell reports failure as -1, which must not reach malloc or be
    // used as an index.
    if (fseek(file, 0, SEEK_END) != 0) {
        perror("Error seeking file");
        fclose(file);
        return 1;
    }
    long file_size = ftell(file);
    if (file_size < 0) {
        perror("Error getting file size");
        fclose(file);
        return 1;
    }
    rewind(file);

    char *buffer = malloc((size_t)file_size + 1);
    if (!buffer) {
        perror("Memory allocation failed");
        fclose(file);
        return 1;
    }

    size_t bytes_read = fread(buffer, 1, (size_t)file_size, file);
    if (bytes_read != (size_t)file_size && ferror(file)) {
        perror("Error reading file");
        free(buffer);
        fclose(file);
        return 1;
    }
    // Terminate after the bytes actually read, so a short read never leaves
    // uninitialised bytes inside the searched string.
    buffer[bytes_read] = '\0';
    fclose(file);

    int occurrences = KMP_search(buffer, word);
    free(buffer);
    if (occurrences < 0) {
        // KMP_search has already reported the failure; do not print the
        // error code as if it were a match count.
        return 1;
    }
    printf("Occurrences of '%s': %d\n", word, occurrences);

    return 0;
}

运行结果：

tom@raspberrypi:~/test/strstr $ time ./str2 t.txt NFC
Occurrences of 'NFC': 9

real    0m0.023s
user    0m0.019s
sys     0m0.004s

好吧。。。效率并没有提升，反而还慢了1ms。。。

换了长一些的匹配字依旧。。。

tom@raspberrypi:~/test/strstr $ time ./str1 t.txt "CJK COMPATIBILITY IDEOGRAPH"
Occurrences of 'CJK COMPATIBILITY IDEOGRAPH': 1002

real    0m0.023s
user    0m0.019s
sys     0m0.004s
tom@raspberrypi:~/test/strstr $ time ./str2 t.txt "CJK COMPATIBILITY IDEOGRAPH"
Occurrences of 'CJK COMPATIBILITY IDEOGRAPH': 1002

real    0m0.024s
user    0m0.020s
sys     0m0.004s

5 glibc的strstr

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

/*
 * Counts how many times pattern occurs in text, using the C library strstr.
 *
 * Overlapping occurrences are counted, because the search resumes one
 * character after the start of each match. An empty pattern yields 0.
 * text must be NUL-terminated.
 */
int count_occurrences(const char *text, const char *pattern) {
    // strstr always succeeds for an empty pattern, so without this guard
    // pos would be advanced past the terminator and the loop would never end.
    if (*pattern == '\0') {
        return 0;
    }

    int count = 0;
    const char *pos = text;
    while ((pos = strstr(pos, pattern)) != NULL) {
        count++;
        pos++;  // step past the match start so the same match is not found again
    }
    return count;
}

/*
 * Usage: <program> <file> <string>
 *
 * Maps the file into memory and prints how many times <string> occurs in it.
 * Returns 0 on success and 1 on a usage or I/O error.
 */
int main(int argc, char *argv[]) {
    if (argc != 3) {
        fprintf(stderr, "Usage: %s <file> <string>\n", argv[0]);
        return 1;
    }

    const char *filename = argv[1];
    const char *word = argv[2];

    int fd = open(filename, O_RDONLY);
    if (fd == -1) {
        perror("Error opening file");
        return 1;
    }

    struct stat sb;
    if (fstat(fd, &sb) == -1) {
        perror("Error getting file size");
        close(fd);
        return 1;
    }

    const size_t file_len = (size_t)sb.st_size;
    const size_t map_len = file_len + 1;  // one extra byte for the terminator

    // The mapping is searched as a C string, so it needs a NUL byte after
    // the file contents. A plain file mapping only has one by accident:
    // there is none when the size is an exact multiple of the page size,
    // and a zero-length mapping is rejected outright. So reserve
    // file_len + 1 zero-filled bytes first (a private mapping of /dev/zero)
    // and then place the file over the start of that region.
    int zero_fd = open("/dev/zero", O_RDONLY);
    if (zero_fd == -1) {
        perror("Error opening /dev/zero");
        close(fd);
        return 1;
    }
    char *buffer = mmap(NULL, map_len, PROT_READ, MAP_PRIVATE, zero_fd, 0);
    close(zero_fd);
    if (buffer == MAP_FAILED) {
        perror("Error mapping file");
        close(fd);
        return 1;
    }

    if (file_len > 0 &&
        mmap(buffer, file_len, PROT_READ, MAP_PRIVATE | MAP_FIXED, fd, 0) == MAP_FAILED) {
        perror("Error mapping file");
        munmap(buffer, map_len);
        close(fd);
        return 1;
    }
    close(fd);  // the mapping stays valid after the descriptor is closed

    int occurrences = count_occurrences(buffer, word);
    printf("Occurrences of '%s': %d\n", word, occurrences);

    munmap(buffer, map_len);
    return 0;
}

运行结果：

tom@raspberrypi:~/test/strstr $ time ./str3 t.txt "CJK COMPATIBILITY IDEOGRAPH"
Occurrences of 'CJK COMPATIBILITY IDEOGRAPH': 1002

real    0m0.002s
user    0m0.000s
sys     0m0.002s

好吧，比自己写的快了差不多10倍。。。

tom@raspberrypi:~/test/strstr $ time grep "CJK COMPATIBILITY IDEOGRAPH" t.txt -c
1002

real    0m0.003s
user    0m0.003s
sys     0m0.000s

比grep也快了差不多1ms。。。

6 后记

	暴力str	KMP	grep	strstr
耗时	22ms	23ms	3ms	2ms

本来还想详细学学KMP，通过横评可以看出，效率的关键还真不在于算法。不过grep和strstr都是开源的，回头有时间再学习学习。。。

有时候真的要感叹，刷题是一回事，工程实践又是一回事。。。