KMP算法

1 前言

很久之前,大概毕业一两年的时候,当时对写算法有点兴趣,于是自己写了一个strstr,用C写的,想破头也没什么特别的算法,最后也差不多是遍历,找到第一个字符后依次匹配。写了两天差不多完工,但是效率比当时MFC自带的函数差了非常远,大概好几倍吧。

所以后面也放弃了这种自己写库函数的想法了。

最近看leetcode,看到KMP算法,所以也想学一下。找了一个普通的txt文件,大概是2M多。找寻里面的NFC关键字。运行设备是树莓派5(4GB版本)。

2 grep

tom@raspberrypi:~/test/strstr $ time grep NFC t.txt
#      source; NFC; NFD; NFKC; NFKD
#    NFC
#      c2 ==  toNFC(c1) ==  toNFC(c2) ==  toNFC(c3)
#      c4 ==  toNFC(c4) ==  toNFC(c5)
#      X == toNFC(X) == toNFD(X) == toNFKC(X) == toNFKD(X)
# All characters not explicitly occurring in c1 of Part 1 have identical NFC, D, KC, KD forms.

real    0m0.005s
user    0m0.005s
sys     0m0.000s

使用最常见的grep,耗时大概是5ms。

3 常规的遍历

代码如下:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Naive substring search.
 *
 * Returns a pointer to the first occurrence of `needle` inside `haystack`,
 * or NULL when there is none. An empty `needle` matches at the very start
 * of `haystack`. Both arguments must be NUL-terminated strings.
 */
char *my_strstr(const char *haystack, const char *needle) {
    if (!*needle) return (char *)haystack;  // empty needle: match at the start

    for (; *haystack; haystack++) {
        const char *h = haystack;
        const char *n = needle;

        // Walk both strings while they agree and neither has ended.
        while (*h && *n && *h == *n) {
            h++;
            n++;
        }

        // The whole needle was consumed, so a match starts at `haystack`.
        if (!*n) return (char *)haystack;
    }

    return NULL;  // no match anywhere
}

/*
 * Counts how many times `word` occurs in `text` using my_strstr().
 *
 * After each hit the search resumes one character further on, so
 * overlapping occurrences are counted. An empty `word` yields 0: it would
 * otherwise match at every position and the loop would never stop,
 * walking past the end of `text`.
 */
int count_occurrences(const char *text, const char *word) {
    if (!*word) return 0;

    int count = 0;
    const char *ptr = text;
    while ((ptr = my_strstr(ptr, word)) != NULL) {
        count++;
        ptr++;  // resume the search just after the start of this match
    }
    return count;
}

/*
 * Usage: <program> <file> <string>
 *
 * Reads the whole file into a NUL-terminated heap buffer and prints how
 * many times <string> occurs in it. Returns 0 on success and 1 on a usage
 * or I/O error.
 */
int main(int argc, char *argv[]) {
    if (argc != 3) {
        fprintf(stderr, "Usage: %s <file> <string>\n", argv[0]);
        return 1;
    }

    const char *filename = argv[1];
    const char *word = argv[2];

    FILE *file = fopen(filename, "r");
    if (!file) {
        perror("Error opening file");
        return 1;
    }

    // Determine the file size by seeking to the end.
    if (fseek(file, 0, SEEK_END) != 0) {
        perror("Error seeking file");
        fclose(file);
        return 1;
    }
    long file_size = ftell(file);
    if (file_size < 0) {
        // ftell reports failure as -1; using it as a size would corrupt memory.
        perror("Error getting file size");
        fclose(file);
        return 1;
    }
    rewind(file);

    char *buffer = malloc((size_t)file_size + 1);
    if (!buffer) {
        perror("Memory allocation failed");
        fclose(file);
        return 1;
    }

    size_t bytes_read = fread(buffer, 1, (size_t)file_size, file);
    if (ferror(file)) {
        perror("Error reading file");
        free(buffer);
        fclose(file);
        return 1;
    }
    // Terminate at what was actually read, which may be less than file_size.
    buffer[bytes_read] = '\0';
    fclose(file);

    int occurrences = count_occurrences(buffer, word);
    printf("Occurrences of '%s': %d\n", word, occurrences);

    free(buffer);
    return 0;
}

耗时如下:

tom@raspberrypi:~/test/strstr $ time ./a.out t.txt NFC
Occurrences of 'NFC': 9

real    0m0.022s
user    0m0.022s
sys     0m0.000s

大概是22毫秒。比grep慢了4倍多,而且还没有输出匹配的行。

4 KMP算法

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Builds the KMP failure table for `pattern`.
 *
 * On return, lps[k] is the length of the longest proper prefix of
 * pattern[0..k] that is also a suffix of it. `lps` must have room for M
 * ints and M must be at least 1 (lps[0] is always written).
 */
void computeLPSArray(const char *pattern, int M, int *lps) {
    int matched = 0;  // length of the border currently being extended

    lps[0] = 0;
    for (int pos = 1; pos < M; pos++) {
        // On mismatch, fall back to successively shorter borders.
        while (matched > 0 && pattern[pos] != pattern[matched]) {
            matched = lps[matched - 1];
        }
        if (pattern[pos] == pattern[matched]) {
            matched++;
        }
        lps[pos] = matched;
    }
}

/*
 * Counts the occurrences of `pattern` in `text` with the KMP algorithm.
 *
 * Overlapping occurrences are counted. Returns 0 for an empty pattern and
 * -1 (after reporting through perror) if the failure table cannot be
 * allocated.
 */
int KMP_search(const char *text, const char *pattern) {
    int N = strlen(text);
    int M = strlen(pattern);
    if (M == 0) return 0;

    int *lps = (int *)malloc(M * sizeof(int));
    if (lps == NULL) {
        perror("Memory allocation failed");
        return -1;
    }

    computeLPSArray(pattern, M, lps);

    int hits = 0;
    int matched = 0;  // number of pattern characters currently matched
    for (int pos = 0; pos < N; pos++) {
        // Shrink the current partial match until text[pos] can extend it.
        while (matched > 0 && pattern[matched] != text[pos]) {
            matched = lps[matched - 1];
        }
        if (pattern[matched] == text[pos]) {
            matched++;
        }
        if (matched == M) {
            hits++;
            // Keep the longest border so overlapping matches are found too.
            matched = lps[matched - 1];
        }
    }

    free(lps);
    return hits;
}

/*
 * Usage: <program> <file> <string>
 *
 * Reads the whole file into a NUL-terminated heap buffer and prints how
 * many times <string> occurs in it, counted by KMP_search(). Returns 0 on
 * success and 1 on a usage, I/O or allocation error.
 */
int main(int argc, char *argv[]) {
    if (argc != 3) {
        fprintf(stderr, "Usage: %s <file> <string>\n", argv[0]);
        return 1;
    }

    const char *filename = argv[1];
    const char *word = argv[2];

    FILE *file = fopen(filename, "r");
    if (!file) {
        perror("Error opening file");
        return 1;
    }

    // Determine the file size by seeking to the end.
    if (fseek(file, 0, SEEK_END) != 0) {
        perror("Error seeking file");
        fclose(file);
        return 1;
    }
    long file_size = ftell(file);
    if (file_size < 0) {
        // ftell reports failure as -1; using it as a size would corrupt memory.
        perror("Error getting file size");
        fclose(file);
        return 1;
    }
    rewind(file);

    char *buffer = malloc((size_t)file_size + 1);
    if (!buffer) {
        perror("Memory allocation failed");
        fclose(file);
        return 1;
    }

    size_t bytes_read = fread(buffer, 1, (size_t)file_size, file);
    if (ferror(file)) {
        perror("Error reading file");
        free(buffer);
        fclose(file);
        return 1;
    }
    // Terminate at what was actually read, which may be less than file_size.
    buffer[bytes_read] = '\0';
    fclose(file);

    int occurrences = KMP_search(buffer, word);
    free(buffer);
    if (occurrences < 0) {
        // KMP_search already reported the allocation failure via perror.
        return 1;
    }
    printf("Occurrences of '%s': %d\n", word, occurrences);

    return 0;
}

运行结果:

tom@raspberrypi:~/test/strstr $ time ./str2 t.txt NFC
Occurrences of 'NFC': 9

real    0m0.023s
user    0m0.019s
sys     0m0.004s

好吧。。。效率并没有提升,反而还慢了1ms。。。

换了长一些的匹配字依旧。。。

tom@raspberrypi:~/test/strstr $ time ./str1 t.txt "CJK COMPATIBILITY IDEOGRAPH"
Occurrences of 'CJK COMPATIBILITY IDEOGRAPH': 1002

real    0m0.023s
user    0m0.019s
sys     0m0.004s
tom@raspberrypi:~/test/strstr $ time ./str2 t.txt "CJK COMPATIBILITY IDEOGRAPH"
Occurrences of 'CJK COMPATIBILITY IDEOGRAPH': 1002

real    0m0.024s
user    0m0.020s
sys     0m0.004s

5 glibc的strstr

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

/*
 * Counts how many times `pattern` occurs in `text` using the C library's
 * strstr().
 *
 * After each hit the search resumes one character further on, so
 * overlapping occurrences are counted. An empty `pattern` yields 0: it
 * would otherwise match at every position and the loop would run past the
 * terminating NUL of `text`.
 */
int count_occurrences(const char *text, const char *pattern) {
    if (*pattern == '\0') return 0;

    int count = 0;
    const char *pos = text;
    while ((pos = strstr(pos, pattern)) != NULL) {
        count++;
        pos++;  // step past the start of this match before searching again
    }
    return count;
}

/*
 * Usage: <program> <file> <string>
 *
 * Maps the file read-only and prints how many times <string> occurs in
 * it. Returns 0 on success and 1 on a usage or I/O error.
 *
 * count_occurrences() relies on strstr(), which needs a NUL-terminated
 * buffer. A plain file mapping has no terminator when the file size is an
 * exact multiple of the page size, so a zero-filled anonymous region of
 * size + 1 bytes is reserved first and the file is mapped over its start.
 */
int main(int argc, char *argv[]) {
    if (argc != 3) {
        fprintf(stderr, "Usage: %s <file> <string>\n", argv[0]);
        return 1;
    }

    const char *filename = argv[1];
    const char *word = argv[2];

    int fd = open(filename, O_RDONLY);
    if (fd == -1) {
        perror("Error opening file");
        return 1;
    }

    struct stat sb;
    if (fstat(fd, &sb) == -1) {
        perror("Error getting file size");
        close(fd);
        return 1;
    }

    size_t file_len = (size_t)sb.st_size;
    size_t map_len = file_len + 1;  // one extra byte for the NUL terminator

    // Anonymous mappings are zero-filled, so the byte after the file is NUL.
    char *buffer = mmap(NULL, map_len, PROT_READ,
                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (buffer == MAP_FAILED) {
        perror("Error mapping file");
        close(fd);
        return 1;
    }

    // Overlay the file contents; mmap rejects a zero length, so an empty
    // file simply keeps the all-zero reservation (an empty string).
    if (file_len > 0 &&
        mmap(buffer, file_len, PROT_READ, MAP_PRIVATE | MAP_FIXED, fd, 0) == MAP_FAILED) {
        perror("Error mapping file");
        munmap(buffer, map_len);
        close(fd);
        return 1;
    }
    close(fd);

    int occurrences = count_occurrences(buffer, word);
    printf("Occurrences of '%s': %d\n", word, occurrences);

    munmap(buffer, map_len);
    return 0;
}

运行结果:

tom@raspberrypi:~/test/strstr $ time ./str3 t.txt "CJK COMPATIBILITY IDEOGRAPH"
Occurrences of 'CJK COMPATIBILITY IDEOGRAPH': 1002

real    0m0.002s
user    0m0.000s
sys     0m0.002s

好吧,比自己写的快了差不多10倍。。。

tom@raspberrypi:~/test/strstr $ time grep "CJK COMPATIBILITY IDEOGRAPH" t.txt -c
1002

real    0m0.003s
user    0m0.003s
sys     0m0.000s

比grep也快了差不多1ms。。。

6 后记

|      | 暴力str | KMP  | grep | strstr |
| ---- | ------- | ---- | ---- | ------ |
| 耗时 | 22ms    | 23ms | 3ms  | 2ms    |

本来还想详细学学KMP,通过横评可以看出,效率的关键还真不在于算法。不过grep和strstr都是开源的,回头有时间再学习学习。。。

有时候真的要感叹,刷题是一回事,工程实践又是一回事。。。

评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值