1 前言
很久之前,大概毕业一两年的时候,当时对写算法有点兴趣,于是自己写了一个strstr,用C写的,想破头也没什么特别的算法,最后也差不多是遍历,找到第一个字符后依次匹配。写了两天差不多完工,但是效率比当时MFC自带的函数差了非常远,大概好几倍吧。
所以后面也放弃了这种自己写库函数的想法了。
最近看leetcode,看到KMP算法,所以也想学一下。找了一个普通的txt文件,大概是2M多。找寻里面的NFC关键字。运行设备是树莓派5(4GB版本)。
2 grep
tom@raspberrypi:~/test/strstr $ time grep NFC t.txt
# source; NFC; NFD; NFKC; NFKD
# NFC
# c2 == toNFC(c1) == toNFC(c2) == toNFC(c3)
# c4 == toNFC(c4) == toNFC(c5)
# X == toNFC(X) == toNFD(X) == toNFKC(X) == toNFKD(X)
# All characters not explicitly occurring in c1 of Part 1 have identical NFC, D, KC, KD forms.
real 0m0.005s
user 0m0.005s
sys 0m0.000s
使用最常见的grep,耗时大概是5ms。
3 常规的遍历
代码如下:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Naive substring search.
// Returns a pointer to the first occurrence of needle inside haystack,
// or NULL when needle does not occur. An empty needle matches at the
// very start of haystack.
char *my_strstr(const char *haystack, const char *needle) {
    if (!*needle) return (char *)haystack;
    for (; *haystack; haystack++) {
        const char *h = haystack;
        const char *n = needle;
        // *n != 0 together with *h == *n already implies *h != 0,
        // so no separate end-of-haystack test is needed here.
        while (*n && *h == *n) {
            h++;
            n++;
        }
        // The whole needle was consumed: match starts at haystack.
        if (!*n) return (char *)haystack;
    }
    return NULL;
}
// Counts how many times word occurs in text, overlapping matches included.
// Returns 0 for an empty word.
int count_occurrences(const char *text, const char *word) {
    // An empty word matches at every position, including the terminating
    // '\0'; stepping past that last match would read beyond the buffer.
    if (!*word) return 0;
    int count = 0;
    const char *ptr = my_strstr(text, word);
    while (ptr != NULL) {
        count++;
        // word is non-empty, so *ptr is a real character and ptr + 1
        // is still inside the string (at worst on its terminator).
        ptr = my_strstr(ptr + 1, word);
    }
    return count;
}
// Usage: <prog> <file> <string>
// Loads the whole file into memory and prints how many times <string>
// occurs in it. Returns 0 on success, 1 on any usage or I/O error.
int main(int argc, char *argv[]) {
    if (argc != 3) {
        fprintf(stderr, "Usage: %s <file> <string>\n", argv[0]);
        return 1;
    }
    const char *filename = argv[1];
    const char *word = argv[2];
    // Binary mode: the size reported by ftell is a byte count, so read bytes.
    FILE *file = fopen(filename, "rb");
    if (!file) {
        perror("Error opening file");
        return 1;
    }
    if (fseek(file, 0, SEEK_END) != 0) {
        perror("Error seeking file");
        fclose(file);
        return 1;
    }
    long file_size = ftell(file);
    // ftell returns -1 on failure; using it unchecked would size the
    // buffer wrongly and write the terminator before the allocation.
    if (file_size < 0) {
        perror("Error getting file size");
        fclose(file);
        return 1;
    }
    rewind(file);
    char *buffer = malloc((size_t)file_size + 1);
    if (!buffer) {
        perror("Memory allocation failed");
        fclose(file);
        return 1;
    }
    size_t bytes_read = fread(buffer, 1, (size_t)file_size, file);
    if (bytes_read != (size_t)file_size && ferror(file)) {
        perror("Error reading file");
        free(buffer);
        fclose(file);
        return 1;
    }
    // Terminate after the bytes actually read so that no uninitialized
    // memory is ever searched.
    buffer[bytes_read] = '\0';
    fclose(file);
    int occurrences = count_occurrences(buffer, word);
    printf("Occurrences of '%s': %d\n", word, occurrences);
    free(buffer);
    return 0;
}
耗时如下:
tom@raspberrypi:~/test/strstr $ time ./a.out t.txt NFC
Occurrences of 'NFC': 9
real 0m0.022s
user 0m0.022s
sys 0m0.000s
大概是22毫秒,比grep慢了4倍多,而且还没有像grep那样输出匹配到的行。
4 KMP算法
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Build the KMP prefix table for pattern[0..M-1].
// After the call, lps[k] is the length of the longest proper prefix of
// pattern[0..k] that is also a suffix of it. lps must have room for M
// entries; lps[0] is written unconditionally.
void computeLPSArray(const char *pattern, int M, int *lps) {
    int border = 0;   // length of the border of the prefix ending at pos - 1
    lps[0] = 0;
    for (int pos = 1; pos < M; pos++) {
        // Fall back through shorter borders until one can be extended
        // by pattern[pos], or none is left.
        while (border > 0 && pattern[pos] != pattern[border]) {
            border = lps[border - 1];
        }
        if (pattern[pos] == pattern[border]) {
            border++;
        }
        lps[pos] = border;
    }
}
// Count occurrences of pattern in text using KMP.
// Overlapping matches are counted. Returns 0 for an empty pattern and
// -1 (after printing a message) when the prefix table cannot be allocated.
int KMP_search(const char *text, const char *pattern) {
    // size_t for the text length and index: an int would truncate
    // for inputs of 2 GiB or more.
    size_t N = strlen(text);
    int M = (int)strlen(pattern);
    if (M == 0) return 0;
    int *lps = malloc((size_t)M * sizeof *lps);
    if (!lps) {
        perror("Memory allocation failed");
        return -1;
    }
    computeLPSArray(pattern, M, lps);
    size_t i = 0;   // position in text
    int j = 0;      // number of pattern characters currently matched
    int count = 0;
    while (i < N) {
        if (pattern[j] == text[i]) {
            i++;
            j++;
            if (j == M) {
                count++;
                // Keep the longest border so overlapping matches are found.
                j = lps[j - 1];
            }
        } else if (j != 0) {
            // Mismatch after a partial match: reuse the border, do not
            // advance in the text.
            j = lps[j - 1];
        } else {
            i++;
        }
    }
    free(lps);
    return count;
}
// Usage: <prog> <file> <string>
// Loads the whole file into memory and prints how many times <string>
// occurs in it according to KMP_search. Returns 0 on success, 1 on any
// usage, I/O or allocation error.
int main(int argc, char *argv[]) {
    if (argc != 3) {
        fprintf(stderr, "Usage: %s <file> <string>\n", argv[0]);
        return 1;
    }
    const char *filename = argv[1];
    const char *word = argv[2];
    // Binary mode: the size reported by ftell is a byte count, so read bytes.
    FILE *file = fopen(filename, "rb");
    if (!file) {
        perror("Error opening file");
        return 1;
    }
    if (fseek(file, 0, SEEK_END) != 0) {
        perror("Error seeking file");
        fclose(file);
        return 1;
    }
    long file_size = ftell(file);
    // ftell returns -1 on failure; using it unchecked would size the
    // buffer wrongly and write the terminator before the allocation.
    if (file_size < 0) {
        perror("Error getting file size");
        fclose(file);
        return 1;
    }
    rewind(file);
    char *buffer = malloc((size_t)file_size + 1);
    if (!buffer) {
        perror("Memory allocation failed");
        fclose(file);
        return 1;
    }
    size_t bytes_read = fread(buffer, 1, (size_t)file_size, file);
    if (bytes_read != (size_t)file_size && ferror(file)) {
        perror("Error reading file");
        free(buffer);
        fclose(file);
        return 1;
    }
    // Terminate after the bytes actually read so that no uninitialized
    // memory is ever searched.
    buffer[bytes_read] = '\0';
    fclose(file);
    int occurrences = KMP_search(buffer, word);
    free(buffer);
    // KMP_search reports its own allocation failure with a negative
    // value; do not print that as a match count.
    if (occurrences < 0) {
        return 1;
    }
    printf("Occurrences of '%s': %d\n", word, occurrences);
    return 0;
}
运行结果:
tom@raspberrypi:~/test/strstr $ time ./str2 t.txt NFC
Occurrences of 'NFC': 9
real 0m0.023s
user 0m0.019s
sys 0m0.004s
好吧。。。效率并没有提升,反而还慢了1ms。。。
换了长一些的匹配字依旧。。。
tom@raspberrypi:~/test/strstr $ time ./str1 t.txt "CJK COMPATIBILITY IDEOGRAPH"
Occurrences of 'CJK COMPATIBILITY IDEOGRAPH': 1002
real 0m0.023s
user 0m0.019s
sys 0m0.004s
tom@raspberrypi:~/test/strstr $ time ./str2 t.txt "CJK COMPATIBILITY IDEOGRAPH"
Occurrences of 'CJK COMPATIBILITY IDEOGRAPH': 1002
real 0m0.024s
user 0m0.020s
sys 0m0.004s
5 glibc的strstr
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
// Counts how many times pattern occurs in text using the C library's
// strstr, overlapping matches included. Returns 0 for an empty pattern.
int count_occurrences(const char *text, const char *pattern) {
    // strstr with an empty pattern succeeds at every position, including
    // the terminating '\0'; stepping past that match would read beyond
    // the end of text.
    if (*pattern == '\0') return 0;
    int count = 0;
    const char *pos = strstr(text, pattern);
    while (pos != NULL) {
        count++;
        // Advance by one character only, so overlapping matches count.
        pos = strstr(pos + 1, pattern);
    }
    return count;
}
// Usage: <prog> <file> <string>
// Loads the file into a NUL-terminated buffer and prints how many times
// <string> occurs in it. Returns 0 on success, 1 on any usage, I/O or
// allocation error.
int main(int argc, char *argv[]) {
    if (argc != 3) {
        fprintf(stderr, "Usage: %s <file> <string>\n", argv[0]);
        return 1;
    }
    const char *filename = argv[1];
    const char *word = argv[2];
    int fd = open(filename, O_RDONLY);
    if (fd == -1) {
        perror("Error opening file");
        return 1;
    }
    struct stat sb;
    if (fstat(fd, &sb) == -1) {
        perror("Error getting file size");
        close(fd);
        return 1;
    }
    // strstr needs a NUL-terminated string. A read-only file mapping has
    // no terminator of its own: when the file size is an exact multiple
    // of the page size the search runs off the mapping, and an empty file
    // cannot be mapped at all. Read into a heap buffer with one extra
    // byte for the terminator instead.
    size_t file_size = (size_t)sb.st_size;
    char *buffer = malloc(file_size + 1);
    if (!buffer) {
        perror("Memory allocation failed");
        close(fd);
        return 1;
    }
    size_t total = 0;
    while (total < file_size) {
        ssize_t got = read(fd, buffer + total, file_size - total);
        if (got < 0) {
            if (errno == EINTR) continue;   // interrupted, retry
            perror("Error reading file");
            free(buffer);
            close(fd);
            return 1;
        }
        if (got == 0) break;                // file is shorter than fstat said
        total += (size_t)got;
    }
    close(fd);
    buffer[total] = '\0';
    int occurrences = count_occurrences(buffer, word);
    printf("Occurrences of '%s': %d\n", word, occurrences);
    free(buffer);
    return 0;
}
运行结果:
tom@raspberrypi:~/test/strstr $ time ./str3 t.txt "CJK COMPATIBILITY IDEOGRAPH"
Occurrences of 'CJK COMPATIBILITY IDEOGRAPH': 1002
real 0m0.002s
user 0m0.000s
sys 0m0.002s
好吧,比自己写的快了差不多10倍。。。
tom@raspberrypi:~/test/strstr $ time grep "CJK COMPATIBILITY IDEOGRAPH" t.txt -c
1002
real 0m0.003s
user 0m0.003s
sys 0m0.000s
比grep也快了差不多1ms。。。
6 后记
| | 暴力strstr | KMP | grep | strstr |
| --- | --- | --- | --- | --- |
| 耗时 | 22ms | 23ms | 3ms | 2ms |
本来还想详细学学KMP,通过横评可以看出,效率的关键还真不在于算法。不过grep和strstr都是开源的,回头有时间再学习学习。。。
有时候真的要感叹,刷题是一回事,工程实践又是一回事。。。
24万+

被折叠的 条评论
为什么被折叠?



