memchr SSE 加速

本文介绍了一种使用SSE指令集优化memchr函数的方法,通过对比标准memchr函数,展示了该优化版本在性能上的显著提升。测试环境为Intel Xeon E3-1230处理器。

memchr_sse.s

/*
 * void *memchr_sse(const void *s, int c, size_t n)
 *
 * Syntax: GNU as, AT&T operand order.  Target: x86-64 ELF, SSE2.
 * ABI:    System V AMD64, leaf function, no stack usage.
 *
 * In:     %rdi = s
 *         %esi = c   (only the low byte is used; it is broadcast to all
 *                     16 lanes of %xmm1)
 *         %rdx = n   (number of bytes to examine)
 * Out:    %rax = address of the first byte equal to c in s[0..n),
 *                or 0 when there is none (or when n == 0)
 * Clobb:  %rcx, %rdx, %rdi, %xmm0-%xmm4, flags
 *
 * Strategy: after an initial (possibly unaligned) 16-byte probe, the
 * buffer is scanned with aligned 16-byte loads, then in 64-byte steps
 * once %rdi is 64-byte aligned.  Aligned loads may read bytes that lie
 * outside s[0..n) but inside the same aligned 16-byte block; matches
 * found there are rejected by the length checks in the *_1 exit paths.
 *
 * Throughout the scan %rdx holds "bytes remaining from %rdi" minus the
 * amount already committed for the next step, so every "sub ; jbe"
 * pair doubles as an end-of-buffer test.
 */
	.text
	.globl	memchr_sse
	.type	memchr_sse, @function
	/* .p2align takes a power of two: 16-byte alignment, NOP (0x90) fill. */
	.p2align 4, 0x90
memchr_sse:
	movd	%esi, %xmm1
	mov	%rdi, %rcx

	punpcklbw %xmm1, %xmm1		/* byte  -> word  */
	test	%rdx, %rdx
	jz	.Lreturn_null		/* n == 0: nothing to search */
	punpcklbw %xmm1, %xmm1		/* word  -> dword */

	and	$63, %rcx		/* rcx = offset of s inside its 64-byte line */
	pshufd	$0, %xmm1, %xmm1	/* dword -> all 16 bytes of xmm1 = c */

	/* If a 16-byte load from s would leave the 64-byte line, start
	   from the aligned block containing s instead.  */
	cmp	$48, %rcx
	ja	.Lcrosscache

	movdqu	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax

	jnz	.Lmatches_1
	sub	$16, %rdx
	jbe	.Lreturn_null		/* n <= 16 and no match in the probe */
	add	$16, %rdi
	and	$15, %rcx
	and	$-16, %rdi		/* rdi = next 16-byte aligned block */
	add	%rcx, %rdx		/* re-add bytes covered twice; cannot wrap:
					   rdx <= SIZE_MAX - 16 and rcx <= 15 */
	sub	$64, %rdx
	jbe	.Lexit_loop
	jmp	.Lloop_prolog

	.p2align 4
.Lcrosscache:
	and	$15, %rcx		/* rcx = misalignment of s within 16 bytes */
	and	$-16, %rdi
	movdqa	(%rdi), %xmm0

	pcmpeqb	%xmm1, %xmm0
	/* Check if there is a match.  */
	pmovmskb %xmm0, %eax
	/* Remove the leading bytes (those before s).  */
	sar	%cl, %eax
	test	%eax, %eax
	je	.Lunaligned_no_match
	/* Check which byte is a match.  */
	bsf	%eax, %eax

	sub	%rax, %rdx
	jbe	.Lreturn_null		/* match lies at or beyond s + n */
	add	%rdi, %rax
	add	%rcx, %rax
	ret

	.p2align 4
.Lunaligned_no_match:
	/* Bytes left after the first aligned block: n - (16 - rcx).
	   Computed in this order because "n + rcx - 16" wraps around for
	   n close to SIZE_MAX and would wrongly report "not found".  */
	neg	%rcx
	add	$16, %rcx		/* rcx = 16 - misalignment, in 1..16 */
	sub	%rcx, %rdx
	jbe	.Lreturn_null
	add	$16, %rdi
	sub	$64, %rdx
	jbe	.Lexit_loop

	.p2align 4
.Lloop_prolog:
	/* At least 64 bytes remain: four unconditional 16-byte probes.  */
	movdqa	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	.Lmatches

	movdqa	16(%rdi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	.Lmatches16

	movdqa	32(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	.Lmatches32

	movdqa	48(%rdi), %xmm4
	pcmpeqb	%xmm1, %xmm4
	add	$64, %rdi
	pmovmskb %xmm4, %eax
	test	%eax, %eax
	jnz	.Lmatches0		/* rdi already advanced: offset is -16 */

	test	$0x3f, %rdi
	jz	.Lalign64_loop		/* already 64-byte aligned */

	sub	$64, %rdx
	jbe	.Lexit_loop

	/* Second straight-line group of 64 bytes before aligning down.  */
	movdqa	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	.Lmatches

	movdqa	16(%rdi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	.Lmatches16

	movdqa	32(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	.Lmatches32

	movdqa	48(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax

	add	$64, %rdi
	test	%eax, %eax
	jnz	.Lmatches0

	/* Round rdi down to a 64-byte boundary and give the re-scanned
	   bytes back to the remaining count.  */
	mov	%rdi, %rcx
	and	$-64, %rdi
	and	$63, %rcx
	add	%rcx, %rdx

	.p2align 4
.Lalign64_loop:
	sub	$64, %rdx
	jbe	.Lexit_loop
	movdqa	(%rdi), %xmm0
	movdqa	16(%rdi), %xmm2
	movdqa	32(%rdi), %xmm3
	movdqa	48(%rdi), %xmm4

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm1, %xmm2
	pcmpeqb	%xmm1, %xmm3
	pcmpeqb	%xmm1, %xmm4

	/* OR the four compare results together (0x00/0xff lanes) so one
	   mask test covers all 64 bytes.  xmm3/xmm4 are clobbered here.  */
	pmaxub	%xmm0, %xmm3
	pmaxub	%xmm2, %xmm4
	pmaxub	%xmm3, %xmm4
	pmovmskb %xmm4, %eax

	add	$64, %rdi

	test	%eax, %eax
	jz	.Lalign64_loop

	/* A match exists in the last 64 bytes: find which 16-byte block.  */
	sub	$64, %rdi

	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	.Lmatches

	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	.Lmatches16

	/* xmm3 was overwritten by pmaxub above, so recompute it.  */
	movdqa	32(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3

	pcmpeqb	48(%rdi), %xmm1		/* xmm1 is no longer needed as the pattern */
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	.Lmatches32

	pmovmskb %xmm1, %eax
	bsf	%eax, %eax
	lea	48(%rdi, %rax), %rax
	ret

	.p2align 4
.Lexit_loop:
	/* Entered with rdx = remaining - 64, i.e. in -63..0.  The signed
	   jle is intentional: it tests "remaining <= 32".  */
	add	$32, %rdx
	jle	.Lexit_loop_32

	/* 33..64 bytes remain: the first two blocks are fully in range.  */
	movdqa	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	.Lmatches

	movdqa	16(%rdi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	.Lmatches16

	movdqa	32(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	.Lmatches32_1		/* rdx = remaining - 32: bounds-checked there */
	sub	$16, %rdx
	jle	.Lreturn_null

	pcmpeqb	48(%rdi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	.Lmatches48_1
	xor	%eax, %eax
	ret

	.p2align 4
.Lexit_loop_32:
	add	$32, %rdx		/* rdx = remaining, in 1..32 */
	movdqa	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	.Lmatches_1
	sub	$16, %rdx
	jbe	.Lreturn_null

	pcmpeqb	16(%rdi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	.Lmatches16_1
	xor	%eax, %eax
	ret

	/* Exit paths without a length check: the matched block is known
	   to lie entirely inside the buffer.  */
	.p2align 4
.Lmatches0:
	bsf	%eax, %eax
	lea	-16(%rax, %rdi), %rax	/* block at 48 of the group; rdi is +64 */
	ret

	.p2align 4
.Lmatches:
	bsf	%eax, %eax
	add	%rdi, %rax
	ret

	.p2align 4
.Lmatches16:
	bsf	%eax, %eax
	lea	16(%rax, %rdi), %rax
	ret

	.p2align 4
.Lmatches32:
	bsf	%eax, %eax
	lea	32(%rax, %rdi), %rax
	ret

	/* Exit paths with a length check: rdx = bytes of the matched block
	   that are still inside the buffer; a match index >= rdx is out
	   of range and yields NULL.  */
	.p2align 4
.Lmatches_1:
	bsf	%eax, %eax
	sub	%rax, %rdx
	jbe	.Lreturn_null
	add	%rdi, %rax
	ret

	.p2align 4
.Lmatches16_1:
	bsf	%eax, %eax
	sub	%rax, %rdx
	jbe	.Lreturn_null
	lea	16(%rdi, %rax), %rax
	ret

	.p2align 4
.Lmatches32_1:
	bsf	%eax, %eax
	sub	%rax, %rdx
	jbe	.Lreturn_null
	lea	32(%rdi, %rax), %rax
	ret

	.p2align 4
.Lmatches48_1:
	bsf	%eax, %eax
	sub	%rax, %rdx
	jbe	.Lreturn_null
	lea	48(%rdi, %rax), %rax
	ret

	.p2align 4
.Lreturn_null:
	xor	%eax, %eax		/* also clears the upper half of rax */
	ret
	.size	memchr_sse, .-memchr_sse

	/* Mark the object as not needing an executable stack.  */
	.section .note.GNU-stack,"",@progbits

测试stub

stub.c

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <stdint.h>
#include "common.h"
extern void *memchr_sse(const void *s, int c, size_t n);
/*
 * Benchmark driver: fills a 1 KiB buffer with 'A', plants a single '\r'
 * at index 1022, then measures one search for it between two
 * get_cycle_count() readings and prints the match offset and the
 * difference of the two readings.
 *
 * Swap the commented-out line with the memchr() call to measure
 * memchr_sse() instead of the C library routine.
 *
 * NOTE(review): get_cycle_count() comes from "common.h", which is not
 * shown here; it is assumed to return a value that fits in uint64_t.
 *
 * Always returns 0; prints nothing when the byte is not found.
 */
int main(int argc, char **argv)
{
	char text[1024];
	void *result = NULL;
	uint64_t begin, end;

	/* Command-line arguments are not used. */
	(void)argc;
	(void)argv;

	/* memset covers the whole array, so no separate zero-init is needed. */
	memset(text, 'A', sizeof(text));
	text[1022] = '\r';
	begin = get_cycle_count();
	//result = memchr_sse(text, '\r', sizeof(text));
	result = memchr(text, '\r', sizeof(text));
	end = get_cycle_count();

	if (result) {
		/*
		 * %td matches the ptrdiff_t produced by the char-pointer
		 * subtraction; the cycle delta is widened explicitly so the
		 * conversion is correct regardless of how uint64_t is defined.
		 */
		printf("result @ %td cost %llu\n",
		       (char *)result - text,
		       (unsigned long long)(end - begin));
	}
	return 0;
}


编译

gcc -march=corei7 -O3 memchr_sse.s stub.c -o stub

测试平台:

Intel(R) Xeon(R) CPU E31230 @ 3.20GHz

memchr 测试结果

# ./stub
result @ 1022 cost 1404
# ./stub
result @ 1022 cost 1600
# ./stub
result @ 1022 cost 1452
# ./stub
result @ 1022 cost 1388
# ./stub
result @ 1022 cost 1440

memchr_sse 测试结果
# ./stub
result @ 1022 cost 524
# ./stub
result @ 1022 cost 568
# ./stub
result @ 1022 cost 572
# ./stub
result @ 1022 cost 612
# ./stub
result @ 1022 cost 524
# ./stub
result @ 1022 cost 520


int swIpiproxyPrintStrParse(char *showInfo, char *printStr, int printStrLen, int aligned) { if (!showInfo || !printStr || printStrLen <= 0) { return ERROR; } const char ethStr[] = "tp-eth."; const char loStr[] = "lo."; const int ethLen = 7; // strlen("tp-eth.") == 7 const int loLen = 3; // strlen("lo.") == 3 int len = (int)strlen(showInfo); int pos = 0; // 当前在 showInfo 中的位置 int outPos = 0; // 当前在 printStr 中写入的位置 char tempBuf[32]; // 足够存放 "VLAN1234", "Po500", "Loopback10" 等 while (pos < len && outPos < printStrLen) { // 处理换行符 if (showInfo[pos] == '\n') { if (outPos + 2 >= printStrLen) break; printStr[outPos++] = '\r'; printStr[outPos++] = '\n'; pos++; continue; } // 匹配 tp-eth. if (pos + ethLen <= len && strncmp(showInfo + pos, ethStr, ethLen) == 0) { const char *numStart = showInfo + pos + ethLen; char *numEnd; long vlanId = strtol(numStart, &numEnd, 10); int numLen = (int)(numEnd - numStart); // 必须成功解析数字且至少一位 if (numLen == 0 || vlanId < 0 || vlanId > 4095) { printStr[outPos++] = showInfo[pos++]; continue; } NETIF_ID netifid = {0}; if (swipiProxyNetIfIdGetByVid((int)vlanId, &netifid) != ERR_NO_ERROR) { return ERROR; } // 构造替换字符串 switch (netifid.netIftype) { case NETIF_TYPE_VLAN: snprintf(tempBuf, sizeof(tempBuf), "VLAN%d", netifid.netIfData.vlanId); break; case NETIF_TYPE_ROUTED_PORT: usUp2Str(netifid.netIfData.userPort, tempBuf, "UP"); break; case NETIF_TYPE_PORT_CHANNEL: snprintf(tempBuf, sizeof(tempBuf), "Po%d", netifid.netIfData.portChannelId); break; default: snprintf(tempBuf, sizeof(tempBuf), "unknown"); break; } int replaceLen = (int)strlen(tempBuf); int matchLen = ethLen + numLen; // 实际匹配长度 // 写入替换内容 if (outPos + replaceLen >= printStrLen) break; memcpy(printStr + outPos, tempBuf, replaceLen); outPos += replaceLen; // 对齐:补空格 if (aligned) { int padSpace = matchLen - replaceLen; for (int i = 0; i < padSpace && outPos < printStrLen - 1; i++) { printStr[outPos++] = ' '; } } pos += matchLen; continue; } // 匹配 lo. 
if (pos + loLen <= len && strncmp(showInfo + pos, loStr, loLen) == 0) { const char *numStart = showInfo + pos + loLen; char *numEnd; long loopbackId = strtol(numStart, &numEnd, 10); int numLen = (int)(numEnd - numStart); if (numLen == 0 || loopbackId < 0 || loopbackId > 1023) { printStr[outPos++] = showInfo[pos++]; continue; } int matchLen = loLen + numLen; snprintf(tempBuf, sizeof(tempBuf), "Loopback%ld", loopbackId); int replaceLen = (int)strlen(tempBuf); // 写入替换内容 if (outPos + replaceLen >= printStrLen) break; memcpy(printStr + outPos, tempBuf, replaceLen); outPos += replaceLen; pos += matchLen; if (aligned) { int skipSpace = replaceLen - matchLen; for (int i = 0; i < skipSpace && pos < len && showInfo[pos] == ' '; i++) { pos++; } } continue; } // 普通字符直接复制 printStr[outPos++] = showInfo[pos++]; } // 安全终止输出字符串(如果空间允许) if (outPos < printStrLen) { printStr[outPos] = '\0'; } else if (printStrLen > 0) { printStr[printStrLen - 1] = '\0'; // 强制截断 } return ERR_NO_ERROR; } 针对上述代码功能场景,有哪些算法可以优化匹配算法的时间复杂度,优化效率,请帮我例举并比较算法效率和时间复杂度
最新发布
10-10
评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值