PCRE简单应用

本文介绍了一种使用PCRE正则表达式对文本进行断句的方法,通过匹配特定的标点符号来实现文本的精确分割。该方法能够有效处理包含多种标点符号的文本,并展示了完整的实现代码(C风格的C++程序,使用了std::string和std::vector)。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

逐渐体会到PCRE的强大之处了,正则表达式确实非常好用。下面的程序对文本逐行进行断句,断句所用的分隔符是标点符号和空白字符(逗号、句号、空格、制表符、引号、分号),代码如下:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "pcre.h"
#include <unistd.h>
#include <iostream>
#include <string>
#include <vector>

#include "ul_ccode.h" 
#include "ul_conf.h" 
#include "ul_log.h" 

// Size, in bytes, of every fixed character buffer used by main(): the regex
// pattern, the scratch buffer, and the per-line working buffers. It is also
// the maximum chunk size passed to fgets.
#define N 10240
// Number of ints in the offset vector handed to pcre_exec.
// NOTE(review): the PCRE documentation asks for a multiple of 3 here; 30 is one.
#define M 30

int main(int argc, char *argv[])
{
    if(argc < 3)
    {
        printf("Usage : %s infile outfile\n", argv[0]);
        exit(-1);
    }

    FILE *fp_in;
    FILE *fp_out;

    fp_in = fopen(argv[1], "rb");
    fp_out = fopen(argv[2], "wb");
    if(NULL == fp_in || NULL == fp_out)
    {
        printf("FILE open failure\n");
        exit(-1);
    }

    const char *error;
    pcre *re;
    int erroffset;
    int ovector[M];
    int rc = 0, i = 0;
    char buffer[N];
    memset(buffer, 0, N);

    // pattern
    //char pattern[N] = "(0|1|2|3|4|5|6|7|8|9){5,}";
    char pattern[N] = "(,|。| |\t|“|”|;){1,}";
    printf("%s\n", pattern);
    ul_trans2bj(pattern, buffer);
    ul_trans2lower(buffer, pattern);
    printf("%s\n", pattern);

    re = pcre_compile(pattern, 0, &error, &erroffset, NULL);
    if(NULL == re)
    {
        printf("PCRE compilation failed at offset %d: %s\n", erroffset, error);
        exit(-1);
    }

    char line[N], line1[N], line2[N], line_out[N];
    char *p1 = NULL, *p2 = NULL;
    int len = 0;
    int num = 0;
    std::string str;
    std::vector<std::string> svec;
    std::vector<std::string>::iterator iter;

    while(fgets(line, N, fp_in))
    {
        //strncpy(line1, line, N);
        len = strlen(line);
        while((line[len - 1] == '\n' || line[len - 1] == ' ' || line[len - 1] == '\t') && len > 0)
            -- len;

        line[len] = '\0';

        printf("%s\n", line);

        ul_trans2bj(line, line1);
        //printf("%s\n", line1);
        ul_trans2lower(line1, line2);
        //printf("%s\n", line2);
        p2 = &line2[0];
        svec.clear();
        while(p2 != NULL)
        {
            p1 = p2;

            rc = pcre_exec(re, NULL, p2, strlen(p2), 0, 0, ovector, M);
            if(rc < 0)
            {
                //printf("NO match...\n");
                //fprintf(fp_out, "%s\n", line1);
                //printf("%s\n", p2);
                strncpy(line_out, p2, strlen(p2));
                line_out[strlen(p2)] = '\0';
                str = std::string(line_out);
                svec.push_back(str);

                p2 = NULL;
                continue;
            }

            len = strlen(p2);
            if(ovector[0] > 0)
            {
                strncpy(line_out, p2, ovector[0]);
                line_out[ovector[0]] = '\0';
                str = std::string(line_out);
                svec.push_back(str);
            }

            if(ovector[1] < len)
            {
                p2 = p1 + ovector[1];
                //strncpy(line_out, p2, len - ovector[1]);
                //line_out[len - ovector[1]] = '\0';
                //str = std::string(line_out);
                //svec.push_back(str);
            }else{
                p2 = NULL;
                continue;
            }
        }
        if(svec.size() > 0)
        {
            for(iter = svec.begin(); iter != svec.end(); ++ iter)
                printf("%s\n", (*iter).c_str());
        }
        //strncpy(line1, line, N);
        //if(0 == len)
        //    continue;
        //line[len] = '\0';
        //strncpy(line1, line, N);
        fprintf(fp_out, "%s\n", line2);

    }
    fclose(fp_in);
    fclose(fp_out);
    pcre_free(re);
    return 0;
}

代码中用到的两个辅助函数的含义如下:
ul_trans2bj(pattern, buffer);
//将字符串转为半角字符串
ul_trans2lower(buffer, pattern);
//转为小写字符串

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值