【微信小程序+js】分割字符串,把段落拆成句子的实践

文章讨论了一个GitHub上的例子,该例子将段落拆分成句子但丢失了句末标点。作者提出使用正则表达式的lookbehind来优化这个问题,以便在拆分句子时保留标点。此外,还探讨了在微信小程序中实现类似 Python jieba 的中文分词功能的方案,包括利用 Intl API 和处理书名及引号中的内容。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

正文

主要参考了github段落拆成句子这个例子。
不过它存在一个问题,拆分之后,句末的标点符号没有保留。
所以需要优化。
优化的方式:使用正则表达式里的lookbehind
参考:【JS】split分割并保留分隔符(火狐浏览器不支持)

参考:可以深入学习一下的正则表达式的先行断言(lookahead)和后行断言(lookbehind)

扩展

代码

onLoad: function (options) {
    let a = this.sentenceSplit(
      "Twenty percent impulsive clown, twenty percent professional nonconformist, sixty percent brilliant physicist, Feynman strived to be a great performer almost as much as to be a great physicist? 35858@11.com"
    );
    console.log(a)
  },

  //主函数
  sentenceSplit(originSentence) {
    //占位中间字符串
    let BOOK_PLACEHOLDER = "《%-#-@#》";
    let QUOTE_PLACEHOLDER = "“%-#-@#”。";
    let QUOTE_PLACEHOLDER_2 = "“%-#-@#”";

    if (originSentence === "") {
      console.log("this sentence is null");
      return null;
    }

    var bookList = this.getBookQuoteList('book',originSentence);
    var quoteList = this.getBookQuoteList('quote',originSentence);

    //以下两个for循环用作处理引号与书名所包含的endSymbol规则的中间代码
    var endSymbol = /[。!!??…]+/;
    for (var i = 0; i < bookList.length; i++) {
      if (endSymbol.test(bookList[i])) {
        //   console.log(bookList[i] + "-----是个假句子");
        originSentence = originSentence.replace(bookList[i], BOOK_PLACEHOLDER);
      } else {
        //   console.log(bookList[i] + "------不是假句子");
      }
    }
    for (var i = 0; i < quoteList.length; i++) {
      if (endSymbol.test(quoteList[i])) {
        originSentence = originSentence.replace(quoteList[i], QUOTE_PLACEHOLDER);
      }
    }

    //开始分句
    console.log(originSentence)
    const reg = /(?<=\。|\。”|\!”|\\.”|\\.’|\?”|\! |\?|\: |\; |\?|\!|\. )/
    console.log(reg)
    let sentenceList = originSentence.split(reg);

    console.log(sentenceList);
    var tempList = this.restoreBookAndQuote(bookList, sentenceList, BOOK_PLACEHOLDER);
    // console.log(tempList);

    tempList = this.restoreBookAndQuote(quoteList, sentenceList, QUOTE_PLACEHOLDER_2);

    sentenceList = tempList;
    // console.log(tempList);
    return sentenceList
  },


  //获取书名号引号包含的字符串数组
  getBookQuoteList(type, sentence) {
    if (type == 'book') {
      var pat = new RegExp("《([^《|》]*)》", "g");
    } else {
      var pat = new RegExp('(".*?")|(“.*?”)', "g");
    }
    let results = [];
    do {
      var res = pat.exec(sentence);
      if (res) {
        results.push(res[0]);
      }
    } while (res);
    return results;
  },

  //还原书名号与引号中内容
  restoreBookAndQuote(List, sentenceList, BookOrQuotePlaceHolder) {
    var endSymbol = /[。!!??…]+/;
    for (var i = 0; i < List.length; i++) {
      if (endSymbol.test(List[i])) {
        for (var j = 0; j < sentenceList.length; j++) {
          if (sentenceList[j].indexOf(BookOrQuotePlaceHolder) !== -1) {
            console.log(sentenceList[j]);
            let tempStr = sentenceList[j].replace(
              BookOrQuotePlaceHolder,
              List[i]
            );
            sentenceList.splice(j, 1, tempStr);
            break;
          }
        }
      }
    }
    return sentenceList;
  },
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值