正文
主要参考了 GitHub 上“把段落拆分成句子”的这个例子。
不过它存在一个问题,拆分之后,句末的标点符号没有保留。
所以需要优化。
优化的方式:使用正则表达式里的lookbehind
参考:【JS】split分割并保留分隔符(火狐浏览器不支持)
参考:值得深入学习的正则表达式先行断言(lookahead)和后行断言(lookbehind)
扩展
- 在微信小程序中实现分词的方案(未实践)
  1. 求教:如何在小程序里面实现诸如 Python 中 jieba 的中文分词功能?
- 你可能不知道的字符串分割技巧:用的是 Intl API,小程序里用不了。
代码
onLoad: function (options) {
let a = this.sentenceSplit(
"Twenty percent impulsive clown, twenty percent professional nonconformist, sixty percent brilliant physicist, Feynman strived to be a great performer almost as much as to be a great physicist? 35858@11.com"
);
console.log(a)
},
//主函数
sentenceSplit(originSentence) {
//占位中间字符串
let BOOK_PLACEHOLDER = "《%-#-@#》";
let QUOTE_PLACEHOLDER = "“%-#-@#”。";
let QUOTE_PLACEHOLDER_2 = "“%-#-@#”";
if (originSentence === "") {
console.log("this sentence is null");
return null;
}
var bookList = this.getBookQuoteList('book',originSentence);
var quoteList = this.getBookQuoteList('quote',originSentence);
//以下两个for循环用作处理引号与书名所包含的endSymbol规则的中间代码
var endSymbol = /[。!!??…]+/;
for (var i = 0; i < bookList.length; i++) {
if (endSymbol.test(bookList[i])) {
// console.log(bookList[i] + "-----是个假句子");
originSentence = originSentence.replace(bookList[i], BOOK_PLACEHOLDER);
} else {
// console.log(bookList[i] + "------不是假句子");
}
}
for (var i = 0; i < quoteList.length; i++) {
if (endSymbol.test(quoteList[i])) {
originSentence = originSentence.replace(quoteList[i], QUOTE_PLACEHOLDER);
}
}
//开始分句
console.log(originSentence)
const reg = /(?<=\。|\。”|\!”|\\.”|\\.’|\?”|\! |\?|\: |\; |\?|\!|\. )/
console.log(reg)
let sentenceList = originSentence.split(reg);
console.log(sentenceList);
var tempList = this.restoreBookAndQuote(bookList, sentenceList, BOOK_PLACEHOLDER);
// console.log(tempList);
tempList = this.restoreBookAndQuote(quoteList, sentenceList, QUOTE_PLACEHOLDER_2);
sentenceList = tempList;
// console.log(tempList);
return sentenceList
},
//获取书名号引号包含的字符串数组
getBookQuoteList(type, sentence) {
if (type == 'book') {
var pat = new RegExp("《([^《|》]*)》", "g");
} else {
var pat = new RegExp('(".*?")|(“.*?”)', "g");
}
let results = [];
do {
var res = pat.exec(sentence);
if (res) {
results.push(res[0]);
}
} while (res);
return results;
},
//还原书名号与引号中内容
restoreBookAndQuote(List, sentenceList, BookOrQuotePlaceHolder) {
var endSymbol = /[。!!??…]+/;
for (var i = 0; i < List.length; i++) {
if (endSymbol.test(List[i])) {
for (var j = 0; j < sentenceList.length; j++) {
if (sentenceList[j].indexOf(BookOrQuotePlaceHolder) !== -1) {
console.log(sentenceList[j]);
let tempStr = sentenceList[j].replace(
BookOrQuotePlaceHolder,
List[i]
);
sentenceList.splice(j, 1, tempStr);
break;
}
}
}
}
return sentenceList;
},