拆分字符串string中的词变为数组

本文介绍了一个强大的字符串处理函数,能够智能地将复杂或复合的Unicode和ASCII字符串拆分为单词数组。该函数支持自定义正则表达式模式,适用于多种场景,如文本分析、词频统计等。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

/**
 * 拆分字符串string中的词为数组
 * Splits `string` into an array of its words
 * @param {string} [string=''] The string to inspect
 * @param {RegExp|string} [pattern] The pattern to match words
 * @returns {Array} Returns the words of `string`
 * @example
 * words('fred, barney, & pebbles')
 * // => ['fred', 'barney', 'pebbles']
 * words('fred, barney, & pebbles', /[^, ]+/g)
 * // => ['fred', 'barney', '&', 'pebbles']
 * words("\uD842\uDFB7\uD842\uDFB7")
 * // => ['𠮷', '𠮷']
 * words("--foo-bar--")
 * // => ['foo', 'bar']
 * words("__FOO_BAR__")
 * // => ['FOO', 'BAR']
 */

import toString from "./toString"

// Character ranges, written as regex source text, that get spliced into
// bracketed character classes further down.
const rsAstralRange = "\\ud800-\\udfff"
const rsComboMarksRange = "\\u0300-\\u036f"
const reComboHalfMarksRange = "\\ufe20-\\ufe2f"
const rsComboSymbolsRange = "\\u20d0-\\u20ff"
const rsComboRange = `${rsComboMarksRange}${reComboHalfMarksRange}${rsComboSymbolsRange}`
const rsDingbatRange = "\\u2700-\\u27bf"
const rsLowerRange = "a-z\\xdf-\\xf6\\xf8-\\xff"
const rsMathOpRange = "\\xac\\xb1\\xd7\\xf7"
const rsNonCharRange = "\\x00-\\x2f\\x3a-\\x40\\x5b-\\x60\\x7b-\\xbf"
const rsPunctuationRange = "\\u2000-\\u206f"
const rsSpaceRange =
  " \\t\\x0b\\f\\xa0\\ufeff\\n\\r\\u2028\\u2029\\u1680\\u180e\\u2000\\u2001\\u2002\\u2003\\u2004\\u2005\\u2006\\u2007\\u2008\\u2009\\u200a\\u202f\\u205f\\u3000"
const rsUpperRange = "A-Z\\xc0-\\xd6\\xd8-\\xde"
const rsVarRange = "\\ufe0e\\ufe0f"
// Everything that separates words: math operators, non-alphanumeric
// Latin-1 characters, general punctuation and whitespace.
const rsBreakRange = `${rsMathOpRange}${rsNonCharRange}${rsPunctuationRange}${rsSpaceRange}`

// Single-token patterns built from the ranges above.
const rsApos = "['\u2019]"
const rsAstral = `[${rsAstralRange}]`
const rsBreak = `[${rsBreakRange}]`
const rsCombo = `[${rsComboRange}]`
const rsDigits = "\\d+"
const rsDingbat = `[${rsDingbatRange}]`
const rsLower = `[${rsLowerRange}]`
// `rsDigits` ("\d+") is spliced into the negated class on purpose-preserving
// terms: inside brackets it excludes digits and a literal "+".
const rsMisc = `[^${rsAstralRange}${rsBreakRange}${rsDigits}${rsDingbatRange}${rsLowerRange}${rsUpperRange}]`
const rsFitz = "\\ud83c[\\udffb-\\udfff]"
const rsModifier = `(?:${rsCombo}|${rsFitz})`
const rsNonAstral = `[^${rsAstralRange}]`
const rsRegional = "(?:\\ud83c[\\udde6-\\uddff]){2}"
const rsSurrPair = "[\\ud800-\\udbff][\\udc00-\\udfff]"
const rsUpper = `[${rsUpperRange}]`
const rsZWJ = "\\u200d"

// Larger fragments assembled from the single-token patterns.
const rsMiscLower = `(?:${rsLower}|${rsMisc})`
const rsMiscUpper = `(?:${rsUpper}|${rsMisc})`
// Optional trailing contraction such as 'd, 'll, 're (lower- and upper-case forms).
const rsOptContrLower = `(?:${rsApos}(?:d|ll|m|re|s|t|ve))?`
const rsOptContrUpper = `(?:${rsApos}(?:D|LL|M|RE|S|T|VE))?`
const reOptMod = `${rsModifier}?`
const rsOptVar = `[${rsVarRange}]?`
const rsOptJoin = `(?:${rsZWJ}(?:${rsNonAstral}|${rsRegional}|${rsSurrPair})${rsOptVar}${reOptMod})*`
// Ordinals such as 1st / 2nd / 3rd / 4th, in lower- and upper-case forms.
const rsOrdLower = "\\d*(?:1st|2nd|3rd|(?![123])\\dth)(?=\\b|[A-Z_])"
const rsOrdUpper = "\\d*(?:1ST|2ND|3RD|(?![123])\\dTH)(?=\\b|[a-z_])"
const rsSeq = `${rsOptVar}${reOptMod}${rsOptJoin}`
const rsEmoji = `(?:${rsDingbat}|${rsRegional}|${rsSurrPair})${rsSeq}`
const rsSymbol = `(?:${rsNonAstral}${rsCombo}?|${rsCombo}|${rsRegional}|${rsSurrPair}|${rsAstral})`

/**
 * Global regex matching complex or compound words. The alternatives are
 * tried in the order listed, so their sequence matters.
 */
const reUnicodeWord = new RegExp(
  [
    // Optional capital followed by lowercase letters, ending at a break, a capital, or end of input.
    `${rsUpper}?${rsLower}+${rsOptContrLower}(?=${rsBreak}|${rsUpper}|$)`,
    // Run of capitals/misc characters that stops before a capitalised word.
    `${rsMiscUpper}+${rsOptContrUpper}(?=${rsBreak}|${rsUpper}${rsMiscLower}|$)`,
    `${rsUpper}?${rsMiscLower}+${rsOptContrLower}`,
    `${rsUpper}+${rsOptContrUpper}`,
    rsOrdUpper,
    rsOrdLower,
    rsDigits,
    rsEmoji
  ].join("|"),
  "g"
)

/**
 * Collects every match of `reUnicodeWord` in `string`.
 * @param {string} string The string to inspect
 * @returns {Array} The matched words, or an empty array when nothing matches
 */
function unicodeWords(string) {
  const matches = string.match(reUnicodeWord)
  if (matches) {
    return matches
  }
  return []
}

// Matches runs of characters that are not ASCII control characters,
// punctuation, or whitespace (i.e. anything outside the excluded
// \x00-\x2f, \x3a-\x40, \x5b-\x60 and \x7b-\x7f ranges).
const reAsciiWord = /[^\x00-\x2f\x3a-\x40\x5b-\x60\x7b-\x7f]+/g

/**
 * Collects every match of `reAsciiWord` in `string`.
 * @param {string} string The string to inspect
 * @returns {Array} The matched words, or an empty array when nothing matches
 */
function asciiWords(string) {
  const matches = string.match(reAsciiWord)
  if (matches) {
    return matches
  }
  return []
}

// Flags strings that need the more robust word regex: a lower-to-upper
// transition, two capitals followed by a lowercase letter, a digit next to
// a letter, or any character other than ASCII letters, digits and space.
const reHasUnicodeWord = /[a-z][A-Z]|[A-Z]{2}[a-z]|[0-9][a-zA-Z]|[a-zA-Z][0-9]|[^a-zA-Z0-9 ]/

/**
 * Reports whether `string` matches `reHasUnicodeWord`.
 * @param {string} string The string to inspect
 * @returns {boolean} `true` when the pattern is found, otherwise `false`
 */
function hasUnicodeWord(string) {
  return reHasUnicodeWord.exec(string) !== null
}

/**
 * Splits `string` into an array of its words.
 *
 * The input is first converted with `toString`. When `pattern` is given it
 * is handed to `String.prototype.match`; otherwise the string is routed to
 * `unicodeWords` or `asciiWords` depending on what `hasUnicodeWord` reports.
 *
 * @param {string} [string=''] The string to inspect
 * @param {RegExp|string} [pattern] The pattern to match words
 * @returns {Array} The words of `string`, or an empty array when nothing matches
 * @example
 * words('fred, barney, & pebbles')
 * // => ['fred', 'barney', 'pebbles']
 * words('fred, barney, & pebbles', /[^, ]+/g)
 * // => ['fred', 'barney', '&', 'pebbles']
 */
function words(string, pattern) {
  const text = toString(string)
  if (pattern !== undefined) {
    // Caller-supplied pattern: defer entirely to String.prototype.match.
    return text.match(pattern) || []
  }
  // No pattern: choose between the Unicode-aware and the plain ASCII splitter.
  const split = hasUnicodeWord(text) ? unicodeWords : asciiWords
  return split(text)
}

export default words
/**
 * Converts `value` to a string. An empty string is returned for `null` and `undefined` values.
 * The sign of `-0` is preserved.
 * 转换`value`成字符串,`null`和`undefined`返回空字符串,`-0`转成'-0'
 * @param {*} value The value to convert
 * @returns {string} Returns the converted string
 * @example
 * toString(null)
 * // => ''
 * toString(-0)
 * // => '-0'
 * toString([1, 2, 3])
 * // => '1,2,3'
 * toString({a: 1})
 * // => '[object Object]'
 */

// The tag `Object.prototype.toString` reports for symbol values.
const symbolTag = "[object Symbol]"

/**
 * Checks whether `value` is a non-null value whose `typeof` is "object".
 * @param {*} value The value to check
 * @returns {boolean} `true` for non-null objects, otherwise `false`
 */
function isObjectLike(value) {
  return value !== null && typeof value === "object"
}

/**
 * Checks whether `value` is a symbol primitive or an object whose
 * `Object.prototype.toString` tag equals `symbolTag`.
 * @param {*} value The value to check
 * @returns {boolean} `true` when `value` is classified as a symbol
 */
function isSymbol(value) {
  if (typeof value === "symbol") {
    return true
  }
  if (!isObjectLike(value)) {
    return false
  }
  return Object.prototype.toString.call(value) === symbolTag
}

/**
 * Builds a new array by calling `iteratee(element, index, array)` for every
 * index from 0 up to the length read once at the start. Unlike
 * `Array.prototype.map`, holes are visited too (their element is `undefined`).
 * @param {Array} [array] The array to iterate over; `null`/`undefined` yields `[]`
 * @param {Function} iteratee The function invoked per index
 * @returns {Array} The new mapped array
 */
function arrayMap(array, iteratee) {
  const size = array == null ? 0 : array.length
  const mapped = Array(size)

  for (let i = 0; i < size; i += 1) {
    mapped[i] = iteratee(array[i], i, array)
  }
  return mapped
}

/**
 * Converts `value` to a string without special-casing `null`/`undefined`.
 * Strings are returned unchanged, arrays are converted element by element
 * (recursively) and then coerced to a string, symbols go through
 * `Symbol.prototype.toString`, and the sign of `-0` is preserved.
 * @param {*} value The value to convert
 * @returns {string} The converted string
 */
function baseToString(value) {
  if (typeof value === "string") {
    // Already a string: hand it back as-is.
    return value
  }
  if (Array.isArray(value)) {
    // Convert each element first so nested arrays and -0 are handled too.
    const parts = arrayMap(value, baseToString)
    return parts + ""
  }
  if (isSymbol(value)) {
    // Symbols throw on implicit string coercion, so call their toString directly.
    const symbolToString = Symbol.prototype.toString
    return symbolToString ? symbolToString.call(value) : ""
  }
  const text = value + ""
  // `-0 + ""` yields "0"; dividing by the value tells -0 apart from +0.
  if (text === "0" && 1 / value === -Infinity) {
    return "-0"
  }
  return text
}

/**
 * Converts `value` to a string. `null` and `undefined` become the empty
 * string; every other value is delegated to `baseToString`.
 * @param {*} value The value to convert
 * @returns {string} The converted string
 */
function toString(value) {
  if (value == null) {
    return ""
  }
  return baseToString(value)
}

export default toString

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值