名称匹配的主要问题之一是出错的可能性。 人们拼写同一个名字(错别字)的方式有很多,他们都听别人说的话。 有多种方法可以破坏自由格式语言数据。 当您需要搜索/匹配不良数据时,这会引起很多麻烦。
有许多不同的解决方法。 如
Levenshtein算法,计算使一个字符串与另一字符串匹配需要进行多少次编辑。 或NGram算法 ,它检查字符串组成的较小序列,并将其与插补字符串的序列进行比较。 还有一些语音算法,可以根据字符串的“声音”对字符串进行编码。 类似于SoundEx或Double Metaphone算法。 什么是双重中继器?Double Metaphone是一种语音算法,它接受一个字符串,并产生两种关于如何用口语发音的编码。 因为一个单词有时可以以多种方式发音,所以产生两种编码。
它用于英语,包含许多关于字符串“声音”的规则。 它还试图解释斯拉夫语,日耳曼语,凯尔特语,希腊语,法语,意大利语,西班牙语,汉语和其他起源的罗马化单词的英语发音中的差异。
它是如何工作的?该算法通过查看数百个语音规则来创建语音字符串。 它使用的规则的一些示例是:
- P和B被编码为相同的声音。 除非P后跟H,然后将H编码为F声音。
- SCI被编码为S声音,而SCO被编码为SK声音。
- T和D被编码为相同的声音。 除非T后面跟随一个H,然后将H编码为由数字0表示的theta声音。
该算法会产生2个输入单词的潜在发音。 阈值参数确定每个发音的最大长度。 因此,LEFT(输出,阈值)是主要发音,而RIGHT(输出,阈值)是潜在的不同发音。 因此,当编码产生的语音的字符数少于阈值时的空格。
该算法被编码为一次编码一个单词,并且不处理字符串中的非字母字符。 因此,可能应在运行代码之前将其从字符串中提取出来。 在进行其他任何处理之前,将其提取或构建到函数中。
这是设计使然。 当使用较低的阈值时,较长的单词会被截断,并可能错误地与较短的单词匹配。 例如,阈值为4,“ foxtrot”和“ fixed”具有相同的编码。
因此,我将其作为参数而不是硬编码阈值,因此用户可以根据需要选择使用较大的阈值。
VBScript中的示例实现此实现采用字符串s输入和长度i输入,并输出该字符串的2种可能的不同语音编码,每种长度为I。
例如,使用
DoubleMetaphone("Smith", 4)将返回SM0 XMT ,包括空格。
您可以使用它来预先计算表中名称或单词的发音,然后将2种不同的语音代码与表中的所有其他语音代码进行比较,以查找听起来相似的名称或单词。
Option Explicit
MsgBox(DoubleMetaphone(InputBox("Enter String"), 6))
Function DoubleMetaphone(strOriginal, intThreshhold)
Dim isSlavoGermanic, strPrimary, strSecondary, i, intJump, iB
Dim intLength, cP, cS, arr, x, intPad
isSlavoGermanic = False
iB = 4
intPad = 6
x = iB
intLength = Len(strOriginal) + iB - 1
strOriginal = UCase(strOriginal)
If (InStr(strOriginal, "W") + InStr(strOriginal, "K") + InStr(strOriginal, "CZ") + InStr(strOriginal, "WITZ")) <> 0 Then
isSlavoGermanic = True
End If
ReDim arr(intLength + intPad + 1)
For i = 0 To iB-1
arr(i) = vbTab
Next
For i = iB To intLength
arr(i) = Mid(strOriginal, i-iB+1, 1)
Next
For i = intLength+1 To UBound(arr)
arr(i) = vbTab
Next
Select Case (arr(x) & arr(x+1))
Case "AC"
strPrimary = "AKS"
strSecondary = "AKS"
x = x + 4
Case "GN", "KN", "PN", "PS"
x = x + 1
Case "HA", "HE", "HI", "HO", "HU", "HY"
strPrimary = "H"
strSecondary = "H"
x = x + 2
Case "WA", "WE", "WI", "WO", "WU", "WY"
strPrimary = "A"
strSecondary = "F"
x = x + 2
Case "WH"
strPrimary = "A"
strSecondary = "A"
x = x + 1
Case "SM", "SN", "SL", "SW"
strPrimary = "S"
strSecondary = "X"
x = x + 1
Case "GY"
strPrimary = "K"
strSecondary = "J"
x = x + 2
End Select
If x = iB Then
If arr(x) & arr(x+1) & arr(x+2) & arr(x+3) = "JOSE" Then
If (x = iB And arr(x+4) = " ") Then
strPrimary = "HS"
strSecondary = "HS"
x = x + 4
End If
ElseIf arr(x) & arr(x+1) & arr(x+2) & arr(x+3) & arr(x+4) = "SUGAR" Then
strPrimary = "XK"
strSecondary = "SK"
x = x + 5
ElseIf arr(x) & arr(x+1) & arr(x+2) & arr(x+3) & arr(x+4) & arr(x+5) = "CAESAR" Then
strPrimary = "SSR"
strSecondary = "SSR"
x = x + 6
ElseIf (arr(x) & arr(x+1) & arr(x+2) & arr(x+3) & arr(x+4) & arr(x+5) = "CHARAC" Or _
arr(x) & arr(x+1) & arr(x+2) & arr(x+3) & arr(x+4) & arr(x+5) = "CHARIS" Or _
arr(x) & arr(x+1) & arr(x+2) & arr(x+3) = "CHOR" Or _
arr(x) & arr(x+1) & arr(x+2) & arr(x+3) = "CHYM" Or _
arr(x) & arr(x+1) & arr(x+2) & arr(x+3) = "CHEM") And _
arr(x) & arr(x+1) & arr(x+2) & arr(x+3) & arr(x+4) <> "CHORE" Then
strPrimary = "K"
strSecondary = "K"
x = x + 2
End If
End If
If x = iB Then
Select Case arr(x) & arr(x+1) & arr(x+2)
Case "GES", "GEP", "GEB", "GEL", "GEY", "GIB", "GIL", "GIN", "GIE", "GEI", "GER"
strPrimary = "K"
strSecondary = "J"
x = x + 2
Case "GHI"
strPrimary = "J"
strSecondary = "J"
x = x + 3
Case "AGN", "EGN", "IGN", "OGN", "UGN", "UGY"
If Not isSlavoGermanic Then
strPrimary = "AKN"
strSecondary = "AN"
x = x + 3
End If
End Select
End If
If x = iB Then
Select Case arr(x)
Case "X"
strPrimary = "S"
strSecondary = "S"
x = x + 1
Case "A", "E", "I", "O", "U", "Y"
strPrimary = "A"
strSecondary = "A"
x = x + 1
Case "J"
strPrimary = "J"
strSecondary = "A"
x = x + 1
End Select
End If
Do While x <= intLength
If Len(strPrimary) >= intThreshhold Then
Exit Do
End If
intJump = 1
cP = arr(x)
cS = arr(x)
Select Case arr(x)
Case "A", "E", "I", "O", "U", "Y"
cP = ""
cS = ""
Case "B"
cP = "P"
cS = "P"
Case "Ç"
cP = "S"
cS = "S"
Case "C"
If x > iB+1 And arr(x-2) <> "A" And arr(x-2) <> "E" And arr(x-2) <> "I" And arr(x-2) <> "O" And arr(x-2) <> "U" And _
arr(x-2) <> "Y" And arr(x-1) & arr(x+1) = "AH" And ((arr(x+2) <> "I" And arr(x+2) <> "E") Or _
arr(x-2) & arr(x+2) & arr(x+3) = "BER" Or arr(x-2) & arr(x+2) & arr(x+3) = "MER") Then
cP = "K"
cS = "K"
intJump = 2
ElseIf arr(x+1) & arr(x+2) & arr(x+3) = "HIA" Then
cP = "K"
cS = "K"
intJump = 4
ElseIf arr(x+1) = "H" Then
If x > iB And arr(x+2) & arr(x+3) = "AE" Then
cP = "K"
cS = "X"
intJump = 2
ElseIf arr(iB) & arr(iB+1) & arr(iB+2) & arr(iB+3) = "VAN " Or _
arr(iB) & arr(iB+1) & arr(iB+2) & arr(iB+3) = "VON " Or _
arr(iB) & arr(iB+1) & arr(iB+2) = "SCH" Or arr(x+2) = "T" Or arr(x+2) = "S" Or _
arr(x-2) & arr(x-1) & arr(x+1) & arr(x+2) & arr(x+3) = "ORHES" Or _
arr(x-2) & arr(x-1) & arr(x+1) & arr(x+2) & arr(x+3) = "ARHIT" Or _
arr(x-2) & arr(x-1) & arr(x+1) & arr(x+2) & arr(x+3) = "ORHID" Or _
((arr(x-2) = "A" Or arr(x-2) = "E" Or arr(x-2) = "O" Or arr(x-2) = "U" Or x = iB) And _
(arr(x+2) = "L" Or arr(x+2) = "R" Or arr(x+2) = "N" Or arr(x+2) = "M" Or arr(x+2) = "B" Or _
arr(x+2) = "H" Or arr(x+2) = "F" Or arr(x+2) = "V" Or arr(x+2) = "W" Or arr(x+2) = " "))Then
cP = "K"
cS = "K"
intJump = 2
Else
intJump = 2
If x > iB Then
If arr(iB) & arr(iB+1) = "MC" Then
cP = "K"
cS = "K"
Else
cP = "X"
cS = "K"
End If
Else
cP = "X"
cS = "X"
End If
End If
ElseIf arr(x+1) = "Z" And arr(x-2) & arr(x-1) <> "WI" Then
cP = "S"
cS = "X"
intJump = 2
ElseIf arr(x+1) & arr(x+2) & arr(x+2) = "CIA" Then
cP = "X"
cS = "X"
intJump = 3
ElseIf arr(x+1) = "C" And Not (x = iB+1 And arr(iB) = "M") Then
If (arr(x+2) = "I" Or arr(x+2) = "E" Or arr(x+2) = "H") And arr(x+2) & arr(x+3) <> "HU" Then
If arr(x-1) & arr(x+1) & arr(x+2) & arr(x+3) = "UCEE" Or _
arr(x-1) & arr(x+1) & arr(x+2) & arr(x+3) = "UCES" Then
cP = "KS"
cS = "KS"
intJump = 3
Else
cP = "X"
cS = "X"
intJump = 3
End If
Else
cP = "K"
cS = "K"
intJump = 2
End If
ElseIf arr(x+1) = "K" Or arr(x+1) = "G" Or arr(x+1) = "Q" Then
cP = "K"
cS = "K"
intJump = 2
ElseIf arr(x+1) = "I" Or arr(x+1) = "E" Or arr(x+1) = "Y" Then
If arr(x+1) & arr(x+2) = "IO" Or arr(x+1) & arr(x+2) = "IE" Or arr(x+1) & arr(x+2) = "IA" Then
cP = "S"
cS = "X"
intJump = 2
Else
cP = "S"
cS = "S"
intJump = 2
End If
Else
cP = "K"
cS = "K"
If arr(x+1) & arr(x+2) = " C" Or arr(x+1) & arr(x+2) = " Q" Or arr(x+1) & arr(x+2) = " G" Then
intJump = 3
Else
If (arr(x+1) = "C" Or arr(x+1) = "K" Or arr(x+1) = "Q") And _
arr(x+1) & arr(x+2) <> "CE" And arr(x+1) & arr(x+2) <> "CI" Then
intJump = 2
End If
End If
End If
Case "D"
If arr(x+1) = "G" Then
If arr(x+2) = "I" Or _
arr(x+2) = "E" Or _
arr(x+2) = "Y" Then
cP = "J"
cS = "J"
intJump = 3
Else
cP = "TK"
cS = "TK"
intJump = 2
End If
ElseIf arr(x+1) = "T" Then
cP = "T"
cS = "T"
intJump = 2
Else
cP = "T"
cS = "T"
End If
Case "G"
If arr(x+1) = "H" Then
If x <> iB And arr(x-1) <> "A" And arr(x-1) <> "E" And arr(x-1) <> "I" _
And arr(x-1) <> "O" And arr(x-1) <> "U" And arr(x-1) <> "Y" Then
cP = "K"
cS = "K"
intJump = 2
ElseIf (x > iB+1 And (arr(x-2) = "B" Or arr(x-2) = "H" Or arr(x-2) = "D")) Or _
(x > iB+2 And (arr(x-3) = "B" Or arr(x-3) = "H" Or arr(x-3) = "D")) Or _
(x > iB+3 And (arr(x-4) = "B" Or arr(x-4) = "H")) Then
cP = ""
cS = ""
intJump = 2
Else
If x > iB+2 And arr(x-1) = "U" And _
(arr(x-3) = "C" Or arr(x-3) = "G" Or arr(x-3) = "L" Or arr(x-3) = "R" Or arr(x-3) = "T") Then
cP = "F"
cS = "F"
intJump = 2
ElseIf x > iB And arr(x-1) <> "I" Then
cP = "K"
cS = "K"
intJump = 2
Else
cP = ""
cS = ""
End If
End If
ElseIf arr(x+1) = "N" Then
cS = "KN"
intJump = 2
If arr(x+2) & arr(x+3) <> "EY" And Not isSlavoGermanic Then
cP = "N"
Else
cP = "KN"
End If
ElseIf arr(x+1) & arr(x+2) = "LI" And Not isSlavoGermanic Then
cP = "KL"
cS = "L"
intJump = 2
ElseIf (arr(x+1) & arr(x+2) = "ER" Or arr(x+1) = "Y") And _
arr(x-1) <> "E" And arr(x-1) <> "I" And _
arr(x-1) & arr(x+1) <> "RY" And _
arr(x-1) & arr(x+1) <> "OY" And _
arr(iB) & arr(iB+1) & arr(iB+2) & arr(iB+3) & arr(iB+4) & arr(iB+5) <> "DANGER" And _
arr(iB) & arr(iB+1) & arr(iB+2) & arr(iB+3) & arr(iB+4) & arr(iB+5) <> "RANGER" And _
arr(iB) & arr(iB+1) & arr(iB+2) & arr(iB+3) & arr(iB+4) & arr(iB+5) <> "MANGER" Then
cP = "K"
cS = "J"
intJump = 2
ElseIf arr(x+1) = "E" Or arr(x+1) = "I" Or arr(x+1) = "Y" Or _
arr(x-1) & arr(x) & arr(x+1) & arr(x+2) = "AGGI" Or _
arr(x-1) & arr(x) & arr(x+1) & arr(x+2) = "OGGI" Then
If arr(iB) & arr(iB+1) & arr(iB+2) & arr(iB+3) = "VON " Or _
arr(iB) & arr(iB+1) & arr(iB+2) & arr(iB+3) = "VAN " Or _
arr(iB) & arr(iB+1) & arr(iB+2) = "SCH" Or _
arr(x+1) & arr(x+2) = "ET" Then
cP = "K"
cS = "K"
intJump = 2
Else
cP = "J"
If arr(x+1) & arr(x+2) & arr(x+3) & arr(x+4) = "IER " Then
cS = "J"
intJump = 3
Else
cS = "K"
intJump = 2
End If
End If
Else
cP = "K"
cS = "K"
End If
Case "H"
If (arr(x-1) = "A" Or _
arr(x-1) = "E" Or _
arr(x-1) = "I" Or _
arr(x-1) = "O" Or _
arr(x-1) = "U" Or _
arr(x-1) = "Y") And _
(arr(x+1) = "A" Or _
arr(x+1) = "E" Or _
arr(x+1) = "I" Or _
arr(x+1) = "O" Or _
arr(x+1) = "U" Or _
arr(x+1) = "Y") Then
intJump = 2
Else
cP = ""
cS = ""
End If
Case "J"
If arr(iB) & arr(iB+1) & arr(iB+2) & arr(iB+3) = "SAN " Then
cP = "H"
cS = "H"
Else
If Not isSlavoGermanic And ( _
arr(x-1) = "A" Or _
arr(x-1) = "E" Or _
arr(x-1) = "I" Or _
arr(x-1) = "O" Or _
arr(x-1) = "U" Or _
arr(x-1) = "Y") And ( _
arr(x+1) = "A" Or _
arr(x+1) = "O") Then
cS = "H"
Else
If x = intLength Then
cS = ""
Else
If arr(x+1) = "L" Or arr(x+1) = "T" Or arr(x+1) = "K" Or _
arr(x+1) = "S" Or arr(x+1) = "N" Or arr(x+1) = "M" Or _
arr(x+1) = "B" Or arr(x+1) = "Z" Or _
arr(x-1) = "S" Or arr(x-1) = "K" Or arr(x-1) = "L" Then
cP = ""
cS = ""
End If
End If
End If
End If
Case "L"
If arr(x+1) = "L" Then
intJump = 2
If ((x = intLength-2 And ( _
arr(x-1) & arr(x) & arr(x+1) & arr(x+2) = "ILLO" Or _
arr(x-1) & arr(x) & arr(x+1) & arr(x+2) = "ILLA" Or _
arr(x-1) & arr(x) & arr(x+1) & arr(x+2) = "ALLE" _
)) Or (( _
arr(intLength-1) & arr(intLength) = "AS" Or _
arr(intLength-1) & arr(intLength) = "OS" Or _
arr(intLength) = "A" Or arr(intLength) = "O") And _
arr(x-1) & arr(x) & arr(x+1) & arr(x+2) = "ALLE")) Then
cS = ""
End If
End If
Case "M"
If arr(x-1) & arr(x) & arr(x+1) = "UMB" And _
(x = intLength-1 Or arr(x+2) & arr(x+3) = "ER") Then
intJump = 2
End If
Case "P"
Select Case arr(x+1)
Case "H"
cP = "F"
cS = "F"
intJump = 2
Case "B"
intJump = 2
End Select
Case "Q"
cP = "K"
cS = "K"
Case "R"
If x = intLength And Not isSlavoGermanic And _
arr(x-2) & arr(x-1) = "IE" And _
arr(x-4) & arr(x-3) <> "ME" And _
arr(x-4) & arr(x-3) <> "MA" Then
cP = ""
End If
Case "S"
If arr(x+1) = "L" And (arr(x-1) = "I" Or arr(x-1) = "Y") Then
cP = ""
cS = ""
ElseIf arr(x+1) = "H" And _
arr(x+2) & arr(x+3) & arr(x+4) <> "EIM" And _
arr(x+2) & arr(x+3) & arr(x+4) <> "OEK" And _
arr(x+2) & arr(x+3) & arr(x+4) <> "OLM" And _
arr(x+2) & arr(x+3) & arr(x+4) <> "OLZ" Then
intJump = 2
cP = "X"
cS = "X"
ElseIf Not isSlavoGermanic And ( _
arr(x+1) & arr(x+2) = "IA" Or _
arr(x+1) & arr(x+2) = "IO") Then
intJump = 3
cS = "X"
ElseIf arr(x+1) = "Z" Then
cS = "X"
intJump = 2
ElseIf arr(x+1) = "C" Then
intJump = 3
If arr(x+2) = "H" Then
If arr(x+3) & arr(x+4) = "OO" Or _
arr(x+3) & arr(x+4) = "ER" Or _
arr(x+3) & arr(x+4) = "EN" Or _
arr(x+3) & arr(x+4) = "UY" Or _
arr(x+3) & arr(x+4) = "ED" Or _
arr(x+3) & arr(x+4) = "EM" Then
cS = "SK"
If arr(x+3) & arr(x+4) = "ER" Or _
arr(x+3) & arr(x+4) = "EN" Then
cP = "X"
Else
cP = "SK"
End If
Else
cP = "X"
If x <> iB Or arr(iB+3) = "W" Or arr(iB+3) = "A" Or _
arr(iB+3) = "E" Or arr(iB+3) = "I" Or arr(iB+3) = "O" Or _
arr(iB+3) = "U" Or arr(iB+3) = "Y" Then
cS = "X"
End If
End If
ElseIf arr(x+2) = "I" Or arr(x+2) = "E" Or arr(x+2) = "Y" Then
Else
cP = "SK"
cS = "SK"
End If
ElseIf x = intLength And arr(x-1) = "I" And ( _
arr(x-2) = "A" Or arr(x-2) = "O") Then
cP = ""
End If
Case "T"
If arr(x+1) & arr(x+2) & arr(x+3) = "ION" _
Or arr(x+1) & arr(x+2) = "IA" _
Or arr(x+1) & arr(x+2) = "CH" Then
cP = "X"
cS = "X"
intJump = 3
ElseIf (arr(x+1) = "H" Or arr(x+1) & arr(x+2) = "TH") And _
(arr(x+2) & arr(x+3) <> "OM" And _
arr(x+2) & arr(x+3) <> "AM" And _
arr(iB) & arr(iB+1) & arr(iB+2) <> "SCH" And _
arr(iB) & arr(iB+1) & arr(iB+2) & arr(iB+3) <> "VAN " And _
arr(iB) & arr(iB+1) & arr(iB+2) & arr(iB+3) <> "VON ") Then
cP = "0"
intJump = 2
ElseIf arr(x+1) = "D" Then
intJump = 2
End If
Case "V"
cP = "F"
cS = "F"
Case "W"
If arr(x+1) = "R" Then
cP = "R"
cS = "R"
intJump = 2
ElseIf arr(iB) & arr(iB+1) & arr(iB+2) = "SCH" _
Or (x = intLength And ( _
arr(x-1) = "A" Or _
arr(x-1) = "E" Or _
arr(x-1) = "I" Or _
arr(x-1) = "O" Or _
arr(x-1) = "U" Or _
arr(x-1) = "Y")) _
Or ((arr(x-1) = "E" Or arr(x-1) = "O") And _
(arr(x+1) & arr(x+2) & arr(x+3) = "SKI" Or _
arr(x+1) & arr(x+2) & arr(x+3) = "SKY")) Then
cP = ""
cS = "F"
ElseIf arr(x+1) & arr(x+2) & arr(x+3) = "ICZ" _
Or arr(x+1) & arr(x+2) & arr(x+3) = "ITZ" Then
cP = "TS"
cS = "FX"
intJump = 4
Else
cP = ""
cS = ""
End If
Case "X"
If x = intLength And _
(arr(x-3) & arr(x-2) & arr(x-1) = "IAU" Or _
arr(x-3) & arr(x-2) & arr(x-1) = "EAU" Or _
arr(x-2) & arr(x-1) = "AU" Or _
arr(x-2) & arr(x-1) = "OU") Then
cP = ""
cS = ""
Else
cP = "KS"
cS = "KS"
End If
If arr(x+1) = "C" Then
intJump = 2
End If
Case "Z"
If arr(x+1) = "H" Then
cP = "J"
cS = "J"
ElseIf (arr(x+1) & arr(x+2) = "ZO" Or _
arr(x+1) & arr(x+2) = "ZI" Or _
arr(x+1) & arr(x+2) = "ZA") _
Or (isSlavoGermanic And x <> iB And arr(x-1) = "T") Then
cP = "S"
cS = "TS"
Else
cP = "S"
cS = "S"
End If
End Select
strPrimary = strPrimary & cP
strSecondary = strSecondary & cS
If arr(x) = arr(x+1) And arr(x) <> "C" Then
intJump = intJump + 1
End If
x = x + intJump
Loop
For i = 1 To intThreshhold
strPrimary = strPrimary & " "
strSecondary = strSecondary & " "
Next
DoubleMetaphone = Left(strPrimary, intThreshhold) & Left(strSecondary, intThreshhold)
End Function
From: https://bytes.com/topic/access/insights/965241-fuzzy-string-matching-double-metaphone-algorithm
本文探讨了在处理自由格式语言数据时遇到的名称匹配错误问题,特别是拼写错误带来的挑战。文章介绍了Double Metaphone算法,这是一种语音算法,能够对字符串进行编码以反映其口语发音。该算法考虑了多种语言的发音规则,用于产生两个可能的发音编码,以适应英语及其他语言中的发音差异。算法的阈值参数决定了发音的最大长度,影响匹配效果。这个实现允许用户根据需求调整阈值,以提高模糊匹配的准确性。
3374

被折叠的 条评论
为什么被折叠?



