文本字面相似度算法

# 编辑距离
def edit_distance(word1, word2):
    """Return the Levenshtein edit distance between ``word1`` and ``word2``.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn ``word1`` into ``word2``.

    Implemented in pure Python (no NumPy required) with two rolling rows,
    so memory use is proportional to ``len(word2)`` and the result is a
    plain ``int``.

    Args:
        word1: Source sequence (typically a ``str``).
        word2: Target sequence (typically a ``str``).

    Returns:
        The edit distance as a non-negative ``int``.
    """
    len2 = len(word2)
    # Row 0: turning the empty prefix of word1 into word2[:j] takes j inserts.
    prev = list(range(len2 + 1))
    for i, ch1 in enumerate(word1, start=1):
        # Column 0: turning word1[:i] into the empty string takes i deletes.
        curr = [i]
        for j, ch2 in enumerate(word2, start=1):
            delta = 0 if ch1 == ch2 else 1
            curr.append(
                min(
                    prev[j - 1] + delta,  # substitute (or keep when equal)
                    prev[j] + 1,          # delete ch1
                    curr[j - 1] + 1,      # insert ch2
                )
            )
        prev = curr
    return prev[len2]
# 全局序列对齐 Needleman-Wunsch
def globalAlignment(str1, str2, match=1, mismatch=-1, gap=-2):
    """Return the optimal global alignment score (Needleman-Wunsch).

    Both sequences are aligned end to end. Each aligned pair contributes
    ``match`` when the elements are equal and ``mismatch`` otherwise; each
    element aligned against a gap contributes ``gap``.

    Only two rows of the dynamic-programming table are kept at a time.

    Args:
        str1: First sequence (typically a ``str``).
        str2: Second sequence (typically a ``str``).
        match: Score added for a pair of equal elements. Defaults to 1.
        mismatch: Score added for a pair of different elements.
            Defaults to -1.
        gap: Score added for each element aligned to a gap. Defaults to -2.

    Returns:
        The best achievable alignment score. With the default scoring,
        aligning against an empty sequence yields ``-2 * len(other)``.
    """
    n = len(str2)
    # Row 0: the empty prefix of str1 against str2[:j] is j gaps.
    prev = [j * gap for j in range(n + 1)]
    for i, ch1 in enumerate(str1, start=1):
        # Column 0: str1[:i] against the empty prefix of str2 is i gaps.
        curr = [i * gap]
        for j, ch2 in enumerate(str2, start=1):
            pair_score = match if ch1 == ch2 else mismatch
            curr.append(
                max(
                    prev[j] + gap,              # ch1 aligned to a gap
                    curr[j - 1] + gap,          # ch2 aligned to a gap
                    prev[j - 1] + pair_score,   # ch1 aligned to ch2
                )
            )
        prev = curr
    return prev[n]
# 局部序列对齐 Smith-Waterman
def localAlignment(str1, str2, match=1, mismatch=-1, gap=-2):
    """Return the best local alignment score (Smith-Waterman).

    Finds the highest-scoring alignment between any pair of contiguous
    regions of the two sequences. Cell scores are clamped at zero, so an
    alignment may start anywhere and the result is never negative.

    The running maximum is tracked while the table is filled, and only two
    rows of the table are kept at a time.

    Args:
        str1: First sequence (typically a ``str``).
        str2: Second sequence (typically a ``str``).
        match: Score added for a pair of equal elements. Defaults to 1.
        mismatch: Score added for a pair of different elements.
            Defaults to -1.
        gap: Score added for each element aligned to a gap; expected to be
            non-positive (a penalty). Defaults to -2.

    Returns:
        The maximum local alignment score; 0 when nothing aligns
        (including when either sequence is empty).
    """
    n = len(str2)
    best = 0
    # Boundary row/column are all zero: a local alignment can start anywhere.
    prev = [0] * (n + 1)
    for ch1 in str1:
        curr = [0]
        for j, ch2 in enumerate(str2, start=1):
            pair_score = match if ch1 == ch2 else mismatch
            cell = max(
                prev[j] + gap,              # ch1 aligned to a gap
                curr[j - 1] + gap,          # ch2 aligned to a gap
                prev[j - 1] + pair_score,   # ch1 aligned to ch2
                0,                          # restart the alignment here
            )
            curr.append(cell)
            if cell > best:
                best = cell
        prev = curr
    return best
                
# 最长公共子串 
def longestCommonString(A, B):
    """Return the length of the longest common contiguous substring.

    For every pair of positions the length of the common run ending there
    is computed from the run ending at the previous pair of positions; the
    answer is the largest run seen.

    Args:
        A: First sequence (typically a ``str``).
        B: Second sequence (typically a ``str``).

    Returns:
        Length of the longest run of consecutive elements shared by ``A``
        and ``B``; 0 when they share nothing or either one is empty.
    """
    longest = 0
    # Run lengths for the previous element of A; index 0 is the boundary.
    previous_runs = [0] * (len(B) + 1)
    for item_a in A:
        runs = [0]
        for col, item_b in enumerate(B):
            # A match extends the run ending one step back in both inputs;
            # a mismatch resets the run.
            run = previous_runs[col] + 1 if item_a == item_b else 0
            runs.append(run)
            longest = max(longest, run)
        previous_runs = runs
    return longest
# 最长的公共子序列 LCS
def find_lcseque(s1, s2):
    """Return the length of the longest common subsequence of two sequences.

    A subsequence keeps the original order of elements but does not have to
    be contiguous. Only the length is returned, so the dynamic-programming
    table is filled with two rolling rows and no backtracking is performed:
    the final cell already holds the answer.

    Args:
        s1: First sequence (typically a ``str``).
        s2: Second sequence (typically a ``str``).

    Returns:
        The LCS length as an ``int``; 0 when either sequence is empty or
        they share no elements.
    """
    # prev[j] is the LCS length of the processed prefix of s1 and s2[:j].
    prev = [0] * (len(s2) + 1)
    for ch1 in s1:
        curr = [0]
        for j, ch2 in enumerate(s2, start=1):
            if ch1 == ch2:
                # Matching elements extend the LCS of both shorter prefixes.
                curr.append(prev[j - 1] + 1)
            else:
                # Otherwise carry over the better of dropping ch2 or ch1.
                curr.append(max(curr[j - 1], prev[j]))
        prev = curr
    return prev[-1]

 

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值