比较单词相似度的一个算法(c#)

本文介绍了一个基于Levenshtein距离算法的字符串相似度计算方法,并提供了一个从Java移植到C#的具体实现示例。该算法可以用于比较两个字符串之间的相似程度,通过计算编辑距离来得出相似度。

    正好这几天在关注这块,看到了别人的一个 Java 实现(http://www.360doc.com/content/090201/10/96202_2430993.html),便把它改成了 C# 版本,试用了一下,效果还可以,符合我的要求。

using System;
using System.Collections.Generic;
using System.Text;

namespace test
{
    public class Similarity
    {
        private int min(int one, int two, int three)
        {
            int min = one;
            if (two < min)
            {
                min = two;
            }
            if (three < min)
            {
                min = three;
            }
            return min;
        }

        public int levDistance(String str1, String str2)
        {
            int n = str1.Length;
            int m = str2.Length;
            int i;    //遍历str1的
            int j;    //遍历str2的
            char ch1;    //str1的
            char ch2;    //str2的
            int temp;    //记录相同字符,在某个矩阵位置值的增量,不是0就是1
            if (n == 0)
            {
                return m;
            }
            if (m == 0)
            {
                return n;
            }
            int[][] d = new int[n + 1][]; //矩阵

            for (i = 0; i <= n; i++)
            {    //初始化第一列
                d[i] = new int[m + 1];
                d[i][0] = i;
            }
            for (j = 0; j <= m; j++)
            {    //初始化第一行
                d[0][j] = j;
            }
            for (i = 1; i <= n; i++)
            {    //遍历str1
                ch1 = str1[i - 1];
                //去匹配str2
                for (j = 1; j <= m; j++)
                {
                    ch2 = str2[j - 1];
                    if (ch1 == ch2)
                    {
                        temp = 0;
                    }
                    else
                    {
                        temp = 1;
                    }
                    //左边+1,上边+1, 左上角+temp取最小
                    d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + temp);
                }
            }
            return d[n][m];
        }
        public double similarity(String str1, String str2)
        {
            int ld = levDistance(str1, str2);
            return 1 - (double)ld / Math.Max(str1.Length, str2.Length);
        }

        static void Main(string[] args)
        {
            Similarity s = new Similarity();
            String str11 = "expert";
            String str22 = "except";
            Console.WriteLine("ld=" + s.levDistance(str11, str22));
            Console.WriteLine("sim=" + s.similarity(str11, str22));
        }
    }
}

转载于:https://www.cnblogs.com/xioxu/archive/2009/10/25/1589645.html

namespace ServiceRanking { /// <summary> /// Summary description for TF_IDFLib. /// </summary> public class TFIDFMeasure { private string[] _docs; private string[][] _ngramDoc; private int _numDocs=0; private int _numTerms=0; private ArrayList _terms; private int[][] _termFreq; private float[][] _termWeight; private int[] _maxTermFreq; private int[] _docFreq; public class TermVector { public static float ComputeCosineSimilarity(float[] vector1, float[] vector2) { if (vector1.Length != vector2.Length) throw new Exception("DIFER LENGTH"); float denom=(VectorLength(vector1) * VectorLength(vector2)); if (denom == 0F) return 0F; else return (InnerProduct(vector1, vector2) / denom); } public static float InnerProduct(float[] vector1, float[] vector2) { if (vector1.Length != vector2.Length) throw new Exception("DIFFER LENGTH ARE NOT ALLOWED"); float result=0F; for (int i=0; i < vector1.Length; i++) result += vector1[i] * vector2[i]; return result; } public static float VectorLength(float[] vector) { float sum=0.0F; for (int i=0; i < vector.Length; i++) sum=sum + (vector[i] * vector[i]); return (float)Math.Sqrt(sum); } } private IDictionary _wordsIndex=new Hashtable() ; public TFIDFMeasure(string[] documents) { _docs=documents; _numDocs=documents.Length ; MyInit(); } private void GeneratNgramText() { } private ArrayList GenerateTerms(string[] docs) { ArrayList uniques=new ArrayList() ; _ngramDoc=new string[_numDocs][] ; for (int i=0; i < docs.Length ; i++) { Tokeniser tokenizer=new Tokeniser() ; string[] words=tokenizer.Partition(docs[i]); for (int j=0; j < words.Length ; j++) if (!uniques.Contains(words[j]) ) uniques.Add(words[j]) ; } return uniques; } private static object
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值