向量空间模型文档相似度计算实现(C#)

本文介绍了一种基于向量空间模型(VSM)的文本相似度计算方法,并提供了具体的C#实现代码。该方法通过统计文档的词频并利用余弦相似度公式来计算两篇文档之间的相似度。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

读者可以根据自己的需要进行加壳或改写,本文权当抛砖引玉。

笔者加壳后的完整程序可从以下地址下载:

http://download.youkuaiyun.com/source/1143450

VSM模型介绍:

http://blog.youkuaiyun.com/Felomeng/archive/2009/03/25/4024078.aspx

using System;

using System.Collections.Generic;

using System.Linq;

using System.Text;

using System.Text.RegularExpressions;

namespace Felomeng.VSMSimilarity

{

/// <summary>
/// Vector-space-model document similarity: builds a term-frequency vector for
/// each document and returns the cosine of the angle between the two vectors.
/// </summary>
class SVMModle
{
    /// <summary>
    /// Matches maximal runs of characters in the range U+4E00..U+9FA5.
    /// Every run is treated as one word; any other character is a separator.
    /// Built once because the pattern never changes.
    /// </summary>
    private static readonly Regex WordPattern = new Regex(@"[\u4e00-\u9fa5]+");

    /// <summary>
    /// Dimension-reduction word list.
    /// NOTE(review): this list is stored by the constructor but no method in this
    /// class reads it, so it currently has no effect on the computed similarity.
    /// </summary>
    private List<string> reducingKeys = new List<string>();

    /// <summary>
    /// Creates a model with a dimension-reduction word list.
    /// </summary>
    /// <param name="reducingKeys">Dimension-reduction word list; null is treated as an empty list.</param>
    public SVMModle(List<string> reducingKeys)
    {
        this.reducingKeys = reducingKeys ?? new List<string>();
    }

    /// <summary>
    /// Creates a model without a dimension-reduction word list.
    /// </summary>
    public SVMModle()
    {
    }

    /// <summary>
    /// Computes the cosine similarity of two pre-segmented documents.
    /// </summary>
    /// <param name="text1">Document 1, already segmented; words are separated by non-Chinese characters.</param>
    /// <param name="text2">Document 2, already segmented; words are separated by non-Chinese characters.</param>
    /// <returns>The similarity of the two documents; 0.0 when either document contains no Chinese word.</returns>
    /// <exception cref="ArgumentNullException">Either document is null.</exception>
    public double Similarity(string text1, string text2)
    {
        return CosineSimilarity(GetDictionary(text1), GetDictionary(text2));
    }

    /// <summary>
    /// Computes the cosine similarity of two term-frequency dictionaries.
    /// The dictionaries passed in are only read, never modified.
    /// </summary>
    /// <param name="text1">Term-frequency dictionary of the first document.</param>
    /// <param name="text2">Term-frequency dictionary of the second document.</param>
    /// <returns>
    /// The similarity of the two documents; 0.0 when either dictionary is empty
    /// or when either frequency vector has zero length.
    /// </returns>
    /// <exception cref="ArgumentNullException">Either dictionary is null.</exception>
    public double Similarity(Dictionary<string, int> text1, Dictionary<string, int> text2)
    {
        if (text1 == null)
        {
            throw new ArgumentNullException("text1");
        }

        if (text2 == null)
        {
            throw new ArgumentNullException("text2");
        }

        return CosineSimilarity(text1, text2);
    }

    /// <summary>
    /// Counts how many times each word occurs in a pre-segmented document.
    /// </summary>
    /// <param name="text">Segmented document; words are separated by non-Chinese characters.</param>
    /// <returns>A dictionary mapping each word to its number of occurrences.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
    public Dictionary<string, int> GetDictionary(string text)
    {
        if (text == null)
        {
            throw new ArgumentNullException("text");
        }

        Dictionary<string, int> dictionary = new Dictionary<string, int>();
        foreach (Match word in WordPattern.Matches(text))
        {
            int count;
            // TryGetValue leaves count at 0 for a word not seen before,
            // so the same assignment handles both the first and later occurrences.
            dictionary.TryGetValue(word.Value, out count);
            dictionary[word.Value] = count + 1;
        }

        return dictionary;
    }

    /// <summary>
    /// Shared cosine computation used by both Similarity overloads:
    /// dot(a, b) / sqrt(|a|^2 * |b|^2).
    /// </summary>
    private static double CosineSimilarity(Dictionary<string, int> frequencies1, Dictionary<string, int> frequencies2)
    {
        if ((frequencies1.Count < 1) || (frequencies2.Count < 1))
        {
            return 0.0;
        }

        double numerator = 0.0, squaredNorm1 = 0.0, squaredNorm2 = 0.0;

        foreach (KeyValuePair<string, int> entry in frequencies1)
        {
            // Widen to double before multiplying so large counts cannot overflow int.
            double count1 = entry.Value;
            squaredNorm1 += count1 * count1;

            int count2;
            if (frequencies2.TryGetValue(entry.Key, out count2))
            {
                // Only words present in both documents contribute to the dot product.
                numerator += count1 * count2;
            }
        }

        foreach (int value in frequencies2.Values)
        {
            double count2 = value;
            squaredNorm2 += count2 * count2;
        }

        double denominator = Math.Sqrt(squaredNorm1 * squaredNorm2);
        if (denominator == 0.0)
        {
            // A vector made only of zero counts has no direction; report "not similar"
            // instead of the NaN that 0/0 would produce.
            return 0.0;
        }

        return numerator / denominator;
    }
}

}

还有很多可以优化的地方,大家多加思考。如果能够得到适当优化的话,速度还能提高很多。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值