/Files/rocketfan/editdistance_readme.pdf
先看一个例子:两个字符串 eeba 和 abca 的相似度是多少?编辑距离(edit distance)是一个很好的度量:把字符串 a 变换为字符串 b 所需的最少操作步数(每一步可以插入、删除或更改一个字符),就定义为这两个字符串之间的编辑距离。
对于eeba,abca它们之间的编辑距离为3,可以按照上面的操作步骤(不是唯一的)将eeba变到abca,1.将e变为a 2.删除e 3.添加c 共3个步骤。
典型的动态规划问题。
EDIT[i,j] 表示字符串 a 从 1 到 i 的子串与字符串 b 从 1 到 j 的子串之间的编辑距离(字符串下标从 1 开始)。
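状态转移方程为(边界条件:$EDIT[i,0] = i$,$EDIT[0,j] = j$):
$$EDIT[i,j] = \min\bigl(EDIT[i-1,j] + 1,\ EDIT[i,j-1] + 1,\ EDIT[i-1,j-1] + f(a[i], b[j])\bigr)$$
其中 $EDIT[i-1,j] + 1$ 表示在 a 的 i 位置执行删除(delete)操作;
$EDIT[i,j-1] + 1$ 表示插入(insert)操作;
$EDIT[i-1,j-1] + f(a[i], b[j])$ 表示保持不变或更改(modify)操作,这里当 $a[i] = b[j]$ 时 $f(a[i], b[j]) = 0$,否则 $f(a[i], b[j]) = 1$。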
如果需要记录编辑过程如第一幅图所示,需要用二维数组记录下动态规划过程的路径信息,即记录下前一步骤的位置索引信息。
如下图
//edit_distance.h
1
/*
*
2 * \file edit_distance.h
3 * \author pku_goldenlock
4 * \date 2009-8-10
5 */
6
7
8 #ifndef _EDIT_DISTANCE_H
9 #define _EDIT_DISTANCE_H
10 #include < string >
11 using std:: string ;
12
/**
 * Static helpers for computing the edit distance between two strings with
 * dynamic programming, optionally recovering one optimal alignment.
 */
class EditDistanceHelp {
private:
    /**
     * One cell of the dynamic-programming table used by CalcPath.
     */
    struct ArrayData {
        int dist;   /**< Minimum edit distance up to this cell. */
        int pre_x;  /**< First index (position in s1) of the predecessor cell. */
        int pre_y;  /**< Second index (position in s2) of the predecessor cell. */
    };

public:
    /**
     * Compute the minimum edit distance and record one best alignment.
     *
     * @param s1,s2   Input strings.
     * @param rs1,rs2 Receive the aligned forms of s1 and s2; characters are
     *                appended, with a gap marker where one side has no
     *                counterpart.
     * @return The edit distance between s1 and s2.
     */
    static int CalcPath(const std::string& s1, const std::string& s2,
                        std::string& rs1, std::string& rs2);

    /**
     * Compute the minimum edit distance only; no path information is kept.
     *
     * @param s1,s2 Input strings.
     * @return The edit distance between s1 and s2.
     */
    static int EditDistance(const std::string& s1, const std::string& s2);

private:
    /**
     * Assign all three members of one table cell.
     */
    static void SetArrayData(ArrayData& a, int dist, int pre_x, int pre_y);

    /**
     * Follow the stored path back from (index_x, index_y) and append the
     * resulting alignment to rs1 and rs2.
     *
     * @param array   Table holding the distance and predecessor of each cell.
     * @param index_x Current position, first index.
     * @param index_y Current position, second index.
     * @param s1,s2   The two input strings whose alignment is reconstructed.
     * @param rs1,rs2 Receive the result.
     */
    static void StoreResult(ArrayData** array, int index_x, int index_y,
                            const std::string& s1, const std::string& s2,
                            std::string& rs1, std::string& rs2);
};
55
56
57 #endif // end of define _EDIT_DISTANCE_H
2 * \file edit_distance.h
3 * \author pku_goldenlock
4 * \date 2009-8-10
5 */
6
7
8 #ifndef _EDIT_DISTANCE_H
9 #define _EDIT_DISTANCE_H
10 #include < string >
11 using std:: string ;
12
/**
 * Static helpers for computing the edit distance between two strings with
 * dynamic programming, optionally recovering one optimal alignment.
 */
class EditDistanceHelp {
private:
    /**
     * One cell of the dynamic-programming table used by CalcPath.
     */
    struct ArrayData {
        int dist;   /**< Minimum edit distance up to this cell. */
        int pre_x;  /**< First index (position in s1) of the predecessor cell. */
        int pre_y;  /**< Second index (position in s2) of the predecessor cell. */
    };

public:
    /**
     * Compute the minimum edit distance and record one best alignment.
     *
     * @param s1,s2   Input strings.
     * @param rs1,rs2 Receive the aligned forms of s1 and s2; characters are
     *                appended, with a gap marker where one side has no
     *                counterpart.
     * @return The edit distance between s1 and s2.
     */
    static int CalcPath(const std::string& s1, const std::string& s2,
                        std::string& rs1, std::string& rs2);

    /**
     * Compute the minimum edit distance only; no path information is kept.
     *
     * @param s1,s2 Input strings.
     * @return The edit distance between s1 and s2.
     */
    static int EditDistance(const std::string& s1, const std::string& s2);

private:
    /**
     * Assign all three members of one table cell.
     */
    static void SetArrayData(ArrayData& a, int dist, int pre_x, int pre_y);

    /**
     * Follow the stored path back from (index_x, index_y) and append the
     * resulting alignment to rs1 and rs2.
     *
     * @param array   Table holding the distance and predecessor of each cell.
     * @param index_x Current position, first index.
     * @param index_y Current position, second index.
     * @param s1,s2   The two input strings whose alignment is reconstructed.
     * @param rs1,rs2 Receive the result.
     */
    static void StoreResult(ArrayData** array, int index_x, int index_y,
                            const std::string& s1, const std::string& s2,
                            std::string& rs1, std::string& rs2);
};
55
56
57 #endif // end of define _EDIT_DISTANCE_H
//edit_distance.cc
#include "edit_distance.h"

#include <algorithm>
#include <iomanip>
#include <iostream>
#include <vector>

using namespace std;
5 /* *
6 * find the min edit distance and return the edit distance
7 * will sotre the best path info in string rs1, rs2
8 * s1, s2 is the user given string for caculating the edit distance
9 */
10 int EditDistanceHelp::CalcPath( const string & s1, const string & s2, string & rs1, string & rs2)
11 {
12 // first find min dist and store path info
13 int len1 = s1.length();
14 int len2 = s2.length();
15
16 // allocate space for array
17 ArrayData ** array;
18 array = new ArrayData * [len1 + 1 ];
19 for ( int i = 0 ; i <= len1; i ++ )
20 array[i] = new ArrayData[len2 + 1 ];
21
22 // kernal for finding the best path and store path info to array
23 for ( int i = 0 ; i <= len1; i ++ )
24 SetArrayData(array[i][ 0 ], i, i - 1 , 0 );
25 for ( int j = 0 ; j <= len2; j ++ )
26 SetArrayData(array[ 0 ][j], j, 0 , j - 1 );
27 int min_dist;
28 for ( int i = 1 ; i <= len1; i ++ )
29 for ( int j = 1 ; j <= len2; j ++ ) {
30 if (array[i - 1 ][j].dist < array[i][j - 1 ].dist) // can also be <=
31 SetArrayData(array[i][j], array[i - 1 ][j].dist + 1 , i - 1 , j);
32 else
33 SetArrayData(array[i][j], array[i][j - 1 ].dist + 1 , i, j - 1 );
34 min_dist = array[i - 1 ][j - 1 ].dist + (s1[i - 1 ] != s2[j - 1 ]);
35 if (min_dist < array[i][j].dist) // < is OK but <= make modify high priority
36 SetArrayData(array[i][j], min_dist, i - 1 , j - 1 );
37 }
38
39 // store the best path result to two result string rs1 and rs2
40 StoreResult(array, len1, len2, s1, s2, rs1, rs2);
41 min_dist = array[len1][len2].dist;
42
43 // print array
44 for ( int i = 0 ; i <= len1; i ++ ) {
45 for ( int j = 0 ; j <= len2; j ++ ) {
46 cout << " ( " << array[i][j].pre_x << " , " << setw( 2 ) << array[i][j].pre_y << " ) " ;
47 }
48 cout << endl;
49 }
50 // free resources of array
51 for ( int i = 0 ; i <= len1; i ++ )
52 delete array[i];
53 delete array;
54
55 // return min edit distance
56 return min_dist;
57 }
58
59 /* *
60 * find the min edit distance only do not need path info
61 */
62 int EditDistanceHelp::EditDistance( const string & s1, const string & s2)
63 {
64 using std::min;
65 int len1 = s1.length();
66 int len2 = s2.length();
67 int array[len1 + 1 ][len2 + 1 ];
68 for ( int i = 0 ; i <= len1; i ++ )
69 array[i][ 0 ] = i;
70 for ( int j = 1 ; j <= len2; j ++ )
71 array[ 0 ][j] = j;
72 for ( int i = 1 ; i <= len1; i ++ )
73 for ( int j = 1 ; j <= len2; j ++ )
74 array[i][j] = min(min(array[i - 1 ][j] + 1 , array[i][j - 1 ] + 1 ),
75 array[i - 1 ][j - 1 ] + (s1[i - 1 ] != s2[j - 1 ]));
76 return array[len1][len2];
77 }
78
79 /* *
80 * Set all data members value for one array element
81 */
82 void EditDistanceHelp::SetArrayData(ArrayData & a, int dist, int pre_x, int pre_y)
83 {
84 a.dist = dist;
85 a.pre_x = pre_x;
86 a.pre_y = pre_y;
87 }
88
89 /* *
90 * Based on the path info stored in array ,find the best path and store result to string rs1 and rs2
91 */
92 void EditDistanceHelp::StoreResult(ArrayData ** array, int index_x, int index_y,
93 const string & s1, const string & s2,
94 string & rs1, string & rs2)
95 {
96 if (index_x == 0 && index_y == 0 )
97 return ;
98
99 if ((array[index_x][index_y].pre_x < index_x) && (array[index_x][index_y].pre_y < index_y)) {
100 StoreResult(array, index_x - 1 , index_y - 1 , s1, s2, rs1, rs2);
101 rs1 += s1[index_x - 1 ];
102 rs2 += s2[index_y - 1 ];
103 } else if (array[index_x][index_y].pre_x < index_x) {
104 StoreResult(array, index_x - 1 , index_y, s1, s2, rs1, rs2);
105 rs1 += s1[index_x - 1 ];
106 rs2 += ' - ' ;
107 } else {
108 StoreResult(array, index_x, index_y - 1 , s1, s2, rs1, rs2);
109 rs1 += ' - ' ;
110 rs2 += s2[index_y - 1 ];
111 }
112 }
113
2 #include < iostream >
3 #include < iomanip >
4 using namespace std;
5 /* *
6 * find the min edit distance and return the edit distance
7 * will sotre the best path info in string rs1, rs2
8 * s1, s2 is the user given string for caculating the edit distance
9 */
10 int EditDistanceHelp::CalcPath( const string & s1, const string & s2, string & rs1, string & rs2)
11 {
12 // first find min dist and store path info
13 int len1 = s1.length();
14 int len2 = s2.length();
15
16 // allocate space for array
17 ArrayData ** array;
18 array = new ArrayData * [len1 + 1 ];
19 for ( int i = 0 ; i <= len1; i ++ )
20 array[i] = new ArrayData[len2 + 1 ];
21
22 // kernal for finding the best path and store path info to array
23 for ( int i = 0 ; i <= len1; i ++ )
24 SetArrayData(array[i][ 0 ], i, i - 1 , 0 );
25 for ( int j = 0 ; j <= len2; j ++ )
26 SetArrayData(array[ 0 ][j], j, 0 , j - 1 );
27 int min_dist;
28 for ( int i = 1 ; i <= len1; i ++ )
29 for ( int j = 1 ; j <= len2; j ++ ) {
30 if (array[i - 1 ][j].dist < array[i][j - 1 ].dist) // can also be <=
31 SetArrayData(array[i][j], array[i - 1 ][j].dist + 1 , i - 1 , j);
32 else
33 SetArrayData(array[i][j], array[i][j - 1 ].dist + 1 , i, j - 1 );
34 min_dist = array[i - 1 ][j - 1 ].dist + (s1[i - 1 ] != s2[j - 1 ]);
35 if (min_dist < array[i][j].dist) // < is OK but <= make modify high priority
36 SetArrayData(array[i][j], min_dist, i - 1 , j - 1 );
37 }
38
39 // store the best path result to two result string rs1 and rs2
40 StoreResult(array, len1, len2, s1, s2, rs1, rs2);
41 min_dist = array[len1][len2].dist;
42
43 // print array
44 for ( int i = 0 ; i <= len1; i ++ ) {
45 for ( int j = 0 ; j <= len2; j ++ ) {
46 cout << " ( " << array[i][j].pre_x << " , " << setw( 2 ) << array[i][j].pre_y << " ) " ;
47 }
48 cout << endl;
49 }
50 // free resources of array
51 for ( int i = 0 ; i <= len1; i ++ )
52 delete array[i];
53 delete array;
54
55 // return min edit distance
56 return min_dist;
57 }
58
59 /* *
60 * find the min edit distance only do not need path info
61 */
62 int EditDistanceHelp::EditDistance( const string & s1, const string & s2)
63 {
64 using std::min;
65 int len1 = s1.length();
66 int len2 = s2.length();
67 int array[len1 + 1 ][len2 + 1 ];
68 for ( int i = 0 ; i <= len1; i ++ )
69 array[i][ 0 ] = i;
70 for ( int j = 1 ; j <= len2; j ++ )
71 array[ 0 ][j] = j;
72 for ( int i = 1 ; i <= len1; i ++ )
73 for ( int j = 1 ; j <= len2; j ++ )
74 array[i][j] = min(min(array[i - 1 ][j] + 1 , array[i][j - 1 ] + 1 ),
75 array[i - 1 ][j - 1 ] + (s1[i - 1 ] != s2[j - 1 ]));
76 return array[len1][len2];
77 }
78
79 /* *
80 * Set all data members value for one array element
81 */
82 void EditDistanceHelp::SetArrayData(ArrayData & a, int dist, int pre_x, int pre_y)
83 {
84 a.dist = dist;
85 a.pre_x = pre_x;
86 a.pre_y = pre_y;
87 }
88
89 /* *
90 * Based on the path info stored in array ,find the best path and store result to string rs1 and rs2
91 */
92 void EditDistanceHelp::StoreResult(ArrayData ** array, int index_x, int index_y,
93 const string & s1, const string & s2,
94 string & rs1, string & rs2)
95 {
96 if (index_x == 0 && index_y == 0 )
97 return ;
98
99 if ((array[index_x][index_y].pre_x < index_x) && (array[index_x][index_y].pre_y < index_y)) {
100 StoreResult(array, index_x - 1 , index_y - 1 , s1, s2, rs1, rs2);
101 rs1 += s1[index_x - 1 ];
102 rs2 += s2[index_y - 1 ];
103 } else if (array[index_x][index_y].pre_x < index_x) {
104 StoreResult(array, index_x - 1 , index_y, s1, s2, rs1, rs2);
105 rs1 += s1[index_x - 1 ];
106 rs2 += ' - ' ;
107 } else {
108 StoreResult(array, index_x, index_y - 1 , s1, s2, rs1, rs2);
109 rs1 += ' - ' ;
110 rs2 += s2[index_y - 1 ];
111 }
112 }
113