最长公共子串LCS--之动态规划

最新推荐文章于 2021-02-21 12:17:45 发布

Qyee16

最新推荐文章于 2021-02-21 12:17:45 发布

阅读量922

点赞数

分类专栏：算法 LCS

本文链接：https://blog.youkuaiyun.com/Qyee16/article/details/7773656

版权

算法 LCS 专栏收录该内容

0 篇文章

订阅专栏

最长公共子串（Longest Common Substring），即两个字符串中最长的、由连续字符组成的公共子串。

描述：

给定两个字符串X=<x1, x2, … , xm>， Y=<y1, y2, ... , yn>, 假设公共子串Z=<z1, z2, ... zk> 满足：

如果xm = yn，那么zk = xm = yn而且Zk-1是Xm-1和Yn-1的一个LCS
如果xm ≠ yn，那么zk ≠ xm → Z是Xm-1和Y的一个LCS
如果xm ≠ yn，那么zk ≠ yn → Z是X和Yn-1的一个LCS

上面说明X，Y的LCS子串具有最优子结构。递推公式如下：设c[i,j]为最长子串长度

递推公式

过程：

1、把两个字符串一个横排放置，一个纵排放置。如字符串shortStr=“acebb”，longStr=“acttacebd”，此时形成了一个矩阵arr【】【】。

2、当longStr【i】== shortStr【j】时，我们就可以在矩阵中相应的位置arr【i】【j】置1，如图：

矩阵中对应位置置1

看这上图，我们想要求最长子串，我们要做的：

第一步：遍历长、短字符串，然后在矩阵数组中相同位置置1

第二步：然后数斜线长度，最长即为结果。

代码如下：

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <string>

using namespace std;

/*
 * Finds the longest run of consecutive characters that occurs in both
 * shortStr and longStr.  The result is returned as a NUL-terminated string
 * allocated with malloc; the function itself never frees it.
 */
char* LCS(const char* shortStr, const char* longStr);

/*
 * Demo entry point: computes the longest common substring of two sample
 * strings, prints it, and waits for a key press before exiting.
 *
 * Returns 0 on success and 1 if LCS could not produce a result.
 */
int _tmain(int argc, _TCHAR* argv[])
{
	const char* shortStr = "acettab";
	const char* longStr  = "acettabfaceabcccctaf";

	char* result = LCS(shortStr, longStr);
	if (result == NULL)
	{
		/* Passing NULL to "%s" is undefined behaviour, so bail out first. */
		fprintf(stderr, "LCS failed: out of memory\n");
		return 1;
	}

	printf("longest common substring is: %s", result);
	free(result);	/* the buffer returned by LCS comes from malloc */

	getchar();  // keeps the console window open when run from VS2008/VS2010
	return 0;
}

/*
 * LCS - longest common substring of two strings, full-table dynamic programming.
 *
 * arr[i][j] is the length of the common run that ends at longStr[i] and
 * shortStr[j]: 0 when the two characters differ, otherwise one more than the
 * diagonal neighbour arr[i-1][j-1] (or 1 on the first row/column).  The
 * largest cell is the length of the answer, and its row tells where the
 * answer ends inside longStr.
 *
 * shortStr, longStr: non-NULL, NUL-terminated strings (enforced by assert).
 *                    Despite the names, either one may be the longer string.
 *
 * Returns a malloc'ed, NUL-terminated copy of the substring; the caller must
 * free() it.  The result is "" when the strings share no character, and NULL
 * when memory cannot be allocated.  When several substrings share the maximum
 * length, the one that ends earliest in longStr is returned.
 *
 * Time O(m*n), space O(m*n).
 */
char* LCS(const char* shortStr, const char* longStr)
{
	assert(shortStr && longStr);

	const size_t slen = strlen(shortStr);
	const size_t llen = strlen(longStr);
	size_t maxlen = 0;	/* length of the longest run seen so far */
	size_t end = 0;		/* index in longStr one past the end of that run */

	/*
	 * An empty input cannot match anything; skipping the table in that case
	 * also avoids zero-sized allocations, whose result is implementation-defined.
	 */
	if (slen > 0 && llen > 0)
	{
		int complete = 1;
		size_t i, j;

		/* calloc zero-fills: every row pointer starts NULL, every cell starts 0. */
		unsigned **arr = (unsigned **) calloc(llen, sizeof *arr);
		if (arr == NULL)
		{
			return NULL;
		}

		for (i = 0; i < llen; ++i)
		{
			arr[i] = (unsigned *) calloc(slen, sizeof **arr);
			if (arr[i] == NULL)
			{
				complete = 0;
				break;
			}
		}

		for (i = 0; complete && i < llen; ++i)
		{
			for (j = 0; j < slen; ++j)
			{
				if (shortStr[j] != longStr[i])
				{
					continue;	/* cell keeps its initial 0 */
				}

				/* Extend the run that ended on the previous diagonal cell. */
				arr[i][j] = (i == 0 || j == 0) ? 1 : arr[i - 1][j - 1] + 1;

				/* Strict '>' keeps the first maximum encountered. */
				if (arr[i][j] > maxlen)
				{
					maxlen = arr[i][j];
					end = i + 1;
				}
			}
		}

		/*
		 * The table has llen rows (not slen).  Rows that were never
		 * allocated are still NULL, and free(NULL) is a no-op.
		 */
		for (i = 0; i < llen; ++i)
		{
			free(arr[i]);
		}
		free(arr);

		if (!complete)
		{
			return NULL;
		}
	}

	char *lcsResult = (char *) malloc(maxlen + 1);
	if (lcsResult == NULL)
	{
		return NULL;
	}

	/* The answer is the maxlen characters of longStr that end just before 'end'. */
	memcpy(lcsResult, longStr + (end - maxlen), maxlen);
	lcsResult[maxlen] = '\0';
	return lcsResult;
}

时间复杂度：O（m*n），空间复杂度O（m*n）

图2：计数递增的置数

改进：

正如上面代码所示，使用了二维矩阵记录每个相等的位置和子串的长度，空间复杂度为O(m*n)。从递推公式可知，计算C【i,j】时只用到上一行的值（在上面的代码中即C【i-1，j-1】），因此我们只需保留上一行C【i-1】和当前行C【i】这两行即可。

代码如下：

#include <string>
#include <assert.h>

using namespace std;

/*
 * Finds the longest run of consecutive characters that occurs in both
 * shortStr and longStr.  The result is returned as a NUL-terminated string
 * allocated with malloc; the function itself never frees it.
 */
char* LCS(const char* shortStr, const char* longStr);

/*
 * Demo entry point: computes the longest common substring of two sample
 * strings, prints it, and waits for a key press before exiting.
 *
 * Returns 0 on success and 1 if LCS could not produce a result.
 */
int _tmain(int argc, _TCHAR* argv[])
{
	const char* shortStr = "a2cett4ab";
	const char* longStr  = "aceabfacett3abcccctaf";

	char* result = LCS(shortStr, longStr);
	if (result == NULL)
	{
		/* Passing NULL to "%s" is undefined behaviour, so bail out first. */
		fprintf(stderr, "LCS failed: out of memory\n");
		return 1;
	}

	printf("longest common substring is: %s", result);
	free(result);	/* the buffer returned by LCS comes from malloc */

	getchar();
	return 0;
}

/*
 * LCS - longest common substring of two strings, two-row dynamic programming.
 *
 * Only two rows of the table are kept: 'prev' holds the run lengths for
 * longStr[i-1] and 'curr' is filled for longStr[i].  curr[j] is the length of
 * the common run ending at longStr[i] and shortStr[j]: 0 when the characters
 * differ, otherwise prev[j-1] + 1 (or 1 in the first column).  Every row,
 * including the first, goes through the same loop so that a match at
 * longStr[0] is counted as well.
 *
 * shortStr, longStr: non-NULL, NUL-terminated strings (enforced by assert).
 *                    Despite the names, either one may be the longer string.
 *
 * Returns a malloc'ed, NUL-terminated copy of the substring; the caller must
 * free() it.  The result is "" when the strings share no character, and NULL
 * when memory cannot be allocated.  When several substrings share the maximum
 * length, the one that ends earliest in longStr is returned.
 *
 * Side effect: prints one line of run lengths to stdout for every character
 * of longStr, as a trace of the table being built.
 *
 * Time O(m*n), space O(n) where n = strlen(shortStr).
 */
char* LCS(const char* shortStr, const char* longStr)
{
	assert(shortStr && longStr);

	const size_t slen = strlen(shortStr);
	const size_t llen = strlen(longStr);
	const size_t cells = slen ? slen : 1;	/* never request a zero-sized block */
	size_t maxlen = 0;	/* length of the longest run seen so far */
	size_t end = 0;		/* index in longStr one past the end of that run */
	size_t i, j;

	/* calloc zero-fills, so 'prev' acts as an all-zero row above row 0. */
	unsigned *prev = (unsigned *) calloc(cells, sizeof *prev);
	unsigned *curr = (unsigned *) calloc(cells, sizeof *curr);
	if (prev == NULL || curr == NULL)
	{
		free(prev);
		free(curr);
		return NULL;
	}

	for (i = 0; i < llen; ++i)
	{
		for (j = 0; j < slen; ++j)
		{
			if (shortStr[j] != longStr[i])
			{
				curr[j] = 0;	/* the row buffer is reused: clear stale values */
			}
			else if (j == 0)
			{
				curr[j] = 1;
			}
			else
			{
				curr[j] = prev[j - 1] + 1;
			}

			/* Strict '>' keeps the first maximum encountered. */
			if (curr[j] > maxlen)
			{
				maxlen = curr[j];
				end = i + 1;
			}
		}

		/* Trace output: the row that was just computed. */
		for (j = 0; j < slen; ++j)
		{
			printf(" %u", curr[j]);
		}
		printf("\n");

		/* The finished row becomes 'prev'; swapping pointers avoids copying it. */
		unsigned *finished = curr;
		curr = prev;
		prev = finished;
	}

	free(prev);
	free(curr);

	char *lcsResult = (char *) malloc(maxlen + 1);
	if (lcsResult == NULL)
	{
		return NULL;
	}

	/* The answer is the maxlen characters of longStr that end just before 'end'. */
	memcpy(lcsResult, longStr + (end - maxlen), maxlen);
	lcsResult[maxlen] = '\0';
	return lcsResult;
}

时间复杂度： O（m*n），空间复杂度： O（2*n）

学习思考：在使用动态规划求最长公共子串时，需要较大的内存开销，即O（m*n），时间复杂度也是O（m*n）；从递推公式中我们发现可以把空间复杂度降低到O（2*n），这是一个很大的改进。那时间复杂度有没有更好的优化呢？广义后缀树可以把时间复杂度降到O（m+n）。以此为契机，学习后缀树。