要用好后缀数组, 要先理解其中几个数组的含义: $sa[i]$ 表示字典序排名第 $i$ 的后缀的起始下标(排名按字典序从小到大, 依次是 $1 \sim len(string)$); $rank[i]$ 表示起始下标为 $i$ 的后缀的字典序排名; $height[i]$ 表示后缀 $sa[i]$ 和后缀 $sa[i-1]$ 的最长公共前缀的长度.
一个性质: $height[rank[i]] \ge height[rank[i-1]] - 1$, 所以按下标顺序依次计算时, 每次只需从上一次的结果减一开始继续匹配.
倍增法求出这些数组, 记得在原串的末尾增加一个0:
// Scratch buffers used by da(): t1 and t2 back its two key arrays (x and y),
// c is the bucket-count array of its counting sort.
int t1[maxn],t2[maxn],c[maxn];
// Returns true when the key pairs (r[a], r[a+l]) and (r[b], r[b+l]) are equal.
// The second element of each pair is only read when the first elements match.
bool cmp(int *r,int a,int b,int l)
{
    if (r[a] != r[b]) return false;
    return r[a + l] == r[b + l];
}
void da(int str[],int sa[],int rank[],int height[],int n,int m)
{
n++;
int i, j, p, *x = t1, *y = t2;
//第一轮基数排序,如果s的最大值很大,可改为快速排序
for(i = 0; i < m; i++)c[i] = 0;
for(i = 0; i < n; i++)c[x[i] = str[i]]++;
for(i = 1; i < m; i++)c[i] += c[i-1];
for(i = n-1; i >= 0; i--)sa[--c[x[i]]] = i;
for(j = 1; j <= n; j <<= 1)
{
p = 0;
//直接利用sa数组排序第二关键字
for(i = n-j; i < n; i++)y[p++] = i;//后面的j个数第二关键字为空的最小
for(i = 0; i < n; i++)if(sa[i] >= j)y[p++] = sa[i] - j;
//这样数组y保存的就是按照第二关键字排序的结果
//基数排序第一关键字
for(i = 0; i < m; i++)c[i] = 0;
for(i = 0; i < n; i++)c[x[y[i]]]++;
for(i = 1; i < m; i++)c[i] += c[i-1];
for(i = n-1; i >= 0; i--)sa[--c[x[y[i]]]] = y[i];
//根据sa和x数组计算新的x数组
swap(x,y);
p = 1;
x[sa[0]] = 0;
for(i = 1; i < n; i++)
x[sa[i]] = cmp(y,sa[i-1],sa[i],j)?p-1:p++;
if(p >= n)break;
m = p;//下次基数排序的最大值
}
int k = 0;
n--;
for(i = 0; i <= n; i++)rank[sa[i]] = i;
for(i = 0; i < n; i++)
{
if(k) k--;
j = sa[rank[i]-1];
while(str[i+k] == str[j+k])
k++;
height[rank[i]] = k;
}
}
POJ 1743 最长不重叠相同子串
题意: 求两个最长的不重叠子串, 满足两个串对应下标的差值相等.
对于相邻的两个数直接做差得到一个新串, 直接对新串求最长不重叠子串. 非常经典的做法, 二分长度, 然后对height数组分组, 满足这个长度的分成一组, 然后判断组中下标最大最小之差.
数据很水~
#include <iostream>
#include <cstdio>
#include <cstring>
#include <queue>
#include <cmath>
#include <string>
#include <vector>
#include <algorithm>
#include <map>
#include <set>
#define maxn 20005
using namespace std;
// Scratch buffers used by da(): t1 and t2 back its two key arrays (x and y),
// c is the bucket-count array of its counting sort.
int t1[maxn],t2[maxn],c[maxn];
// Returns true when the key pairs (r[a], r[a+l]) and (r[b], r[b+l]) are equal.
// The second element of each pair is only read when the first elements match.
bool cmp(int *r,int a,int b,int l)
{
    if (r[a] != r[b]) return false;
    return r[a + l] == r[b + l];
}
void da(int str[],int sa[],int rank[],int height[],int n,int m)
{
n++;
int i, j, p, *x = t1, *y = t2;
//第一轮基数排序,如果s的最大值很大,可改为快速排序
for(i = 0; i < m; i++)c[i] = 0;
for(i = 0; i < n; i++)c[x[i] = str[i]]++;
for(i = 1; i < m; i++)c[i] += c[i-1];
for(i = n-1; i >= 0; i--)sa[--c[x[i]]] = i;
for(j = 1; j <= n; j <<= 1)
{
p = 0;
//直接利用sa数组排序第二关键字
for(i = n-j; i < n; i++)y[p++] = i;//后面的j个数第二关键字为空的最小
for(i = 0; i < n; i++)if(sa[i] >= j)y[p++] = sa[i] - j;
//这样数组y保存的就是按照第二关键字排序的结果
//基数排序第一关键字
for(i = 0; i < m; i++)c[i] = 0;
for(i = 0; i < n; i++)c[x[y[i]]]++;
for(i = 1; i < m; i++)c[i] += c[i-1];
for(i = n-1; i >= 0; i--)sa[--c[x[y[i]]]] = y[i];
//根据sa和x数组计算新的x数组
swap(x,y);
p = 1;
x[sa[0]] = 0;
for(i = 1; i < n; i++)
x[sa[i]] = cmp(y,sa[i-1],sa[i],j)?p-1:p++;
if(p >= n)break;
m = p;//下次基数排序的最大值
}
int k = 0;
n--;
for(i = 0; i <= n; i++)rank[sa[i]] = i;
for(i = 0; i < n; i++)
{
if(k) k--;
j = sa[rank[i]-1];
while(str[i+k] == str[j+k])
k++;
height[rank[i]] = k;
}
}
// Arrays filled by da() for the current test case.
int rank[maxn], height[maxn];
// Input notes; main overwrites them with the shifted adjacent differences.
int str[maxn];
// Suffix array produced by da().
int sa[maxn];
// Sequence length; main decrements it once the differences are built.
int n;
// Initial value for the running minimum in ok().
#define INF 111111
// Decides whether the difference sequence contains two occurrences of a
// common substring of length x that correspond to non-overlapping themes.
//
// Suffixes are scanned in sorted order (sa[1..n]) and split into groups of
// consecutive entries whose height is at least x; within a group every pair
// of suffixes shares a prefix of length >= x. Min/Max track the smallest and
// largest start index of the current group.
//
// A substring of x differences stands for x+1 notes, so two themes starting
// at p < q are disjoint only when q - p >= x + 1. The comparison therefore has
// to be strict (Max - Min > x); ">=" would accept themes sharing one note.
bool ok (int x) {
    int Min = INF, Max = 0;
    for (int i = 1; i <= n; i++) {
        if (height[i] >= x) {
            // Same group: widen the range of start positions.
            Min = min (Min, sa[i]);
            Max = max (Max, sa[i]);
        }
        else {
            // Group boundary: test the finished group, then start a new one.
            if (Max-Min > x)
                return true;
            Max = sa[i];
            Min = sa[i];
        }
    }
    // The last group is still open when the scan ends.
    return Max-Min > x;
}
// Binary-searches the largest x in [0, n/2] that ok() accepts and returns it.
// lo only ever moves to a value ok() accepted; hi is checked once more at the
// end because the loop stops while hi itself is still untested.
int solve () {
    int lo = 0;
    int hi = n / 2;
    for (; hi - lo > 1; ) {
        const int mid = (hi + lo) >> 1;
        if (ok (mid)) {
            lo = mid;
        } else {
            hi = mid;
        }
    }
    if (ok (hi)) return hi;
    return lo;
}
int main(){
while (scanf ("%d", &n) == 1 && n) {
for (int i = 0; i < n; i++) {
scanf ("%d", &str[i]);
}
if (n <= 9) {
printf ("0\n");
continue;
}
n--;
for (int i = 0; i < n; i++) {
str[i] = 100+str[i+1]-str[i];
}
str[n] = 0;
da(str, sa, rank, height, n+1, 188);
int ans = solve ()+1;
printf ("%d\n", (ans >= 5 ? ans : 0));
}
return 0;
}
POJ 3261 求重复k次的最长子串
还是二分答案, 按照 height 分组, 判断是否存在后缀个数不少于 k 的组.
数据还是很水, 不加离散化都能过~
#include <iostream>
#include <cstdio>
#include <cstring>
#include <queue>
#include <cmath>
#include <string>
#include <vector>
#include <algorithm>
#include <map>
#include <set>
#define maxn 200005
using namespace std;
// Scratch buffers used by da(): t1 and t2 back its two key arrays (x and y),
// c is the bucket-count array of its counting sort.
int t1[maxn],t2[maxn],c[maxn];
// Returns true when the key pairs (r[a], r[a+l]) and (r[b], r[b+l]) are equal.
// The second element of each pair is only read when the first elements match.
bool cmp(int *r,int a,int b,int l)
{
    if (r[a] != r[b]) return false;
    return r[a + l] == r[b + l];
}
void da(int str[],int sa[],int rank[],int height[],int n,int m)
{
n++;
int i, j, p, *x = t1, *y = t2;
//第一轮基数排序,如果s的最大值很大,可改为快速排序
for(i = 0; i < m; i++)c[i] = 0;
for(i = 0; i < n; i++)c[x[i] = str[i]]++;
for(i = 1; i < m; i++)c[i] += c[i-1];
for(i = n-1; i >= 0; i--)sa[--c[x[i]]] = i;
for(j = 1; j <= n; j <<= 1)
{
p = 0;
//直接利用sa数组排序第二关键字
for(i = n-j; i < n; i++)y[p++] = i;//后面的j个数第二关键字为空的最小
for(i = 0; i < n; i++)if(sa[i] >= j)y[p++] = sa[i] - j;
//这样数组y保存的就是按照第二关键字排序的结果
//基数排序第一关键字
for(i = 0; i < m; i++)c[i] = 0;
for(i = 0; i < n; i++)c[x[y[i]]]++;
for(i = 1; i < m; i++)c[i] += c[i-1];
for(i = n-1; i >= 0; i--)sa[--c[x[y[i]]]] = y[i];
//根据sa和x数组计算新的x数组
swap(x,y);
p = 1;
x[sa[0]] = 0;
for(i = 1; i < n; i++)
x[sa[i]] = cmp(y,sa[i-1],sa[i],j)?p-1:p++;
if(p >= n)break;
m = p;//下次基数排序的最大值
}
int k = 0;
n--;
for(i = 0; i <= n; i++)rank[sa[i]] = i;
for(i = 0; i < n; i++)
{
if(k) k--;
j = sa[rank[i]-1];
while(str[i+k] == str[j+k])
k++;
height[rank[i]] = k;
}
}
// Arrays filled by da() for the current test case.
int rank[maxn], height[maxn];
// Input sequence; lisanhua() rewrites it in place before da() is called.
int str[maxn];
// Suffix array produced by da().
int sa[maxn];
// n: sequence length; k: the group-size threshold tested in ok().
int n, k;
// Reports whether some run of consecutive sorted suffixes, all linked by
// height values of at least x, contains k or more suffixes.
// `run` is the number of suffixes in the run ending at position i; it is only
// compared against k right after it has grown.
bool ok (int x) {
    int run = 1;
    for (int i = 2; i <= n; i++) {
        if (height[i] < x) {
            run = 1;          // the chain is broken: start a new run
            continue;
        }
        if (++run >= k)
            return true;
    }
    return false;
}
// Binary-searches the largest x in [0, n] that ok() accepts and returns it.
// hi is tested once more at the end because the loop stops while hi itself is
// still untested.
int solve () {
    int lo = 0;
    int hi = n;
    for (; hi - lo > 1; ) {
        const int mid = (lo + hi) >> 1;
        if (ok (mid)) {
            lo = mid;
        } else {
            hi = mid;
        }
    }
    if (ok (hi)) return hi;
    return lo;
}
// cnt: number of distinct values; num: sorted copy of the input;
// gg: the distinct values in increasing order.
int cnt, num[maxn], gg[maxn];
// Coordinate-compresses str[0..n-1] in place: each value is replaced by its
// 1-based position among the distinct values, so equal inputs stay equal,
// order is preserved, and 0 stays free for the terminator.
// Returns cnt+1, an exclusive upper bound on the compressed values.
int lisanhua () {
    cnt = 0;
    // Sort a copy of the VALUES. Copying the index i here instead would make
    // every value >= n collapse to the same code.
    for (int i = 0; i < n; i++) num[i] = str[i];
    sort (num, num+n);
    // Keep one representative of every distinct value.
    for (int i = 0; i < n; i++) if (!i || num[i] != num[i-1])
        gg[cnt++] = num[i];
    // Every str[i] is present in gg, so lower_bound finds it exactly.
    for (int i = 0; i < n; i++)
        str[i] = lower_bound (gg, gg+cnt, str[i])-gg+1;
    return cnt+1;
}
int main(){
while (cin >> n >> k) {
int Max = 0;
for (int i = 0; i < n; i++) {
cin >> str[i];
Max = max (Max, str[i]);
}
str[n] = 0;
int m = lisanhua ();
da (str, sa, rank, height, n, m+2);
int ans = solve ();
cout << ans << endl;
}
return 0;
}
/*
2 2
1 1
*/
SPOJ 694 不重复子串个数
根据 $sa$ 数组和 $height$ 数组的含义, 后缀 $sa[i]$ 总共有 $n - sa[i]$ 个前缀, 其中有 $height[i]$ 个前缀和排在它之前的后缀的前缀重复, 所以要减去. 最后答案是 $\sum_{i=1}^{n} \left(n - sa[i] - height[i]\right)$.
#include <iostream>
#include <cstdio>
#include <cstring>
#include <queue>
#include <cmath>
#include <string>
#include <vector>
#include <algorithm>
#include <map>
#include <set>
#define maxn 200005
using namespace std;
// Scratch buffers used by da(): t1 and t2 back its two key arrays (x and y),
// c is the bucket-count array of its counting sort.
int t1[maxn],t2[maxn],c[maxn];
// Returns true when the key pairs (r[a], r[a+l]) and (r[b], r[b+l]) are equal.
// The second element of each pair is only read when the first elements match.
bool cmp(int *r,int a,int b,int l)
{
    if (r[a] != r[b]) return false;
    return r[a + l] == r[b + l];
}
void da(int str[],int sa[],int rank[],int height[],int n,int m)
{
n++;
int i, j, p, *x = t1, *y = t2;
//第一轮基数排序,如果s的最大值很大,可改为快速排序
for(i = 0; i < m; i++)c[i] = 0;
for(i = 0; i < n; i++)c[x[i] = str[i]]++;
for(i = 1; i < m; i++)c[i] += c[i-1];
for(i = n-1; i >= 0; i--)sa[--c[x[i]]] = i;
for(j = 1; j <= n; j <<= 1)
{
p = 0;
//直接利用sa数组排序第二关键字
for(i = n-j; i < n; i++)y[p++] = i;//后面的j个数第二关键字为空的最小
for(i = 0; i < n; i++)if(sa[i] >= j)y[p++] = sa[i] - j;
//这样数组y保存的就是按照第二关键字排序的结果
//基数排序第一关键字
for(i = 0; i < m; i++)c[i] = 0;
for(i = 0; i < n; i++)c[x[y[i]]]++;
for(i = 1; i < m; i++)c[i] += c[i-1];
for(i = n-1; i >= 0; i--)sa[--c[x[y[i]]]] = y[i];
//根据sa和x数组计算新的x数组
swap(x,y);
p = 1;
x[sa[0]] = 0;
for(i = 1; i < n; i++)
x[sa[i]] = cmp(y,sa[i-1],sa[i],j)?p-1:p++;
if(p >= n)break;
m = p;//下次基数排序的最大值
}
int k = 0;
n--;
for(i = 0; i <= n; i++)rank[sa[i]] = i;
for(i = 0; i < n; i++)
{
if(k) k--;
j = sa[rank[i]-1];
while(str[i+k] == str[j+k])
k++;
height[rank[i]] = k;
}
}
// Arrays filled by da() for the current test case.
int rank[maxn], height[maxn];
// Integer copy of the input string, as passed to da().
int str[maxn];
// Raw input string for the current test case.
char s[maxn];
// Suffix array produced by da().
int sa[maxn];
// n: length of s. k is not read by main below.
int n, k;
int main(){
ios::sync_with_stdio(0);
int t;
cin >> t;
while (t--) {
cin >> s;
n = strlen (s);
for (int i = 0; i < n; i++) str[i] = s[i];
str[n] = 0;
da (str, sa, rank, height, n, 233);
long long ans = 0;
for (int i = 1; i <= n; i++) {
ans += n-sa[i]-height[i];
}
cout << ans << endl;
}
return 0;
}
POJ 2774 最长公共子串
把第二个串放到第一个串的后面, 中间用一个失配符隔开, 然后遍历height数组维护最大子串长度. 要避免出现在同一串中的公共子串.
#include <iostream>
#include <cstdio>
#include <cstring>
#include <queue>
#include <cmath>
#include <string>
#include <vector>
#include <algorithm>
#include <map>
#include <set>
#define maxn 200005
using namespace std;
// Scratch buffers used by da(): t1 and t2 back its two key arrays (x and y),
// c is the bucket-count array of its counting sort.
int t1[maxn],t2[maxn],c[maxn];
// Returns true when the key pairs (r[a], r[a+l]) and (r[b], r[b+l]) are equal.
// The second element of each pair is only read when the first elements match.
bool cmp(int *r,int a,int b,int l)
{
    if (r[a] != r[b]) return false;
    return r[a + l] == r[b + l];
}
void da(int str[],int sa[],int rank[],int height[],int n,int m)
{
n++;
int i, j, p, *x = t1, *y = t2;
//第一轮基数排序,如果s的最大值很大,可改为快速排序
for(i = 0; i < m; i++)c[i] = 0;
for(i = 0; i < n; i++)c[x[i] = str[i]]++;
for(i = 1; i < m; i++)c[i] += c[i-1];
for(i = n-1; i >= 0; i--)sa[--c[x[i]]] = i;
for(j = 1; j <= n; j <<= 1)
{
p = 0;
//直接利用sa数组排序第二关键字
for(i = n-j; i < n; i++)y[p++] = i;//后面的j个数第二关键字为空的最小
for(i = 0; i < n; i++)if(sa[i] >= j)y[p++] = sa[i] - j;
//这样数组y保存的就是按照第二关键字排序的结果
//基数排序第一关键字
for(i = 0; i < m; i++)c[i] = 0;
for(i = 0; i < n; i++)c[x[y[i]]]++;
for(i = 1; i < m; i++)c[i] += c[i-1];
for(i = n-1; i >= 0; i--)sa[--c[x[y[i]]]] = y[i];
//根据sa和x数组计算新的x数组
swap(x,y);
p = 1;
x[sa[0]] = 0;
for(i = 1; i < n; i++)
x[sa[i]] = cmp(y,sa[i-1],sa[i],j)?p-1:p++;
if(p >= n)break;
m = p;//下次基数排序的最大值
}
int k = 0;
n--;
for(i = 0; i <= n; i++)rank[sa[i]] = i;
for(i = 0; i < n; i++)
{
if(k) k--;
j = sa[rank[i]-1];
while(str[i+k] == str[j+k])
k++;
height[rank[i]] = k;
}
}
// Arrays filled by da() for the current test case.
int rank[maxn], height[maxn];
// Concatenation built by main: s1, a separator, s2, then the terminator.
int str[maxn];
// The two raw input strings.
char s1[maxn], s2[maxn];
// Suffix array produced by da().
int sa[maxn];
// n, m: lengths of s1 and s2; len: length of the concatenation (n + m + 1).
int n, m, len;
// Returns true when the two start positions lie on opposite sides of the
// separator stored at index n: the smaller one before it, the larger after it.
bool legal (int i, int j) {
    const int lo = (i < j) ? i : j;
    const int hi = (i < j) ? j : i;
    return lo < n && hi > n;
}
// Scans adjacent pairs of sorted suffixes and prints the largest height value
// whose two suffixes start on opposite sides of the separator (see legal()).
void solve () {
    int best = 0;
    for (int i = 2; i <= len; i++) {
        if (height[i] < best) continue;
        if (!legal (sa[i], sa[i-1])) continue;
        best = height[i];
    }
    cout << best << endl;
}
int main(){
ios::sync_with_stdio(0);
while (cin >> s1 >> s2) {
n = strlen (s1), m = strlen (s2);
for (int i = 0; i < n; i++) str[i] = s1[i];
str[n] = 1;
for (int i = n+1; i <= n+m; i++) str[i] = s2[i-n-1];
len = n+m+1;
str[len] = 0;
da (str, sa, rank, height, len, 233);
solve ();
}
return 0;
}