UVALive 5794 (UVA 12361) File Retrieval: suffix array + divide-and-conquer DFS

Problem summary:

You are given F (1 <= F <= 60) strings, each of length at most 10^4 and consisting only of lowercase English letters. There is a search device: when you search for a string s, every one of the F given strings that contains s as a substring is retrieved. The question is how many distinct sets the retrieved strings can form.
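Before the suffix-array solution, a tiny brute-force sketch (my own illustration, not part of the submitted code) may help pin down what is being counted: enumerate every substring of every input string as a query, compute the retrieved set as a bitmask, and count the distinct masks. It is only feasible for very small inputs.

#include <cstdio>
#include <string>
#include <vector>
#include <set>
using namespace std;

int main()
{
    vector<string> str = {"ab", "b"};   // tiny example instance
    set<long long> results;
    for(size_t i = 0; i < str.size(); i++)
        for(size_t a = 0; a < str[i].size(); a++)
            for(size_t len = 1; a + len <= str[i].size(); len++)
            {
                string q = str[i].substr(a, len);   // candidate query
                long long mask = 0;                 // bitmask of retrieved strings
                for(size_t j = 0; j < str.size(); j++)
                    if(str[j].find(q) != string::npos)
                        mask |= 1LL << j;
                results.insert(mask);
            }
    // queries "a"/"ab" retrieve {"ab"}, query "b" retrieves {"ab", "b"},
    // so there are two distinct sets and this prints 2
    printf("%d\n", (int)results.size());
    return 0;
}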


Approach:

The natural first step is to concatenate the F strings, separating them with distinct characters that appear nowhere else, and build the suffix array of the result. The height array then lets us look at the suffixes through the suffix tree they implicitly form (the suffix array is only the tool). The key observation: if [L, R] is a maximal interval in which every height value is >= h (that is, the height values just outside the interval are < h), then searching for a prefix of length at most h (but longer than the LCP with the suffixes outside the interval) of any of the suffixes sa[L-1], sa[L], ..., sa[R] retrieves, all at once, exactly the strings these suffixes belong to; on the suffix tree this is the walk from the common branching node down to depth h. So we divide and conquer on the height array: with h = 0 we find all maximal intervals of height > 0 and record the retrieved set for each, then split every interval again, taking as the new h the smallest height value inside it that exceeds the current one, and recurse. The complexity is O(n log n). Each retrieved set is state-compressed into a bitmask and inserted into a std::set; the answer is the number of elements in the set at the end.
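A small hand-worked example (my own, not from the original write-up): take the two strings "ab" and "b". With the code's encoding (a = 1, b = 2, separator 27 written here as '#', sentinel 0), the concatenation is "ab#b" plus the sentinel. The suffix array comes out as sa = [4, 0, 3, 1, 2] with height[1..4] = [0, 0, 1, 0]: the only maximal interval of positive height is the single entry height[3] = 1, covering the suffixes "b" (from string 2) and "b#b" (from string 1), which yields the retrieved set {1, 2}. The singleton check described next adds the set {1} (only "ab" can appear alone, since "b" is a substring of "ab"), so the answer for this instance is 2.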

Starting the divide and conquer from h = 0 does miss one case: a string retrieved on its own, i.e. a result set of size 1. The height array settles this too: string b occurs inside another string exactly when some suffix adjacent (in suffix-array order) to the suffix that is the whole of b has an LCP with it equal to b's full length. A string that is a substring of another input string can never be retrieved alone; every other string can. With that, all cases are covered.
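For small inputs this "can appear alone" condition is easy to cross-check naively. The sketch below is only an illustration (canAppearAlone is a hypothetical helper, not part of the solution that follows), reusing the headers from the brute-force sketch above:

// str[i] can form a singleton result set iff it occurs in no other input string
bool canAppearAlone(const vector<string>& str, size_t i)
{
    for(size_t j = 0; j < str.size(); j++)
        if(j != i && str[j].find(str[i]) != string::npos)
            return false;   // str[i] is a substring of str[j], so never alone
    return true;
}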


The code:

Result  :  Accepted     Memory  :  ? KB     Time  :  812 ms (UVALive), 1502 ms (UVA)

/*
 * Author: Gatevin
 * Created Time:  2015/3/12 16:14:14
 * File Name: Kotori_Itsuka.cpp
 */
#include<iostream>
#include<sstream>
#include<fstream>
#include<vector>
#include<list>
#include<deque>
#include<queue>
#include<stack>
#include<map>
#include<set>
#include<bitset>
#include<algorithm>
#include<cstdio>
#include<cstdlib>
#include<cstring>
#include<cctype>
#include<cmath>
#include<ctime>
#include<iomanip>
using namespace std;
const double eps(1e-8);
typedef long long lint;
#define maxn 600100

int wa[maxn], wb[maxn], wv[maxn], Ws[maxn];
char in[maxn];
int F, N, s[maxn], sa[maxn], belong[maxn], rnk[maxn], height[maxn], L[66];
set <lint> S;
bool single[66];

// cmp: during the doubling step, tests whether suffixes a and b have
// identical rank pairs (rank of their first l characters and of the next l)
int cmp(int *r, int a, int b, int l)
{
    return r[a] == r[b] && r[a + l] == r[b + l];
}

// da: suffix-array construction by doubling with radix sort; r[0..n-1] is the
// input (r[n-1] must be a unique smallest sentinel 0), m is the alphabet
// size, and the result is written into sa[]
void da(int *r, int *sa, int n, int m)
{
    int *x = wa, *y = wb, *t, i, j, p;
    for(i = 0; i < m; i++) Ws[i] = 0;
    for(i = 0; i < n; i++) Ws[x[i] = r[i]]++;
    for(i = 1; i < m; i++) Ws[i] += Ws[i - 1];
    for(i = n - 1; i >= 0; i--) sa[--Ws[x[i]]] = i;
    for(j = 1, p = 1; p < n; j *= 2, m = p)
    {
        for(p = 0, i = n - j; i < n; i++) y[p++] = i;
        for(i = 0; i < n; i++) if(sa[i] >= j) y[p++] = sa[i] - j;
        for(i = 0; i < n; i++) wv[i] = x[y[i]];
        for(i = 0; i < m; i++) Ws[i] = 0;
        for(i = 0; i < n; i++) Ws[wv[i]]++;
        for(i = 1; i < m; i++) Ws[i] += Ws[i - 1];
        for(i = n - 1; i >= 0; i--) sa[--Ws[wv[i]]] = y[i];
        for(t = x, x = y, y = t, p = 1, x[sa[0]] = 0, i = 1; i < n; i++)
            x[sa[i]] = cmp(y, sa[i - 1], sa[i], j) ? p - 1 : p++;
    }
    return;
}

// calheight: computes rnk[] (the inverse of sa) and the height array, where
// height[i] = LCP(suffix sa[i-1], suffix sa[i]), in O(n) using the standard
// fact that height[rnk[i]] >= height[rnk[i-1]] - 1
void calheight(int *r, int *sa, int n)
{
    int i, j, k = 0;
    for(i = 1; i <= n; i++) rnk[sa[i]] = i;
    for(i = 0; i < n; height[rnk[i++]] = k)
        for(k ? k-- : 0, j = sa[rnk[i] - 1]; r[i + k] == r[j + k]; k++);
    return;
}

/*
 * dfs(L, R, h) handles a maximal run of height values with height[i] >= h
 * for L <= i <= R and collects every retrievable set inside it.
 * Searching a prefix of length up to h of any suffix in [L, R] retrieves all
 * of the strings these suffixes belong to at once, so that set is inserted
 * into S; the run is then split further, recursing with the smallest height
 * value inside it that exceeds the current h. This is the divide-and-conquer
 * step. The initial split uses h = 0, and entries with height[i] = 0 act only
 * as split points, so singleton sets are never produced here; the
 * single-string possibilities are therefore counted before dfs is called.
 */
void dfs(int L, int R, int h)
{
    int i = L;
    while(i <= R)
    {
        while(i <= R && height[i] == h) i++;
        if(i > R) break;
        int j = i;
        lint pos = 1LL << belong[sa[i - 1]];
        int pre = height[i];
        while(j <= R && height[j] > h)
            pos |= (1LL << belong[sa[j]]), pre = min(pre, height[j]), j++;
        S.insert(pos);
        dfs(i, j - 1, pre);
        i = j;
    }
    return;
}

// solve: marks every string that is a substring of some other input string
// (those can never be retrieved alone), inserts a singleton set for each of
// the remaining strings, then runs dfs over the whole height array and
// prints the number of distinct retrieved sets
void solve()
{
    S.clear();
    memset(single, 0, sizeof(single));
    for(int i = 1; i <= N; i++)//a string can appear alone iff it is not a substring of any other input string
    {
        if(belong[sa[i - 1]] != -1 && L[belong[sa[i - 1]]] == height[i]) single[belong[sa[i - 1]]] = 1;
        if(belong[sa[i]] != -1 && L[belong[sa[i]]] == height[i]) single[belong[sa[i]]] = 1;
    }
    for(int i = 1; i <= F; i++)
        if(!single[i]) S.insert(1LL << i);
    dfs(1, N, 0);
    printf("%d\n", (int)S.size());
    return;
}

// main: letters are mapped to 1..26 and string i is followed by the unique
// separator 26 + i, so suffixes of different strings never share a prefix
// past a separator; the final separator is replaced by the sentinel 0
// required by da(), and the alphabet size passed to da is 90 > 26 + 60
int main()
{
    while(scanf("%d", &F), F)
    {
        N = 0;
        memset(belong, 0, sizeof(belong));
        for(int i = 1; i <= F; i++)
        {
            scanf("%s", in);
            int tlen = strlen(in);
            L[i] = tlen;
            for(int j = 0; j < tlen; j++)
            {
                belong[N] = i;
                s[N++] = in[j] - 'a' + 1;
            }
            belong[N] = -1;
            s[N++] = 26 + i;
        }
        N--;
        s[N] = 0;
        da(s, sa, N + 1, 90);
        calheight(s, sa, N);
        solve();
    }
    return 0;
}
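A quick way to sanity-check the program (assuming the standard UVA 12361 input format that the code reads: a count F followed by F strings, with F = 0 terminating the input): feeding it

2
ab
b
0

should print 2, matching the hand-worked example above.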

