187 Repeated DNA Sequences-CSDN博客

本文介绍了一种高效查找DNA分子中长度为10的重复子序列的方法。通过将DNA字符映射为二进制数，利用哈希表记录并找出所有出现次数超过一次的序列。

187 Repeated DNA Sequences

链接：https://leetcode.com/problems/repeated-dna-sequences/
All DNA is composed of a series of nucleotides abbreviated as A, C, G, and T, for example: “ACGAATTCCG”. When studying DNA, it is sometimes useful to identify repeated sequences within the DNA.

Write a function to find all the 10-letter-long sequences (substrings) that occur more than once in a DNA molecule.

For example,

Given s = "AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT",

Return:
["AAAAACCCCC", "CCCCCAAAAA"].

Hide Tags Hash Table Bit Manipulation。

这个问题的解决思路是：把A、C、G、T分别映射成0、1、2、3，这样用两个bit（00、01、10、11）就可以区分它们。一串10个字符的DNA可以用20位来表达，因此0到2^20-1之间的整数就可以表示所有长度为10的DNA串。由于要返回出现2次及以上的DNA串，可以建立一个能表示0、1、2三种状态的hash table：出现2次及2次以上统一用2表示，这至少需要2位，即hash table中00表示没出现，01表示出现一次，10表示出现2次及2次以上。
所以hash table的大小是2^20×2位＝1024*1024/4字节。

/// LeetCode 187: finds every 10-letter DNA substring that occurs more than once.
///
/// Each nucleotide is packed into 2 bits (A=0, C=1, G=2, T=3), so a 10-letter
/// window becomes a 20-bit key. A table with one 2-bit saturating counter per
/// possible key (2^20 counters = 256 KiB) records whether a key has been seen
/// zero times (00), once (01), or at least twice (10).
class Solution {
public:
    /// Packs the first 10 characters at `a` into a 20-bit key, most significant
    /// pair first. Any character other than A/C/G/T is encoded like 'A' (0).
    /// The caller must guarantee that at least 10 characters are readable.
    int encode(const char *a)
    {
        int key = 0;
        for (int i = 0; i < kWindow; ++i)
            key = (key << 2) | nucleotideCode(a[i]);
        return key;
    }

    /// Returns each repeated 10-letter substring of `s` exactly once, in the
    /// order in which its second occurrence is encountered.
    std::vector<std::string> findRepeatedDnaSequences(std::string s)
    {
        std::vector<std::string> result;
        const int length = static_cast<int>(s.size());
        // Two distinct windows need at least 11 characters.
        if (length <= kWindow) return result;

        // The vector owns the counter table, so it is released on every exit
        // path (the previous malloc'd buffer was never freed).
        std::vector<unsigned char> table(kTableBytes, 0);

        int key = 0;
        for (int i = 0; i < length; ++i)
        {
            // Rolling key: shift in the new nucleotide and drop the one that
            // left the window, instead of re-encoding a fresh substring.
            key = ((key << 2) | nucleotideCode(s[i])) & kKeyMask;
            if (i < kWindow - 1) continue;   // first full window ends at index 9

            const int index = key / 4;          // four 2-bit counters per byte
            const int offset = 2 * (key % 4);
            const int seen = (table[index] >> offset) & 3;
            if (seen == 0)
            {
                table[index] |= (1 << offset);  // 00 -> 01
            }
            else if (seen == 1)
            {
                table[index] += (1 << offset);  // 01 -> 10, report only once
                result.push_back(s.substr(i - kWindow + 1, kWindow));
            }
        }
        return result;
    }

private:
    enum
    {
        kWindow = 10,                       // letters per sequence
        kKeyBits = 2 * kWindow,             // 2 bits per letter
        kKeyMask = (1 << kKeyBits) - 1,
        kTableBytes = (1 << kKeyBits) / 4   // 2 bits per key
    };

    /// Maps a nucleotide letter to its 2-bit code; unknown letters map to 0.
    static int nucleotideCode(char c)
    {
        switch (c)
        {
        case 'C': return 1;
        case 'G': return 2;
        case 'T': return 3;
        default:  return 0;
        }
    }
};

以上代码提交后可以AC。但是同样的算法用C语言提交却通不过，原因是内存超标；而我认为C版本消耗的内存应该不会比C++版本多。顺便把代码放在这里，等想到更好的解决方案的时候再回来修改。

/*
 * Packs the 10 characters starting at `a` into a 20-bit integer, two bits per
 * character with the first character in the most significant pair:
 * 'A' -> 0, 'C' -> 1, 'G' -> 2, 'T' -> 3. Any other character contributes 0.
 * Exactly 10 characters are read, so the caller must make sure they exist.
 */
int encode(const char *a)
{
    const char *stop = a + 10;
    int packed = 0;

    while (a != stop)
    {
        int code = 0;
        if (*a == 'C')
            code = 1;
        else if (*a == 'G')
            code = 2;
        else if (*a == 'T')
            code = 3;

        /* Make room for the next pair, then append it. */
        packed = (packed << 2) + code;
        ++a;
    }
    return packed;
}
/*
 * Expands a 20-bit key back into its 10-letter DNA string, reading two bits
 * per letter from the least significant pair (last letter) upwards:
 * 0 -> 'A', 1 -> 'C', 2 -> 'G', 3 -> 'T'.
 *
 * Returns a newly malloc'd, NUL-terminated 11-byte buffer that the caller must
 * free(), or NULL if the allocation fails.
 */
char *decode(int key)
{
    static const char letters[] = "ACGT";
    int i;
    char *result = (char *)malloc(11);

    if (NULL == result)
        return NULL;

    result[10] = '\0';
    for (i = 9; i >= 0; i--)
    {
        /* key & 3 is always 0..3, so a direct table lookup covers every case. */
        result[i] = letters[key & 3];
        key = key >> 2;
    }
    return result;
}
/*
 * Finds every 10-letter substring of `s` that occurs more than once.
 *
 * Each window is turned into a 20-bit key with encode(); a table holding one
 * 2-bit saturating counter per possible key (2^20 counters = 256 KiB) records
 * whether the key was seen zero times (00), once (01) or at least twice (10).
 * The table is then scanned in increasing key order and every key marked
 * "at least twice" is converted back to text with decode().
 *
 * s          - NUL-terminated DNA string; may be NULL.
 * returnSize - receives the number of strings returned. It is always written
 *              (0 when nothing is returned); must not be NULL.
 *
 * Returns a malloc'd array of *returnSize malloc'd strings, ordered by key.
 * The caller frees each string and then the array. Returns NULL (with
 * *returnSize == 0) when `s` is NULL, has no repeated sequence, or an
 * allocation fails.
 */
char** findRepeatedDnaSequences(char* s, int* returnSize)
{
    enum { WINDOW = 10, KEY_COUNT = 1024 * 1024, TABLE_BYTES = KEY_COUNT / 4 };
    unsigned char *table;
    char **result;
    int length, key, index, offset, seen, i;
    int repeated = 0;
    int found = 0;

    if (NULL == returnSize)
        return NULL;
    /* Start from zero; counting on top of the caller's uninitialised value
       made the result allocation and the table scan use a garbage count. */
    *returnSize = 0;

    if (NULL == s)
        return NULL;

    length = (int)strlen(s);
    /* Two distinct windows need at least 11 characters. */
    if (length <= WINDOW)
        return NULL;

    /* Allocate only after the early exits so those paths cannot leak it. */
    table = (unsigned char *)calloc(TABLE_BYTES, 1);
    if (NULL == table)
        return NULL;

    for (i = 0; i + WINDOW <= length; i++)
    {
        key = encode(s + i);
        index = key / 4;            /* four 2-bit counters per byte */
        offset = 2 * (key % 4);
        seen = (table[index] >> offset) & 3;
        if (seen == 0)
        {
            table[index] |= (1 << offset);   /* 00 -> 01 */
        }
        else if (seen == 1)
        {
            table[index] += (1 << offset);   /* 01 -> 10, counted only once */
            repeated++;
        }
    }

    if (repeated == 0)
    {
        free(table);
        return NULL;
    }

    result = (char **)malloc((size_t)repeated * sizeof(char *));
    if (NULL == result)
    {
        free(table);
        return NULL;
    }

    /* Bounded scan: never walks past the last key, whatever the count is. */
    for (key = 0; key < KEY_COUNT && found < repeated; key++)
    {
        if (((table[key / 4] >> (2 * (key % 4))) & 3) < 2)
            continue;

        result[found] = decode(key);
        if (NULL == result[found])
        {
            /* Undo the partial result so nothing is leaked on failure. */
            while (found > 0)
                free(result[--found]);
            free(result);
            free(table);
            return NULL;
        }
        found++;
    }

    free(table);
    *returnSize = found;
    return result;
}