187 Repeated DNA Sequences
链接:https://leetcode.com/problems/repeated-dna-sequences/
All DNA is composed of a series of nucleotides abbreviated as A, C, G, and T, for example: “ACGAATTCCG”. When studying DNA, it is sometimes useful to identify repeated sequences within the DNA.
Write a function to find all the 10-letter-long sequences (substrings) that occur more than once in a DNA molecule.
For example,
Given s = "AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT",
Return:
["AAAAACCCCC", "CCCCCAAAAA"].
Hide Tags Hash Table Bit Manipulation。
这个问题的解决思路:把A、C、G、T映射成0、1、2、3,这样用两个bit(00、01、10、11)就可以区分它们。一串10个字符的DNA就可以用20位来表达,那么0到2^20-1就可以表达所有的10个字符的DNA串。由于要返回出现2次及以上的DNA串,可以建立一个能表示0、1、2三种状态的hash table,出现2次及2次以上都用2表示,这最少需要2位来表达,即hash table中00表示没出现,01表示出现一次,10表示出现2次及2次以上。
所以用hash table 的大小是1024*1024/4字节;
/// Solves "Repeated DNA Sequences": reports every 10-letter substring that
/// occurs more than once in a string made of the letters A, C, G and T.
class Solution {
public:
    /// Packs the first 10 characters at `a` into a 20-bit integer, two bits
    /// per character (A=0, C=1, G=2, T=3), first character in the highest bits.
    /// Any other character contributes 0, exactly like 'A'.
    /// @param a pointer to at least 10 readable characters.
    /// @return the packed key, in the range [0, 2^20).
    int encode(const char *a)
    {
        int result = 0;
        for (int i = 0; i < 10; i++)
        {
            result <<= 2;
            switch (a[i])
            {
                case 'C': result |= 1; break;
                case 'G': result |= 2; break;
                case 'T': result |= 3; break;
                default: break;  // 'A' and unknown characters map to 0
            }
        }
        return result;
    }

    /// Returns each 10-letter substring of `s` that appears at least twice.
    /// Every repeated substring is reported once, in the order in which its
    /// second occurrence is encountered while scanning left to right.
    /// @param s the DNA string to scan.
    /// @return the repeated substrings; empty when `s` has fewer than 11 characters.
    std::vector<std::string> findRepeatedDnaSequences(std::string s)
    {
        std::vector<std::string> result;
        const int length = static_cast<int>(s.size());
        if (length < 11) return result;

        // 2^20 possible keys, 2 bits of state each (0 = unseen, 1 = seen once,
        // 2 = seen twice or more), i.e. four keys per byte. The vector owns
        // the storage, so it is released on every exit path.
        std::vector<unsigned char> table(1024 * 1024 / 4, 0);

        // Encode straight from the string's buffer instead of building a
        // temporary substring for every window.
        const char *text = s.c_str();
        for (int i = 0; i <= length - 10; i++)
        {
            const int key = encode(text + i);
            const int index = key / 4;
            const int offset = 2 * (key % 4);
            const int seen = (table[index] >> offset) & 3;
            if (seen == 0)
            {
                table[index] = static_cast<unsigned char>(table[index] | (1 << offset));
            }
            else if (seen == 1)
            {
                // Second occurrence: bump the state to 2 so that later
                // occurrences are not reported again.
                table[index] = static_cast<unsigned char>(table[index] + (1 << offset));
                result.push_back(s.substr(i, 10));
            }
        }
        return result;
    }
};
以上代码提交是可以AC的,但是同样的算法在C语言中提交就通不过,原因是内存超标,但是我认为C中消耗的内存是一定小于C++的。顺便把代码放这里,等想到更好的解决方案的时候再来这里改代码。
/*
 * Packs the first 10 characters at `a` into a 20-bit integer.
 * Each character takes two bits (A=0, C=1, G=2, T=3); the first character
 * ends up in the most significant pair. Any other character counts as 0.
 */
int encode(const char *a)
{
    int packed = 0;
    int pos;
    for (pos = 0; pos < 10; pos++)
    {
        int digit = 0;
        if (a[pos] == 'C')
            digit = 1;
        else if (a[pos] == 'G')
            digit = 2;
        else if (a[pos] == 'T')
            digit = 3;
        /* Make room for the next pair, then drop the digit into it. */
        packed = (packed << 2) | digit;
    }
    return packed;
}
/*
 * Expands a packed key back into its 10-letter DNA string.
 * The key is read two bits at a time from the least significant end
 * (0=A, 1=C, 2=G, 3=T), filling the string from its last character to its
 * first.
 *
 * Returns a newly malloc'd, NUL-terminated 11-byte buffer that the caller
 * must free(), or NULL if the allocation fails.
 */
char *decode(int key)
{
    static const char letters[4] = { 'A', 'C', 'G', 'T' };
    char *result = (char *)malloc(11);
    int i;

    /* Do not write through a failed allocation. */
    if (NULL == result)
        return NULL;

    result[10] = '\0';
    for (i = 9; 0 <= i; i--)
    {
        result[i] = letters[key & 3];
        key = key >> 2;
    }
    return result;
}
/*
 * Finds every 10-letter substring of `s` that occurs more than once.
 *
 * s          - NUL-terminated DNA string (letters A, C, G, T); may be NULL.
 * returnSize - out parameter; always overwritten with the number of strings
 *              in the returned array (0 when NULL is returned).
 *
 * Returns a malloc'd array of malloc'd strings, ordered by ascending encoded
 * key. The caller must free each string and then the array. Returns NULL when
 * `s` is NULL, when `s` has fewer than 11 characters, when nothing repeats,
 * or when an allocation fails.
 */
char** findRepeatedDnaSequences(char* s, int* returnSize)
{
    /* 2^20 possible keys, 2 bits of state per key, four keys per byte. */
    enum { KEY_COUNT = 1024 * 1024, TABLE_BYTES = KEY_COUNT / 4 };
    unsigned char *table;
    char **result;
    int length = 0;
    int repeated = 0;
    int found = 0;
    int key, index, offset, state, i;

    /* The caller's value is not a valid starting count: reset it before
     * anything else so that every exit path reports a defined size. */
    *returnSize = 0;

    if (NULL == s)
        return NULL;
    while (s[length] != '\0')
        length++;
    if (length < 11)
        return NULL;

    /* Allocate only after the cheap rejections, so those paths cannot leak. */
    table = (unsigned char *)malloc(TABLE_BYTES);
    if (NULL == table)
        return NULL;
    memset(table, 0, TABLE_BYTES);

    /* Pass 1: per-key state 0 = unseen, 1 = seen once, 2 = seen twice or more. */
    for (i = 0; i <= length - 10; i++)
    {
        key = encode(s + i);
        index = key / 4;
        offset = 2 * (key % 4);
        state = (table[index] >> offset) & 3;
        if (state == 0)
        {
            table[index] |= (1 << offset);
        }
        else if (state == 1)
        {
            table[index] += (1 << offset);
            repeated++;
        }
    }

    if (repeated == 0)
    {
        free(table);
        return NULL;
    }

    result = (char **)malloc(repeated * sizeof(char *));
    if (NULL == result)
    {
        free(table);
        return NULL;
    }

    /* Pass 2: walk the key space once, bounded by KEY_COUNT, and decode every
     * key whose state reached 2. */
    for (key = 0; key < KEY_COUNT && found < repeated; key++)
    {
        index = key / 4;
        offset = 2 * (key % 4);
        if (((table[index] >> offset) & 3) < 2)
            continue;
        result[found] = decode(key);
        if (NULL == result[found])
        {
            /* Decoding failed: release everything built so far. */
            while (found > 0)
                free(result[--found]);
            free(result);
            free(table);
            return NULL;
        }
        found++;
    }

    free(table);
    *returnSize = found;
    return result;
}