All DNA is composed of a series of nucleotides abbreviated as A, C, G, and T, for example: “ACGAATTCCG”. When studying DNA, it is sometimes useful to identify repeated sequences within the DNA.
Write a function to find all the 10-letter-long sequences (substrings) that occur more than once in a DNA molecule.
For example,
Given s = "AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT",
Return:
["AAAAACCCCC", "CCCCCAAAAA"].
先献出自己的菜鸡想法,利用一个map存储每10个子元素组成的字串,并++,最后在map中找出出现次数大于1的字串;
vector<string> findRepeatedDnaSequences(string s) {
if (s.size() < 10)return vector<string>{};
unordered_map<string, int> temp;
vector<string> res;
int len = s.size();
for (int i = 0; i <= len - 10; i++){
temp[s.substr(i, 10)]++;
}
for (auto x : temp){
if (x.second > 1)res.push_back(x.first);
}
return res;
}
写完之后自己都感觉的弱鸡的不行,这样带来的空间复杂度可能会很大,但还好时间复杂度依旧是O(N),不过还是不理想;
思路2:LeetCode上参考的代码,主要运用了位操作,其实也没有什么太过于神奇的地方,但是空间复杂度明显下降。
int char2val(char c){
switch (c){
case 'A':return 0;
case 'C':return 1;
case 'G':return 2;
case 'T':return 3;
default:return 0;
}
}
vector<string> findRepeatedDnaSequences(string s) {
if (s.size() < 10)return vector<string>{};
vector<string> res;
bitset<1 << 20> s1, s2;
int mask = (1 << 20) - 1;
int val = 0;
for (int i = 0; i < 10; i++){
val = (val << 2) | char2val(s[i]);
}
s1.set(val);
for (int i = 10; i < s.size(); i++){
val = ((val << 2) & mask) | char2val(s[i]);
if (s2[val])continue;
if (s1[val]){
res.push_back(s.substr(i - 10 + 1, 10));
s2.set(val);
}
else{
s1.set(val);
}
}
return res;
}