From: https://leetcode.com/problems/repeated-dna-sequences/
All DNA is composed of a series of nucleotides abbreviated as A, C, G, and T, for example: "ACGAATTCCG". When studying DNA, it is sometimes useful to identify repeated sequences within the DNA.
Write a function to find all the 10-letter-long sequences (substrings) that occur more than once in a DNA molecule.
For example, given s = "AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT", the function should return ["AAAAACCCCC", "CCCCCAAAAA"].
Solution (a C++ version, followed by an equivalent Java version):
/// Finds every 10-letter DNA substring that occurs more than once in a strand.
///
/// Each nucleotide is packed into 2 bits (A=00, C=01, G=10, T=11), so a
/// 10-letter window maps one-to-one onto a 20-bit integer. That integer is
/// used directly as an index into a 2^20-entry table, which makes the
/// "have we seen this window before?" check exact (no hash collisions).
class Solution {
public:
    /// @param s  DNA strand; expected to consist of the letters A, C, G, T.
    ///           Any other character cannot belong to a DNA sequence, so
    ///           windows that contain one are skipped instead of being hashed.
    /// @return   Each repeated 10-letter substring exactly once, ordered by
    ///           the position at which its second occurrence ends.
    std::vector<std::string> findRepeatedDnaSequences(std::string s) {
        constexpr int kWindow = 10;
        // Keeps only the low 20 bits, i.e. the most recent 10 nucleotides.
        constexpr unsigned kMask = (1u << (2 * kWindow)) - 1u;
        enum : unsigned char { kUnseen = 0, kSeenOnce = 1, kReported = 2 };

        std::vector<std::string> ans;
        const int len = static_cast<int>(s.size());
        if (len < kWindow) return ans;

        // One byte per possible window (1 MiB). Allocated on the heap: an
        // automatic array of this size can exhaust a default thread stack.
        std::vector<unsigned char> state(kMask + 1u);  // value-initialised to kUnseen

        unsigned hashValue = 0;
        int validRun = 0;  // consecutive valid nucleotides ending at pos
        for (int pos = 0; pos < len; ++pos) {
            const int code = encode(s[pos]);
            if (code < 0) {
                // An invalid character poisons every window that covers it.
                validRun = 0;
                hashValue = 0;
                continue;
            }
            hashValue = ((hashValue << 2) | static_cast<unsigned>(code)) & kMask;
            if (++validRun < kWindow) continue;  // window not full yet

            unsigned char& slot = state[hashValue];
            if (slot == kUnseen) {
                slot = kSeenOnce;
            } else if (slot == kSeenOnce) {
                // Second sighting: report once, then never again.
                ans.push_back(s.substr(pos - kWindow + 1, kWindow));
                slot = kReported;
            }
        }
        return ans;
    }

private:
    /// Maps a nucleotide letter to its 2-bit code, or -1 if it is not A/C/G/T.
    static int encode(char c) {
        switch (c) {
            case 'A': return 0;
            case 'C': return 1;
            case 'G': return 2;
            case 'T': return 3;
            default:  return -1;
        }
    }
};
public class Solution {
public List<String> findRepeatedDnaSequences(String s) {
List<String> ans = new ArrayList<String>();
int len;
if (s == null || (len = s.length()) < 10) {
return ans;
}
char[] convert = new char[26];
convert[0] = 0; // A:00
convert['C' - 'A'] = 1; // C:01
convert['G' - 'A'] = 2; // G:10
convert['T' - 'A'] = 3; // T:11
boolean[] hashMap = new boolean[1024 * 1024];
int hashValue = 0; // 20位的hash值
for (int i = 0; i < 10; ++i) {
hashValue <<= 2;
hashValue |= convert[s.charAt(i) - 'A'];
}
hashMap[hashValue] = true;
Set<Integer> ansContain = new HashSet<Integer>();
for (int i = 10; i < len; ++i) {
hashValue <<= 2;// 平移两位
hashValue = (hashValue | convert[s.charAt(i) - 'A']) & 0x000fffff;// 更新截取后20位
if (hashMap[hashValue] && !ansContain.contains(hashValue)) {
ans.add(s.substring(i - 9, i + 1));
ansContain.add(hashValue);
}
hashMap[hashValue] = true;
}
return ans;
}
}
本文介绍了一个函数,用于在DNA分子中查找所有长度为10的重复子串。通过使用哈希表和位操作来实现高效的查找过程。
395

被折叠的 条评论
为什么被折叠?



