【leetcode】Repeated DNA Sequences

最新推荐文章于 2022-07-28 15:01:27 发布

原创最新推荐文章于 2022-07-28 15:01:27 发布 · 390 阅读

0 ·

CC 4.0 BY-SA版权

Leetcode 专栏收录该内容

222 篇文章

订阅专栏

本文介绍了一个函数，用于在DNA分子中查找所有长度为10的重复子串。通过使用哈希表和位操作来实现高效的查找过程。

From： https://leetcode.com/problems/repeated-dna-sequences/

All DNA is composed of a series of nucleotides abbreviated as A, C, G, and T, for example: "ACGAATTCCG". When studying DNA, it is sometimes useful to identify repeated sequences within the DNA.

Write a function to find all the 10-letter-long sequences (substrings) that occur more than once in a DNA molecule.

For example,

Given s = "AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT",

Return:
["AAAAACCCCC", "CCCCCAAAAA"].

Hide Tags

Hash Table Bit Manipulation

Solution (C++ first, followed by the equivalent Java version):

class Solution {
public:
    /// Finds every 10-letter substring that occurs more than once in `s`.
    ///
    /// Each nucleotide is packed into 2 bits (A=00, C=01, G=10, T=11), so a
    /// 10-letter window becomes a 20-bit integer that is used directly as an
    /// index into a lookup table. The window is rolled forward one character
    /// at a time: shift left by 2, append the new code, keep the low 20 bits.
    ///
    /// @param s  The DNA string. Characters other than 'A', 'C', 'G', 'T' are
    ///           tolerated: any window that contains one is simply ignored.
    /// @return   Each repeated 10-letter sequence exactly once, ordered by the
    ///           position at which its second occurrence ends. Empty when `s`
    ///           has fewer than 10 characters.
    std::vector<std::string> findRepeatedDnaSequences(std::string s) {
        constexpr std::string::size_type kWindow = 10;
        constexpr unsigned kMask = (1u << (2 * kWindow)) - 1u;  // low 20 bits

        std::vector<std::string> ans;
        if (s.size() < kWindow) return ans;

        // One byte per possible window value: 0 = never seen, 1 = seen once,
        // 2 = already reported. Allocated on the heap because a 1 MiB
        // automatic array can overflow small thread stacks.
        std::vector<unsigned char> state(kMask + 1u, 0);

        unsigned hashValue = 0;
        std::string::size_type validRun = 0;  // consecutive valid letters ending at pos
        for (std::string::size_type pos = 0; pos < s.size(); ++pos) {
            const int code = encode(s[pos]);
            if (code < 0) {
                // An invalid character breaks every window that would span it.
                validRun = 0;
                hashValue = 0;
                continue;
            }

            hashValue = ((hashValue << 2) | static_cast<unsigned>(code)) & kMask;
            if (++validRun < kWindow) continue;  // window not yet full

            unsigned char& seen = state[hashValue];
            if (seen == 0) {
                seen = 1;
            } else if (seen == 1) {
                ans.push_back(s.substr(pos + 1 - kWindow, kWindow));
                seen = 2;  // report each sequence only once
            }
        }

        return ans;
    }

private:
    /// Maps a nucleotide letter to its 2-bit code, or -1 for any other character.
    static int encode(char c) {
        switch (c) {
            case 'A': return 0;
            case 'C': return 1;
            case 'G': return 2;
            case 'T': return 3;
            default:  return -1;
        }
    }
};

public class Solution {
    /**
     * Returns every 10-letter substring of {@code s} that appears more than once.
     *
     * Each letter is packed into 2 bits (A=00, C=01, G=10, T=11), so a window of
     * 10 letters is a 20-bit key that indexes a boolean "seen" table directly.
     *
     * @param s the DNA string; may be null
     * @return each repeated sequence once, in the order its first repeat is found;
     *         empty when {@code s} is null or shorter than 10 characters
     */
    public List<String> findRepeatedDnaSequences(String s) {
        final List<String> repeated = new ArrayList<String>();
        if (s == null) {
            return repeated;
        }
        final int n = s.length();
        if (n < 10) {
            return repeated;
        }

        // 2-bit code per letter, indexed by (letter - 'A'); 'A' keeps the default 0.
        final int[] code = new int[26];
        code['C' - 'A'] = 1;
        code['G' - 'A'] = 2;
        code['T' - 'A'] = 3;

        final int mask = (1 << 20) - 1; // keep only the 20 bits of the current window
        final boolean[] seen = new boolean[1 << 20];
        final Set<Integer> reported = new HashSet<Integer>();

        int key = 0;
        for (int end = 0; end < n; end++) {
            // Roll the window: shift out the oldest letter, append the newest.
            key = ((key << 2) | code[s.charAt(end) - 'A']) & mask;
            if (end < 9) {
                continue; // first full window ends at index 9
            }

            if (seen[key]) {
                // Set.add returns true only the first time, so each repeat is listed once.
                if (reported.add(key)) {
                    repeated.add(s.substring(end - 9, end + 1));
                }
            } else {
                seen[key] = true;
            }
        }
        return repeated;
    }
}