Python 遍历 Hadoop 数据，与指定列表对比，取出包含列表中值的记录 - CSDN博客

本文介绍了一种使用三元搜索树实现的高效URL匹配系统，该系统能够快速处理大量URL并进行最大匹配，同时记录每个URL出现的次数及最早出现的时间。

import sys
import tstree

# Streaming aggregation over stdin.
#
# Each input line is expected to hold three whitespace-separated fields:
# "url count posttime". Only *adjacent* lines with the same URL are merged
# into one run (presumably the input arrives sorted by URL, e.g. from a
# Hadoop streaming shuffle -- confirm against the job configuration).
#
# For every run whose URL matches an entry of the site list, one
# tab-separated line is printed:
#     <matched entry> <url> <sum of counts> <smallest posttime>
fname = 'high_freq_site.list'
tree = tstree.TernarySearchTrie()
tree.loadData(fname)

token = ''    # URL of the run being accumulated ('' means no run yet)
counter = 0   # sum of the count column over the current run
post = []     # every posttime seen in the current run

# url, count, posttime
for line in sys.stdin:
    arr = line.split()
    if len(arr) != 3:
        # Malformed record: ignore it.
        continue

    url = arr[0]
    num = int(arr[1])
    posttime = int(arr[2])

    if url == token:
        # Same URL as the previous record: keep accumulating.
        counter += num
        post.append(posttime)
        continue

    if token != '':
        # The URL changed, so the previous run is complete: report it.
        ret = tree.maxMatch(token)
        if ret and post:
            print('%s\t%s\t%s\t%s' % (ret, token, counter, min(post)))

    # Start a new run from this record. The posttime must be recorded here:
    # resetting to an empty list would lose it and would suppress the output
    # of runs that consist of a single record.
    token = url
    counter = num
    post = [posttime]

# Report the final run (nothing is printed when stdin had no valid record).
ret = tree.maxMatch(token)
if ret and post:
    print('%s\t%s\t%s\t%s' % (ret, token, counter, min(post)))



class TSTNode(object):
    """A single node of a ternary search tree.

    Attributes are plain public fields:
      splitchar -- the character stored at this node
      data      -- payload attached to the node, None until one is assigned
      loNode / eqNode / hiNode -- child links, all None on a fresh node
    """

    def __init__(self, splitchar):
        # A new node starts with no payload and no children.
        self.loNode = self.eqNode = self.hiNode = None
        self.data = None
        self.splitchar = splitchar


class TernarySearchTrie(object):
    """Ternary search tree of strings with longest-prefix lookup.

    Words are stored one character per node. The node reached by the last
    character of a word carries that word in its ``data`` attribute.
    """

    def __init__(self):
        # Root of the tree; None while no word has been added.
        self.rootNode = None

    def loadData(self, fname):
        """Add every non-blank line of the file *fname* to the tree.

        Leading and trailing whitespace is stripped from each line; lines
        that are empty after stripping are ignored.
        """
        # The context manager closes the file even when reading or
        # insertion raises part-way through.
        with open(fname) as f:
            for line in f:
                line = line.strip()
                node = self.addWord(line)
                if node:
                    node.data = line

    def addWord(self, word):
        """Create the node path for *word* and return its final node.

        Returns None for an empty *word*. The caller is responsible for
        setting ``data`` on the returned node; this method only builds or
        reuses the nodes along the path.
        """
        if not word:
            return None

        if not self.rootNode:
            self.rootNode = TSTNode(word[0])

        currentNode = self.rootNode
        charIndex = 0
        while True:
            charComp = ord(word[charIndex]) - ord(currentNode.splitchar)
            if charComp == 0:
                # Character consumed; stop when the whole word is placed.
                charIndex += 1
                if charIndex == len(word):
                    return currentNode
                if not currentNode.eqNode:
                    currentNode.eqNode = TSTNode(word[charIndex])
                currentNode = currentNode.eqNode
            elif charComp < 0:
                # Smaller character: continue in the "lo" subtree.
                if not currentNode.loNode:
                    currentNode.loNode = TSTNode(word[charIndex])
                currentNode = currentNode.loNode
            else:
                # Larger character: continue in the "hi" subtree.
                if not currentNode.hiNode:
                    currentNode.hiNode = TSTNode(word[charIndex])
                currentNode = currentNode.hiNode

    def maxMatch(self, url):
        """Return the longest stored word that is a prefix of *url*.

        Returns None when no stored word is a prefix of *url* (including
        when the tree or *url* is empty).
        """
        ret = None
        currentNode = self.rootNode
        charIndex = 0
        # Walk until the tree runs out or every character of url is consumed.
        while currentNode and charIndex < len(url):
            charComp = ord(url[charIndex]) - ord(currentNode.splitchar)
            if charComp == 0:
                charIndex += 1
                # A node carrying data ends a stored word; deeper hits
                # overwrite shallower ones, so the longest match wins.
                if currentNode.data:
                    ret = currentNode.data
                currentNode = currentNode.eqNode
            elif charComp < 0:
                currentNode = currentNode.loNode
            else:
                currentNode = currentNode.hiNode
        return ret


if __name__ == '__main__':
    # Command-line driver: load the site list, then print the maxMatch
    # result (or None) for each line read from stdin.
    import sys

    trie = TernarySearchTrie()
    trie.loadData('high_freq_site.list')

    for raw in sys.stdin:
        print(trie.maxMatch(raw.strip()))