import io
import math
from collections import Counter, defaultdict
def compute_entropy(word_list):
    """Return the Shannon entropy of the items in *word_list*.

    The entropy is computed from the empirical distribution of the items
    using the natural logarithm, so the result is in nats.

    Args:
        word_list: Any iterable of hashable items (e.g. a list of
            characters or words). It is consumed once.

    Returns:
        The entropy as a float. An empty input yields 0.0, as does an
        input in which every item is identical.
    """
    counts = Counter(word_list)
    total = sum(counts.values())
    ent = 0.0
    for cnt in counts.values():
        # float() keeps the division exact under Python 2 integer semantics.
        p = float(cnt) / total
        ent -= p * math.log(p)
    return ent
def count_substr_freq(corpus_path="./video.corpus", max_len=5, min_freq=10):
    """Print frequency and neighbour entropy for substrings of a corpus.

    Reads *corpus_path* as UTF-8 text, one entry per line. For every
    substring of at least two characters it counts the occurrences and
    records the character immediately before and after each occurrence
    ('^' stands for "start of line", '%' for "end of line"). For each
    substring seen at least *min_freq* times it prints one tab-separated
    line: substring, relative frequency, left entropy, right entropy.

    NOTE(review): the text this function was recovered from was damaged.
    The initial value of the total counter, the substring length bound,
    the extraction of the substring itself and the initialisation of the
    per-substring tables were missing or garbled, and the backslashes in
    the '\\n' and '\\t' escapes had been dropped. Those parts are
    reconstructed here; please confirm them against the intended behaviour.

    Args:
        corpus_path: Path of the UTF-8 corpus file to read.
        max_len: Maximum substring length in characters, or None for no
            limit. The original bound was lost; 5 is a placeholder
            default -- TODO confirm.
        min_freq: Minimum occurrence count for a substring to be printed.

    Returns:
        None. Results are written to standard output.
    """
    str_freq = Counter()
    str_left_word = defaultdict(list)
    str_right_word = defaultdict(list)
    tot_cnt = 0

    # io.open decodes while reading (on Python 2 and 3 alike) and the
    # with-statement guarantees the file is closed.
    with io.open(corpus_path, encoding="utf-8") as fp:
        for line in fp:
            st = line.strip('\n')
            l = len(st)
            for i in range(l):
                left_word = st[i - 1] if i > 0 else '^'
                # j is the index of the substring's LAST character, which
                # is what makes st[j + 1] the right-hand neighbour below.
                # NOTE(review): inferred from the surviving st[i-1] /
                # st[j+1] neighbour indexing -- confirm.
                if max_len is None:
                    stop = l
                else:
                    stop = min(l, i + max_len)
                for j in range(i + 1, stop):
                    w = st[i:j + 1]
                    right_word = st[j + 1] if j < l - 1 else '%'
                    str_freq[w] += 1
                    str_left_word[w].append(left_word)
                    str_right_word[w].append(right_word)
                    tot_cnt += 1

    for k, v in str_freq.items():
        if v >= min_freq:
            left_ent = compute_entropy(str_left_word[k])
            right_ent = compute_entropy(str_right_word[k])
            # A single parenthesised argument prints identically under the
            # Python 2 print statement and the Python 3 print function.
            print("%s\t%f\t%f\t%f" % (k, v * 1.0 / tot_cnt, left_ent, right_ent))
if __name__ == "__main__":
    # Run the corpus analysis only when executed as a script, not on import.
    count_substr_freq()