《Python for Data Analysis》
import json

# Data file that is read line by line below; each line is handed to json.loads.
# NOTE(review): the directory is spelled 'cho2' here - confirm it matches the
# name on disk.
path = 'cho2/usagov_bitly_data2012-03-16-1331923249.txt'

# Parse every line into its own record. The with-block guarantees the file
# handle is closed once parsing finishes (or fails) instead of leaking it.
with open(path) as data_file:
    records = [json.loads(line) for line in data_file]

# Collect the 'tz' value of every record that has one; records without a
# 'tz' key are skipped.
time_zones = [rec['tz'] for rec in records if 'tz' in rec]
dict (基础用法)
def get_counts(sequence):
    """Count how many times each item occurs in *sequence*.

    Uses only a plain dict, with no helper classes.

    Args:
        sequence: Any iterable of hashable items.

    Returns:
        A dict mapping each distinct item to its number of occurrences.
        An empty iterable yields an empty dict.
    """
    counts = {}
    for x in sequence:
        # get() supplies 0 the first time an item is seen.
        counts[x] = counts.get(x, 0) + 1
    return counts
def top_counts(count_dict, n=10):
    """Return the *n* largest entries of a count mapping.

    Args:
        count_dict: Mapping of key -> count.
        n: Maximum number of entries to return (default 10).

    Returns:
        A list of ``(count, key)`` tuples sorted in ascending order, so the
        most frequent entry is last. At most *n* tuples are returned; a
        non-positive *n* yields an empty list.
    """
    # Without this guard, n == 0 would slice with [-0:], which is the same
    # as [0:] and would return every entry instead of none.
    if n <= 0:
        return []
    # Put the count first so tuple ordering sorts by count, then by key.
    value_key_pairs = sorted((count, tz) for tz, count in count_dict.items())
    return value_key_pairs[-n:]
使用标准Python库
from collections import defaultdict
def get_counts2(sequence):
    """Count how many times each item occurs in *sequence*.

    Same tally as a hand-written dict loop, but built on
    ``collections.defaultdict`` so no membership check is needed.

    Args:
        sequence: Any iterable of hashable items.

    Returns:
        A ``defaultdict(int)`` mapping each distinct item to its number of
        occurrences.
    """
    counts = defaultdict(int)  # missing keys start at 0
    for x in sequence:
        counts[x] += 1
    return counts
from collections import Counter
# Tally the time zones with the standard-library Counter: start empty and
# feed it the whole list.
counts = Counter()
counts.update(time_zones)
# The ten most frequent time zones as (value, count) pairs, most common first.
counts.most_common(10)
使用pandas
from pandas import DataFrame, Series
import pandas as pd
import numpy as np
# Build a DataFrame from the parsed records.
frame = pd.DataFrame(records)
# Peek at the first ten entries of the 'tz' column.
frame['tz'].head(10)
# Frequency of each distinct 'tz' value.
tz_counts = frame['tz'].value_counts()
# First ten rows of the frequency table.
tz_counts.head(10)
1788

被折叠的 条评论
为什么被折叠?



