# 1. 数据结构和算法
# 1.15 根据字段将记录分组
from collections import defaultdict
from itertools import groupby
from operator import itemgetter

rows = [
    {'address': '5412 N CLARK', 'date': '07/01/2012'},
    {'address': '5148 N CLARK', 'date': '07/04/2012'},
    {'address': '5800 E 58TH', 'date': '07/02/2012'},
    {'address': '2122 N CLARK', 'date': '07/03/2012'},
    {'address': '5645 N REVENSWOOD', 'date': '07/02/2012'},
    {'address': '1060 W ADDISON', 'date': '07/02/2012'},
    {'address': '4801 N BROADWAY', 'date': '07/01/2012'},
    {'address': '1039 W GRANVILLE', 'date': '07/04/2012'},
]

# groupby() only merges *consecutive* items that share a key, so the records
# have to be sorted by the grouping field before they are grouped.
rows.sort(key=itemgetter('date'))
for date, items in groupby(rows, key=itemgetter('date')):
    print(date)
    for i in items:
        print(' ', i)

# Alternative: bucket the rows into a dict of lists keyed by date. Unlike the
# one-shot groupby() iterator, this keeps every group around for random
# access by date.
rows_by_date = defaultdict(list)
for row in rows:
    rows_by_date[row['date']].append(row)

for r in rows_by_date['07/01/2012']:
    print(r)
# 1.16 筛选序列中的元素
# A list comprehension is the simplest way to filter a sequence.
mylist = [1, 4, -5, 10, -7, 2, 3, -1]
print([n for n in mylist if n > 0])

# When the input is large, a generator expression produces the matching
# values one at a time instead of building the whole result list up front.
pos = (n for n in mylist if n > 0)
print(pos)
for x in pos:
    print(x)
# When the filtering rule does not fit in a simple expression (here it needs
# exception handling), put it in its own function and use the built-in
# filter().
values = ['1', '2', '-3', '-', '4', 'N/A', '5']


def is_int(val):
    """Return True if ``int(val)`` succeeds, False if it raises ValueError.

    Only ValueError is handled; any other exception raised by ``int()``
    propagates to the caller.
    """
    try:
        int(val)
    except ValueError:
        return False
    return True


ivals = list(filter(is_int, values))
print(ivals)
# Instead of dropping the values that fail the test, substitute a replacement:
# every entry of mylist that is not positive becomes 0.
clip_neg = [0 if n <= 0 else n for n in mylist]
print(clip_neg)
# itertools.compress() takes an iterable plus a parallel sequence of Boolean
# selectors and yields only the items whose selector is true.
from itertools import compress

address = [
    '5412 N CLARK',
    '5148 N CLARK',
    '5800 E 58TH',
    '2122 N CLARK',
    '5645 N REVENSWOOD',
    '1060 W ADDISON',
    '4801 N BROADWAY',
    '1039 W GRANVILLY',
]
counts = [0, 3, 10, 4, 1, 7, 6, 1]

# One flag per address: True where the matching count is greater than 5.
more5 = [count > 5 for count in counts]
print(more5)
print([*compress(address, more5)])
# 1.17 从字典中提取子集
# A dict comprehension builds a new dictionary from the entries of another
# one that pass a test.
prices = {
    'ACME': 45.23,
    'AAPL': 612.78,
    'IBM': 205.55,
    'HPQ': 37.20,
    'FB': 10.75,
}

# Entries whose value is above 200.
p1 = {symbol: price for symbol, price in prices.items() if price > 200}
print(p1)

# Entries whose key appears in a given set of names.
tech_names = {'AAPL', 'IBM', 'HPQ', 'MSFT'}
p2 = {symbol: prices[symbol] for symbol in prices if symbol in tech_names}
print(p2)

# Same selection as p1, done by handing a stream of (key, value) tuples to
# dict() instead of using a comprehension.
p3 = dict(pair for pair in prices.items() if pair[1] > 200)
print(p3)
# 1.18 将名称映射到序列元素中
from collections import namedtuple

# namedtuple() is given a type name and the field names and returns a class
# that can be instantiated; instances print with their field names and expose
# each field as an attribute.
Subscriber = namedtuple('Subscriber', ['addr', 'joined'])
sub = Subscriber(addr='jonesy@example.com', joined='2012-10-19')
print(sub)
print(sub.addr)
print(sub.joined)

# A named tuple is immutable. To "change" a field, call _replace(), which
# returns a brand-new named tuple with the given fields substituted.
Stock = namedtuple('Stock', ['name', 'shares', 'prices'])
s = Stock(name='ACME', shares=100, prices=123.45)
print(s)
s = s._replace(shares=75)
print(s)
# 1.19 同时对数据做转换和换算
import os

# A generator expression can be passed directly as the sole argument of a
# reducing function: no temporary list and no extra pair of parentheses.
nums = [1, 2, 3, 4, 5]
s = sum(x * x for x in nums)

# Does the directory contain any .py file?  'dirname' is a placeholder path;
# if it is missing (or is not a directory) treat it as an empty listing
# instead of aborting the whole script with an unhandled OSError.
try:
    files = os.listdir('dirname')
except (FileNotFoundError, NotADirectoryError):
    files = []
if any(name.endswith('.py') for name in files):
    print('There be python')
else:
    print('sorry, no python')

# Convert every field of a tuple to str and join them into one CSV line.
s = ('ACME', 50, 123.45)
print(','.join(str(x) for x in s))

# Reduce over one field of a list of records.
portfolio = [
    {'name': 'GOOG', 'shares': 50},
    {'name': 'YHOO', 'shares': 75},
    {'name': 'AOL', 'shares': 20},
    {'name': 'SCOX', 'shares': 65},
]
min_shares = min(s['shares'] for s in portfolio)
# 1.20 将多个映射合并为单个映射
from collections import ChainMap

a = {'x': 1, 'z': 3}
b = {'y': 2, 'z': 4}

# ChainMap works on the original dictionaries rather than on a merged copy.
c = ChainMap(a, b)
print(c)
# For a key present in both mappings, the value from the first one is used.
print(c['z'])

# Operations that modify the ChainMap act on the first mapping listed, so
# each change below shows up when `a` is printed.
c['z'] = 10
print(a)
c['w'] = 40
print(a)
del c['x']
print(a)

# new_child() adds a new mapping in front of the existing ones; .parents
# gives back the chain without its first mapping.
values = ChainMap()
values['x'] = 1
for _level in (2, 3):
    values = values.new_child()
    values['x'] = _level
print(values)
for _ in range(2):
    values = values.parents
    print(values)
本文探讨了数据处理中的关键算法和技巧,包括记录分组、序列筛选、字典子集提取、命名元组使用、数据转换与合并等,旨在提高数据处理效率和算法性能。
294

被折叠的 条评论
为什么被折叠?



