Ch4-Ch7 线性代数、统计学、概率、假设与推断
此系列记录《数据科学入门》学习笔记
Ch 4 线性代数
4.1 向量
# 向量加减法
def vector_add(v, m):
    """Add two vectors component-wise.

    Args:
        v: First vector, an iterable of numbers.
        m: Second vector, an iterable of numbers.

    Returns:
        A new list whose i-th element is ``v[i] + m[i]``. ``zip`` stops at
        the shorter input, so surplus trailing elements are ignored.
    """
    # Pair the elements of ``v`` with those of the parameter ``m``; reading
    # any other name here would raise NameError.
    return [v_i + m_i for v_i, m_i in zip(v, m)]
def vector_subtract(v, m):
    """Subtract vector ``m`` from vector ``v`` component-wise.

    Args:
        v: Minuend vector, an iterable of numbers.
        m: Subtrahend vector, an iterable of numbers.

    Returns:
        A new list whose i-th element is ``v[i] - m[i]``. ``zip`` stops at
        the shorter input, so surplus trailing elements are ignored.
    """
    # Pair the elements of ``v`` with those of the parameter ``m``; reading
    # any other name here would raise NameError.
    return [v_i - m_i for v_i, m_i in zip(v, m)]
# 一系列向量的加法
def vector_sum(vectors):
    """Return the component-wise sum of a sequence of vectors.

    Starts from the first vector and folds every remaining vector into the
    running total with ``vector_add``. With a single vector, that vector is
    returned as-is. Indexing the first element raises IndexError when
    ``vectors`` is empty.
    """
    running_total = vectors[0]
    remaining = vectors[1:]
    for addend in remaining:
        running_total = vector_add(running_total, addend)
    return running_total
def vector_sum1(vectors):
return reduce(vector_add, vectors)
# Third variant of vector_sum: ``reduce`` with ``vector_add`` pre-bound as the
# folding function, so ``vector_sum(vectors)`` == ``reduce(vector_add, vectors)``.
from functools import partial, reduce

vector_sum = partial(reduce, vector_add)
# 标量乘以向量
def scalar_multiply(c, v):
    """Scale every component of vector ``v`` by the scalar ``c``.

    Returns a new list; ``v`` itself is not modified.
    """
    return [c * component for component in v]
# 计算一系列长度相同的向量的均值
def vector_mean(vectors):
    """Compute the component-wise mean of a sequence of same-length vectors.

    Args:
        vectors: A sized, non-empty sequence of vectors.

    Returns:
        A list whose i-th element is the mean of the i-th components,
        computed as ``(1 / n) * vector_sum(vectors)``.

    Raises:
        ZeroDivisionError: If ``vectors`` is empty, because ``1 / n`` is
            evaluated with ``n == 0``.
    """
    n = len(vectors)
    return scalar_multiply(1 / n, vector_sum(vectors))


# Backward-compatible alias for the original, misspelled function name.
vector_meam = vector_mean
# 两个向量的点乘
def dot(v, w):
    """Return the dot product ``v_1 * w_1 + ... + v_n * w_n``.

    Components are paired with ``zip``, so pairing stops at the shorter input.
    """
    products = (left * right for left, right in zip(v, w))
    return sum(products)
# 计算向量的平方
def sum_of_squares(v):
    """Return ``v_1 * v_1 + ... + v_n * v_n``: the dot product of ``v`` with itself."""
    squared_total = dot(v, v)
    return squared_total
# 计算向量的长度、距离
import math
def magnitude(v):
    """Return the length of vector ``v``: the square root of its sum of squares."""
    squared_length = sum_of_squares(v)
    return math.sqrt(squared_length)
def squared_distance(v, w):
    """Return the sum of squared component differences between ``v`` and ``w``."""
    difference = vector_subtract(v, w)
    return sum_of_squares(difference)
def distance(v, w):
    """Return the distance between ``v`` and ``w``: the square root of their squared distance."""
    gap_squared = squared_distance(v, w)
    return math.sqrt(gap_squared)
def distance(v, w):
    """Return the distance between ``v`` and ``w``.

    Equivalent formulation of the distance: the magnitude of the difference
    vector ``v - w``.

    Args:
        v: First vector.
        w: Second vector.

    Returns:
        ``magnitude(vector_subtract(v, w))``.
    """
    # The helper is named ``vector_subtract``; the misspelled
    # ``vector_substract`` does not exist and raised NameError.
    return magnitude(vector_subtract(v, w))
4.2 矩阵
# 求维数
def shape(a):
    """Return ``(number of rows, number of columns)`` of matrix ``a``.

    The column count is taken from the first row; an empty matrix has
    zero columns.
    """
    if not a:
        return len(a), 0
    return len(a), len(a[0])
# 返回矩阵的某一行或者某一列
def get_row(a, i):
    """Return row ``i`` of matrix ``a`` (the i-th element of the outer sequence)."""
    row = a[i]
    return row
def get_col(a, j):
    """Return column ``j`` of matrix ``a``.

    Args:
        a: The matrix. Either an object that supports tuple indexing
            ``a[:, j]`` (e.g. a NumPy array) or a plain sequence of rows.
        j: Index of the column to extract.

    Returns:
        ``a[:, j]`` when ``a`` supports tuple indexing; otherwise a list
        holding the j-th entry of every row.
    """
    try:
        return a[:, j]
    except TypeError:
        # Plain lists reject tuple indices, so collect the column row by row.
        return [a_i[j] for a_i in a]
def get_col(a, j):
    """Return column ``j`` of matrix ``a`` as a list: the j-th entry of every row."""
    return [row[j] for row in a]
# 根据形状和用来生成元素的函数来创建矩阵
def make_matrix(num_rows, num_cols, entry_fn):
    """Build a ``num_rows`` x ``num_cols`` matrix as a list of row lists.

    The entry at row ``i``, column ``j`` is ``entry_fn(i, j)``; entries are
    generated row by row, left to right.
    """
    matrix = []
    for i in range(num_rows):
        row = [entry_fn(i, j) for j in range(num_cols)]
        matrix.append(row)
    return matrix
def is_diagonal(i, j):
    """Return 1 when ``i`` equals ``j`` (a main-diagonal position), otherwise 0."""
    if i == j:
        return 1
    return 0
# Build a 5 x 5 matrix with ones on the main diagonal and zeros elsewhere.
identify_matrix = make_matrix(num_rows=5, num_cols=5, entry_fn=is_diagonal)
identify_matrix
# Expected value:
# [[1, 0, 0, 0, 0],
#  [0, 1, 0, 0, 0],
#  [0, 0, 1, 0, 0],
#  [0, 0, 0, 1, 0],
#  [0, 0, 0, 0, 1]]
Ch 5 统计学
5.1 描述单个数据集
# Sample data set used by the statistics examples that follow.
# Presumably one friend count per person: the histogram below labels its axes
# as "number of friends" vs. "number of people" — TODO confirm the data origin.
num_friends = [100, 9, 1, 4, 25, 63, 5, 16, 5, 9, 5, 25, 2, 15, 26, 2, 6, 4, 15, 16, 4, 26, 22, 13, 16, 45,
23, 8, 9, 15, 6, 12, 5, 85, 12, 14, 2, 17, 5, 13, 42, 51, 5, 4, 26, 2, 4, 9, 5, 12, 6, 19,
14, 3, 5, 4, 15, 62, 15, 21, 16, 12, 3, 26, 4, 5, 41, 18, 85, 6, 8, 8, 11, 92, 12, 21, 36, 8]
from collections import Counter
import matplotlib.pyplot as plt
from pylab import *
# Histogram of friend counts: x is a number of friends, y is how many entries
# of num_friends have exactly that value.
friend_counts = Counter(num_friends)
# range() excludes its stop value, so add 1 to keep the largest friend count
# (the right-most bar) in the plot.
xs = range(max(num_friends) + 1)
ys = [friend_counts[x] for x in xs]  # Counter returns 0 for values never seen
plt.bar(xs, ys)
plt.axis([0, 101, 0, 10])
# Select the SimHei font so the Chinese title and axis labels can render.
# pyplot exposes the same rcParams mapping as matplotlib itself, so no extra
# ``mpl`` name from a star import is needed.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.title('朋友的数量')
plt.xlabel('朋友个数')
plt.ylabel('人数')
plt.show()
# Summary statistics of num_friends; the value printed by each call is noted
# in the trailing comment. ``mean`` and ``median`` are presumably supplied by
# the ``from pylab import *`` star import — verify if that import changes.
sorted_friends = sorted(num_friends)  # sort once, reuse for the extremes below
print(len(num_friends))  # 78
print(max(num_friends))  # 100
print(min(num_friends))  # 1
print(sum(num_friends))  # 1441
print(mean(num_friends))  # 18.4743589
print(median(num_friends))  # 12.0
print(sorted_friends[0])  # 1   (smallest)
print(sorted_friends[1])  # 2   (second smallest)
print(sorted_friends[-1])  # 100 (largest)
print(sorted_friends[-2])  # 92  (second largest)

该篇内容涵盖了线性代数的基础，如向量和矩阵；统计学中的描述性统计，包括中心倾向和离散度，以及辛普森悖论的解释；概率论中的条件概率、正态分布和中心极限定理；最后讨论了假设检验和置信区间的概念，并通过掷硬币和 A/B 测试案例进行说明。
最低0.47元/天 解锁文章

被折叠的 条评论
为什么被折叠?



