数据科学包-Day3

最新推荐文章于 2025-02-07 22:32:16 发布

原创最新推荐文章于 2025-02-07 22:32:16 发布 · 536 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#python #数据分析

本文围绕Python中numpy数组展开，介绍其是科学计算基础库，具有快速、方便的特性。详细阐述了numpy创建数组、数组计算（含与数和数组计算、轴的概念）、数据读取、转置、索引切片、数值修改等操作，还提及nan和inf等特殊值及数组拼接、行列交换等内容。

numpy数组

NumPy是一个在Python中做科学计算的基础库，重在数值计算，也是大部分Python科学计算库的基础库，多用于在大型、多维数组上执行数值运算。

特性:

快速
方便
科学计算的基础库

numpy创建数组（矩阵）

#numpy创建数组（矩阵）
import numpy as np
import random

# Build arrays from ordinary Python sequences; np.array returns an ndarray.
t1 = np.array([1, 2, 3])
print(t1)
print(type(t1))

t2 = np.array(range(10))
print(t2)
print(type(t2))

# np.arange(start, stop, step) excludes the stop value, so this is [4 6 8].
t3 = np.arange(4, 10, 2)
print(t3)
print(type(t3))
print(t3.dtype)  # no dtype was requested, so NumPy picks its default integer type

# dtype="i1" requests a 1-byte signed integer (reported as int8).
t4 = np.array(range(1, 4), dtype="i1")
print(t4)
print(type(t4))
print(t4.dtype)

# Boolean arrays: non-zero inputs become True, zero becomes False.
t5 = np.array([1, 1, 0, 1, 0, 0], dtype=bool)
print(t5)
print(type(t5))
print(t5.dtype)

# Convert to another data type; the result is bound to a new name (t6).
t6 = t5.astype("int8")
print(t6)
print(type(t6))
print(t6.dtype)

# Floating point arrays: random.random() returns a float in [0.0, 1.0).
# The loop variable is not needed, hence the conventional "_" name.
t7 = np.array([random.random() for _ in range(10)])
print(t7)
print(t7.dtype)

# np.round(a, decimals) rounds to the given number of decimals. Values lying
# exactly halfway are rounded to the nearest even value, which differs from
# schoolbook "round half up".
t8 = np.round(t7, 3)
print(t8)

[1 2 3]
<class 'numpy.ndarray'>
[0 1 2 3 4 5 6 7 8 9]
<class 'numpy.ndarray'>
[4 6 8]
<class 'numpy.ndarray'>
int32
[1 2 3]
<class 'numpy.ndarray'>
int8
[ True  True False  True False False]
<class 'numpy.ndarray'>
bool
[1 1 0 1 0 0]
<class 'numpy.ndarray'>
int8
[0.99759012 0.55395501 0.80944827 0.10235851 0.96164576 0.56108606
 0.41564544 0.80127915 0.7915112  0.97211814]
float64
[0.998 0.554 0.809 0.102 0.962 0.561 0.416 0.801 0.792 0.972]

在这里插入图片描述

numpy数组的计算

>>> import numpy as np
>>> t1=np.arange(12)
>>> t1
array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])
>>> t1.reshape((3,4))
array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])
>>> t5=np.arange(24)
>>> t5.reshape((2,3,4))
array([[[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11]],

       [[12, 13, 14, 15],
        [16, 17, 18, 19],
        [20, 21, 22, 23]]])
>>> t5.reshape((4,6))
array([[ 0,  1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10, 11],
       [12, 13, 14, 15, 16, 17],
       [18, 19, 20, 21, 22, 23]])
>>> t6=np.arange(1,25).reshape(2,3,4)
>>> t6
array([[[ 1,  2,  3,  4],
        [ 5,  6,  7,  8],
        [ 9, 10, 11, 12]],

       [[13, 14, 15, 16],
        [17, 18, 19, 20],
        [21, 22, 23, 24]]])
>>> t6.flatten()
#flatten()的用法：
#flatten是numpy.ndarray的一个方法，返回把数组展开成一维后的副本。
#flatten只适用于numpy对象（ndarray或matrix），普通的list列表没有这个方法！
array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,18, 19, 20, 21, 22, 23, 24])

数组和数的计算

>>> t6+2
array([[[ 3,  4,  5,  6],
        [ 7,  8,  9, 10],
        [11, 12, 13, 14]],

       [[15, 16, 17, 18],
        [19, 20, 21, 22],
        [23, 24, 25, 26]]])

数组和数组的计算

>>> a=np.array([[3,4,5,6,7,8],[4,5,6,7,8,9]])
>>> a
array([[3, 4, 5, 6, 7, 8],
       [4, 5, 6, 7, 8, 9]])
>>> a+1
array([[ 4,  5,  6,  7,  8,  9],
       [ 5,  6,  7,  8,  9, 10]])
>>> a*3
array([[ 9, 12, 15, 18, 21, 24],
       [12, 15, 18, 21, 24, 27]])
>>> b=np.array([[21,22,23,24,25,26],[27,28,29,30,31,32]])
>>> b
array([[21, 22, 23, 24, 25, 26],
       [27, 28, 29, 30, 31, 32]])
>>> a+b
array([[24, 26, 28, 30, 32, 34],
       [31, 33, 35, 37, 39, 41]])
>>> a*b
array([[ 63,  88, 115, 144, 175, 208],
       [108, 140, 174, 210, 248, 288]])

轴

在numpy中，轴可以理解为方向，使用0，1，2…数字表示。对于一维数组，只有一个0轴；对于二维数组（例如shape为(2,2)），有0轴和1轴；对于三维数组（例如shape为(2,2,3)），有0，1，2轴。

二维数组的轴
在这里插入图片描述
三维数组的轴

numpy数据的读取

Csv:Comma-Separated Value,逗号分隔值文件
显示：表格状态
源文件：换行和逗号分隔行列的格式化文本，每一行的数据表示一条记录

在这里插入图片描述

numpy中的转置

转置是一种变换，对于numpy中的数组来说，就是在对角线方向交换数据，目的也是为了更方便的去处理数据

>>> v1=np.arange(24).reshape((4,6))
>>> v1
array([[ 0,  1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10, 11],
       [12, 13, 14, 15, 16, 17],
       [18, 19, 20, 21, 22, 23]])
>>> v1.transpose()
array([[ 0,  6, 12, 18],
       [ 1,  7, 13, 19],
       [ 2,  8, 14, 20],
       [ 3,  9, 15, 21],
       [ 4, 10, 16, 22],
       [ 5, 11, 17, 23]])
>>> v1.T
array([[ 0,  6, 12, 18],
       [ 1,  7, 13, 19],
       [ 2,  8, 14, 20],
       [ 3,  9, 15, 21],
       [ 4, 10, 16, 22],
       [ 5, 11, 17, 23]])
>>> v1.swapaxes(1,0)
array([[ 0,  6, 12, 18],
       [ 1,  7, 13, 19],
       [ 2,  8, 14, 20],
       [ 3,  9, 15, 21],
       [ 4, 10, 16, 22],
       [ 5, 11, 17, 23]])

numpy中索引和切片

import numpy as np
# Locations of the two CSV data files
us_file_path="./youtube_video_data/US_video_data_numbers.csv"
uk_file_path="./youtube_video_data/GB_video_data_numbers.csv"

# With unpack=True, loadtxt returns the transposed array (one entry per CSV column)
#t1 = np.loadtxt(us_file_path,delimiter=",",dtype="int",unpack=True)
#print(t1)
t2 = np.loadtxt(us_file_path,delimiter=",",dtype="int")
print(t2)
print("*"*100)
# Select a single row (index 2, i.e. the third row)
#print(t2[2])

# Select consecutive rows
# The stop index is exclusive: [:2] gives the first two rows (indices 0 and 1)
#print(t2[:2])

# Select non-consecutive rows; note the nested square brackets
# Rows 3, 9 and 11 (indices 2, 8 and 10)
#print(t2[[2,8,10]])

# Row selection written with an explicit column part
#print(t2[1,:]) # row index 1; ":" keeps every column
#print("*"*100)
# Every row from index 2 onwards, all columns
#print(t2[2:,:])
#print("*"*100)
# Rows 3, 9 and 4 (indices 2, 8 and 3), all columns
#print(t2[[2,8,3],:])
# Select the first column
#print("*"*100)
#print(t2[:,0])

# Select consecutive columns (index 2 onwards)
#print(t2[:,2:])

# Select non-consecutive columns (indices 0 and 3)
#print(t2[:,[0,3]])

# Select a single element: third row, fourth column
#a=t2[2,3]
#print(a)
#print(type(a))
# Recorded output:
#170708
#<class 'numpy.int32'>

# Select several rows and columns: rows 3-5 (indices 2..4), columns 2-4 (indices 1..3)
# The result is the block where those rows and columns intersect
#b=t2[2:5,1:4]
#print(b)
# Recorded output:
#[[576597  39774 170708]
#[ 24975   4542  12829]
#[ 96666    568   6666]]

# Select several non-adjacent elements -- take care here:
# two index lists are paired element-wise, so this picks (0,0), (2,1) and (2,3),
# not the intersection of the listed rows and columns
#a=t2[[0,2,2],[0,1,3]]
#print(a)
# Recorded output:
#[4394029  576597  170708]

numpy数值的修改

>>> v1
array([[ 0,  1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10, 11],
       [12, 13, 14, 15, 16, 17],
       [18, 19, 20, 21, 22, 23]])
>>> v1[:,2:4]
array([[ 2,  3],
       [ 8,  9],
       [14, 15],
       [20, 21]])
>>> v1[:,2:4]=0
>>> v1
array([[ 0,  1,  0,  0,  4,  5],
       [ 6,  7,  0,  0, 10, 11],
       [12, 13,  0,  0, 16, 17],
       [18, 19,  0,  0, 22, 23]])

numpy中bool索引

>>> q=np.arange(24).reshape((4,6))
>>> q
array([[ 0,  1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10, 11],
       [12, 13, 14, 15, 16, 17],
       [18, 19, 20, 21, 22, 23]])
>>> q<10
array([[ True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False]])

如果想把q中小于10的数字替换成为0，把大于20的替换成为20

>>> q=np.arange(24).reshape((4,6))
>>> q
array([[ 0,  1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10, 11],
       [12, 13, 14, 15, 16, 17],
       [18, 19, 20, 21, 22, 23]])
>>> q<10
array([[ True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False]])
>>> q[q<10]=0
>>> q
array([[ 0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0, 10, 11],
       [12, 13, 14, 15, 16, 17],
       [18, 19, 20, 21, 22, 23]])
>>> q[q>20]=20
>>> q
array([[ 0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0, 10, 11],
       [12, 13, 14, 15, 16, 17],
       [18, 19, 20, 20, 20, 20]])

numpy中三元运算符

>>> t=np.arange(24).reshape((4,6))
>>> t
array([[ 0,  1,  2,  3,  4,  5],
      [ 6,  7,  8,  9, 10, 11],
      [12, 13, 14, 15, 16, 17],
      [18, 19, 20, 21, 22, 23]])
>>> np.where(t<=8,8,25)
array([[ 8,  8,  8,  8,  8,  8],
      [ 8,  8,  8, 25, 25, 25],
      [25, 25, 25, 25, 25, 25],
      [25, 25, 25, 25, 25, 25]])

np.where()的第一个参数是判断条件，第二个参数是满足条件时取的值，第三个参数是不满足条件时取的值。

numpy中的clip(裁剪）

>>> q
array([[ 0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0, 10, 11],
       [12, 13, 14, 15, 16, 17],
       [18, 19, 20, 20, 20, 20]])
>>> q.clip(10,12)
array([[10, 10, 10, 10, 10, 10],
       [10, 10, 10, 10, 10, 11],
       [12, 12, 12, 12, 12, 12],
       [12, 12, 12, 12, 12, 12]])

小于10的替换为10，大于12的替换为12

numpy中的nan和inf

nan( NAN,Nan):not a number表示不是一个数字

什么时候numpy中会出现nan:

当我们读取本地的文件为float的时候，如果有缺失，就会出现nan
当做了一个不合适的计算的时候（比如无穷大（inf）减去无穷大）
inf表示正无穷，-inf表示负无穷

>>> a=np.inf
>>> type(a)
<class 'float'>
>>> a=np.nan
>>> type(a)
<class 'float'>

numpy中的nan的注意点

两个nan是不相等的

>>> np.nan==np.nan
False

np.nan!=np.nan 的结果为True。
利用以上特性，可以判断数组中nan的个数

import numpy as np
# A 4x6 grid holding 0..23, converted to float so that it can store NaN.
t = np.arange(24).reshape((4, 6)).astype(float)
print(t)

# Plant a single NaN in the last row.
t[3, 3] = np.nan
print(t)

# NaN is the only value that is not equal to itself, so the comparison
# `t != t` is True exactly at the NaN cells; counting the non-zero (True)
# entries therefore counts the NaNs.
print(np.count_nonzero(t != t))

[[ 0.  1.  2.  3.  4.  5.]
[ 6.  7.  8.  9. 10. 11.]
[12. 13. 14. 15. 16. 17.]
[18. 19. 20. 21. 22. 23.]]
[[ 0.  1.  2.  3.  4.  5.]
[ 6.  7.  8.  9. 10. 11.]
[12. 13. 14. 15. 16. 17.]
[18. 19. 20. nan 22. 23.]]
1

也可以通过np.isnan(t)来判断，它返回bool类型的数组。比如希望把nan替换为0：

# np.isnan(t) yields a boolean mask marking the NaN cells; assigning through
# that mask overwrites only those cells, in place.
nan_cells = np.isnan(t)
t[nan_cells] = 0
print(t)

[[ 0.  1.  2.  3.  4.  5.]
[ 6.  7.  8.  9. 10. 11.]
[12. 13. 14. 15. 16. 17.]
[18. 19. 20.  0. 22. 23.]]

nan和任何值计算都为nan

数组的拼接

>>> import numpy as np
>>> t1=np.arange(12).reshape(2,6)
>>> t1
array([[ 0,  1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10, 11]])
>>> t2=np.arange(12,24).reshape(2,6)
>>> t2
array([[12, 13, 14, 15, 16, 17],
       [18, 19, 20, 21, 22, 23]])
>>> np.vstack((t1,t2)) #竖直拼接
array([[ 0,  1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10, 11],
       [12, 13, 14, 15, 16, 17],
       [18, 19, 20, 21, 22, 23]])
>>> np.hstack((t1,t2)) #水平拼接 
array([[ 0,  1,  2,  3,  4,  5, 12, 13, 14, 15, 16, 17],
       [ 6,  7,  8,  9, 10, 11, 18, 19, 20, 21, 22, 23]])

数组的行列交换

>>> t1=np.arange(12,24).reshape(3,4)
>>> t1
array([[12, 13, 14, 15],
       [16, 17, 18, 19],
       [20, 21, 22, 23]])
>>> t1[[1,2],:]=t1[[2,1],:]	#行交换
>>> t1
array([[12, 13, 14, 15],
       [20, 21, 22, 23],
       [16, 17, 18, 19]])
>>> t1[:,[0,2]]=t1[:,[2,0]]
>>> t1
array([[14, 13, 12, 15],
       [22, 21, 20, 23],
       [18, 17, 16, 19]])

练习

import numpy as np
import os
# Load the per-country data sets
print(os.getcwd())
us_path = os.path.join(os.getcwd(), 'youtube_video_data', 'US_video_data_numbers.csv')
uk_path = os.path.join(os.getcwd(), 'youtube_video_data', 'GB_video_data_numbers.csv')
# ndmin=2 keeps the result two-dimensional even when a file holds a single
# row, so shape[0] below is always the row count and hstack always lines up.
us_date = np.loadtxt(us_path, delimiter=",", dtype=int, ndmin=2)
uk_date = np.loadtxt(uk_path, delimiter=",", dtype=int, ndmin=2)

# Build the country label columns.
# shape[0] is the NUMBER OF ROWS of the data set, and 1 is the number of
# columns, so each label array has one entry per data row.
zero_data = np.zeros((us_date.shape[0], 1), dtype=int)
one_data = np.ones((uk_date.shape[0], 1), dtype=int)

# Append the label as a new last column: all 0 for US, all 1 for GB
us_date = np.hstack((us_date, zero_data))
uk_date = np.hstack((uk_date, one_data))

# Stack the two labelled data sets on top of each other
final_data = np.vstack((us_date, uk_date))
print(final_data)

C:\Users\ramon\.PyCharmCE2017.2\config\scratches\1-7
[[4394029  320053    5931   46245       0]
 [7860119  185853   26679       0       0]
 [5845909  576597   39774  170708       0]
 ...
 [ 109222    4840      35     212       1]
 [ 626223   22962     532    1559       1]
 [  99228    1699      23     135       1]]

numpy更多好用的方法

获取最大值最小值的位置
- np.argmax(t,axis=0)
- np.argmin(t,axis=1)

>>> a=np.arange(6).reshape(2,3)
>>> a
array([[0, 1, 2],
       [3, 4, 5]])
>>> np.argmin(a,axis=0)	#axis=0按每列来计算
array([0, 0, 0], dtype=int64)
>>> np.argmin(a,axis=1)	#axis=1按每行来计算
array([0, 0], dtype=int64)
>>> np.argmax(a,axis=0)
array([1, 1, 1], dtype=int64)
>>> np.argmax(a,axis=1)
array([2, 2], dtype=int64)

创建一个全0的数组：np.zeros((3,4))

>>> np.zeros((3,4))
array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

创建一个全1的数组

>>> np.ones((3,4))
array([[1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.]])

创建一个对角线为1的正方形数组（方阵）

>>> np.eye(6)
array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.]])

numpy生成随机数

import numpy as np
# Fixing the seed makes the generator produce the same numbers on every run.
np.random.seed(10)
# A 3x4 array of integers drawn from the half-open range [0, 20).
t = np.random.randint(low=0, high=20, size=(3, 4))
print(t)

[[ 9  4 15  0]
 [17 16 17  8]
 [ 9  0 10  8]]

在这里插入图片描述

numpy均值的算法

import numpy as np
def fill_ndarray(t1):
    """Replace the NaN entries of each column with that column's mean.

    The mean is taken over the non-NaN entries of the column only. The
    array is modified in place and the same object is returned.

    Args:
        t1: Two-dimensional array; columns are processed one at a time.

    Returns:
        ``t1`` itself, with NaNs filled wherever a column mean exists. A
        column made up entirely of NaN has no mean and is left untouched.
    """
    for i in range(t1.shape[1]):  # walk over the columns
        column = t1[:, i]  # a view into t1, so writes below land in t1
        # NaN is the only value that is not equal to itself.
        nan_mask = column != column
        # Skip columns with nothing to fill, and columns with nothing to
        # average (the mean of an empty selection is NaN plus a warning).
        if not nan_mask.any() or nan_mask.all():
            continue
        column[nan_mask] = column[~nan_mask].mean()
    return t1
if __name__ == '__main__':
    # 3x4 float grid holding 0..11, with the last two cells of the middle
    # row replaced by NaN.
    t1 = np.arange(12, dtype="float").reshape(3, 4)
    t1[1, 2:] = np.nan
    print(t1)
    # Show the array again after the NaN cells have been filled.
    t1 = fill_ndarray(t1)
    print(t1)

[[ 0.  1.  2.  3.]
 [ 4.  5. nan nan]
 [ 8.  9. 10. 11.]]
[[ 0.  1.  2.  3.]
 [ 4.  5.  6.  7.]
 [ 8.  9. 10. 11.]]

练习

在这里插入图片描述

import os
from matplotlib import pyplot as plt
import numpy as np
us_data = os.path.join(os.getcwd(), 'youtube_video_data', 'US_video_data_numbers.csv')
uk_data = os.path.join(os.getcwd(), 'youtube_video_data', 'GB_video_data_numbers.csv')
# Read both CSV files as integer arrays
t_us = np.loadtxt(us_data, delimiter=",", dtype=int)
t_uk = np.loadtxt(uk_data, delimiter=",", dtype=int)  # NOTE: not used further in this script
# The last column is taken as the comment counts
t_us_comments = t_us[:, -1]
print(t_us_comments.max(), t_us_comments.min())
# Keep only the entries with at most 5000 comments (5000 itself is included)
t_us_comments = t_us_comments[t_us_comments <= 5000]
# Draw the histogram with bins of width d
d = 25
# Integer number of bins; at least one, so a value range narrower than d
# does not ask plt.hist for zero bins.
bin_nums = max(1, int((t_us_comments.max() - t_us_comments.min()) // d))
plt.figure(figsize=(20, 10), dpi=100)
plt.hist(t_us_comments, bin_nums, color="red")
plt.grid(alpha=0.8)
plt.show()

在这里插入图片描述

import os
from matplotlib import pyplot as plt
import numpy as np
# Only the GB data set is needed for this plot.
uk_data = os.getcwd() + '/youtube_video_data/GB_video_data_numbers.csv'
t_uk = np.loadtxt(uk_data, delimiter=",", dtype=int)

# Keep the rows whose column 1 value is at most 500000
# (column 1 is used as the like count, per the variable name below).
within_limit = t_uk[:, 1] <= 500000
t_uk = t_uk[within_limit]
t_uk_like = t_uk[:, 1]
t_uk_comment = t_uk[:, -1]

# Scatter plot: likes on the x axis, comments on the y axis.
plt.figure(figsize=(20, 10), dpi=100)
plt.scatter(t_uk_like, t_uk_comment, color="orange")
plt.show()