Python数据分析_第06课:数据清洗与初步分析_笔记

这篇博客详细介绍了Python数据分析中数据清洗的过程,包括缺失值处理的拉格朗日插值法、数据合并(如dataframe合并、索引合并、轴向连接等)、数据标准化方法(如最小-最大规范化、零-均值规范化)以及异常值检测和过滤。还讨论了如何利用pandas库进行数据转换、字符串操作和正则表达式应用,最后通过USDA食品数据库的示例进行了实践应用。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >


GitHub: https://github.com/RealEmperor/Python-for-Data-Analysis

缺失值处理——拉格朗日插值法

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from scipy.interpolate import lagrange  # 导入拉格朗日插值函数

np.random.seed(12345)
plt.rc('figure', figsize=(10, 6))

inputfile = 'data/catering_sale.xls'  # 销量数据路径
outputfile = 'data/sales.xls'  # 输出数据路径

data = pd.read_excel(inputfile)  # 读入数据

# 过滤异常值,将其变为空值
"""
data[u'销量'][(data[u'销量'] < 400) | (data[u'销量'] > 5000)] = None  
上面这样写会有警告:
SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

如果要更改原始数据,请使用单一赋值操作(loc):
data.loc[(data[u'销量'] < 400) | (data[u'销量'] > 5000), u'销量'] = None

如果想要一个副本,请确保强制让 Pandas 创建副本:
error_data = data.copy() 
error_data.loc[(error_data[u'销量'] < 400) | (error_data[u'销量'] > 5000), u'销量'] = None

参考:https://www.jianshu.com/p/72274ccb647a
"""
data.loc[(data[u'销量'] < 400) | (data[u'销量'] > 5000), u'销量'] = None


# 自定义列向量插值函数
# s为列向量,n为被插值的位置,k为取前后的数据个数,默认为5
def ployinterp_column(s, n, k=5):
    y = s[list(range(n - k, n)) + list(range(n + 1, n + 1 + k))]  # 取数
    y = y[y.notnull()]  # 剔除空值
    return lagrange(y.index, list(y))(n)  # 插值并返回插值结果


# 逐个元素判断是否需要插值
for i in data.columns:
    for j in range(len(data)):
        if (data[i].isnull())[j]:  # 如果为空即插值。
            """
            data[i][j] = ployinterp_column(data[i], j)  
            这样写会有警告:
            SettingWithCopyWarning: 
            A value is trying to be set on a copy of a slice from a DataFrame
            """
            data.loc[j, i] = ployinterp_column(data[i], j)

data.to_excel(outputfile)  # 输出结果,写入文件

dataframe合并

#dataframe合并
# 1
df1 = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                 'data1': range(7)})
df2 = DataFrame({'key': ['a', 'b', 'd'],
                 'data2': range(3)})
print(df1)
print(df2)
   data1 key
0      0   b
1      1   b
2      2   a
3      3   c
4      4   a
5      5   a
6      6   b
   data2 key
0      0   a
1      1   b
2      2   d
pd.merge(df1, df2)
data1keydata2
00b1
11b1
26b1
32a0
44a0
55a0
pd.merge(df1, df2, on='key')
data1keydata2
00b1
11b1
26b1
32a0
44a0
55a0
# 2
df3 = DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                 'data1': range(7)})
df4 = DataFrame({'rkey': ['a', 'b', 'd'],
                 'data2': range(3)})
pd.merge(df3, df4, left_on='lkey', right_on='rkey')
data1lkeydata2rkey
00b1b
11b1b
26b1b
32a0a
44a0a
55a0a
pd.merge(df1, df2, how='outer')
data1keydata2
00.0b1.0
11.0b1.0
26.0b1.0
32.0a0.0
44.0a0.0
55.0a0.0
63.0cNaN
7NaNd2.0
# 3
df1 = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                 'data1': range(6)})
df2 = DataFrame({'key': ['a', 'b', 'a', 'b', 'd'],
                 'data2': range(5)})
print(df1)
print(df2)
   data1 key
0      0   b
1      1   b
2      2   a
3      3   c
4      4   a
5      5   b
   data2 key
0      0   a
1      1   b
2      2   a
3      3   b
4      4   d
pd.merge(df1, df2, on='key', how='left')
data1keydata2
00b1.0
10b3.0
21b1.0
31b3.0
42a0.0
52a2.0
63cNaN
74a0.0
84a2.0
95b1.0
105b3.0
pd.merge(df1, df2, how='inner')
data1keydata2
00b1
10b3
21b1
31b3
45b1
55b3
62a0
72a2
84a0
94a2
# 4
left = DataFrame({'key1': ['foo', 'foo', 'bar'],
                  'key2': ['one', 'two', 'one'],
                  'lval': [1, 2, 3]})
right = DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],
                   'key2': ['one', 'one', 'one', 'two'],
                   'rval': [4, 5, 6, 7]})
pd.merge(left, right, on=['key1', 'key2'], how='outer')
key1key2lvalrval
0fooone1.04.0
1fooone1.05.0
2footwo2.0NaN
3barone3.06.0
4bartwoNaN7.0
# 5
pd.merge(left, right, on='key1')
key1key2_xlvalkey2_yrval
0fooone1one4
1fooone1one5
2footwo2one4
3footwo2one5
4barone3one6
5barone3two7
pd.merge(left, right, on='key1', suffixes=('_left', '_right'))
key1key2_leftlvalkey2_rightrval
0fooone1one4
1fooone1one5
2footwo2one4
3footwo2one5
4barone3one6
5barone3two7

索引上的合并

# 1
left1 = DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'], 'value': range(6)})
right1 = DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])
print(left1)
print(right1)
  key  value
0   a      0
1   b      1
2   a      2
3   a      3
4   b      4
5   c      5
   group_val
a        3.5
b        7.0
pd.merge(left1, right1, left_on='key', right_index=True)
keyvaluegroup_val
0a03.5
2a23.5
3a33.5
1b17.0
4b47.0
pd.merge(left1, right1, left_on='key', right_index=True, how='outer')
keyvaluegroup_val
0a03.5
2a23.5
3a33.5
1b17.0
4b47.0
5c5NaN
# 2
lefth = DataFrame({'key1': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
                   'key2': [2000, 2001, 2002, 2001, 2002],
                   'data': np.arange(5.)})
righth = DataFrame(np.arange(12).reshape((6, 2)),
                   index=[['Nevada', 'Nevada', 'Ohio', 'Ohio', 'Ohio', 'Ohio'],
                          [2001, 2000, 2000, 2000, 2001, 2002]],
                   columns=['event1', 'event2'])
print(lefth)
print(righth)
   data    key1  key2
0   0.0    Ohio  2000
1   1.0    Ohio  2001
2   2.0    Ohio  2002
3   3.0  Nevada  2001
4   4.0  Nevada  2002
             event1  event2
Nevada 2001       0       1
       2000       2       3
Ohio   2000       4       5
       2000       6       7
       2001       8       9
       2002      10      11
pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True)
datakey1key2event1event2
00.0Ohio200045
00.0Ohio200067
11.0Ohio200189
22.0Ohio20021011
33.0Nevada200101
pd.merge(lefth, righth, left_on=['key1', 'key2'],
         right_index=True, how='outer')
datakey1key2event1event2
00.0Ohio20004.05.0
00.0Ohio20006.07.0
11.0Ohio20018.09.0
22.0Ohio200210.011.0
33.0Nevada20010.01.0
44.0Nevada2002NaNNaN
4NaNNevada20002.03.0
left2 = DataFrame([[1., 2.], [3., 4.], [5., 6.]], index=['a', 'c', 'e'],
                  columns=['Ohio', 'Nevada'])
right2 = DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],
                   index=['b', 'c', 'd', 'e'], columns=['Missouri', 'Alabama'])
print(left2)
print(right2)
   Ohio  Nevada
a   1.0     2.0
c   3.0     4.0
e   5.0     6.0
   Missouri  Alabama
b       7.0      8.0
c       9.0     10.0
d      11.0     12.0
e      13.0     14.0
pd.merge(left2, right2, how='outer', left_index=True, right_index=True)
OhioNevadaMissouriAlabama
a1.02.0NaNNaN
bNaNNaN7.08.0
c3.04.09.010.0
dNaNNaN11.012.0
e5.06.013.014.0
# 3
left2.join(right2, how='outer')
OhioNevadaMissouriAlabama
a1.02.0NaNNaN
bNaNNaN7.08.0
c3.04.09.010.0
dNaNNaN11.012.0
e5.06.013.014.0
left1.join(right1, on='key')
keyvaluegroup_val
0a03.5
1b17.0
2a23.5
3a33.5
4b47.0
5c5NaN
# 4
another = DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]],
                    index=['a', 'c', 'e', 'f'], columns=['New York', 'Oregon'])
left2.join([right2, another])
OhioNevadaMissouriAlabamaNew YorkOregon
a1.02.0NaNNaN7.08.0
c3.04.09.010.09.010.0
e5.06.013.014.011.012.0
left2.join([right2, another], how='outer')
OhioNevadaMissouriAlabamaNew YorkOregon
a1.02.0NaNNaN7.08.0
bNaNNaN7.08.0NaNNaN
c3.04.09.010.09.010.0
dNaNNaN11.012.0NaNNaN
e5.06.013.014.011.012.0
fNaNNaNNaNNaN16.017.0

轴向连接

# 1
arr = np.arange(12).reshape((3, 4))
print(arr)

np.concatenate([arr, arr], axis=1)
[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]





array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])
# 2
s1 = Series([0, 1], index=['a', 'b'])
s2 = Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = Series([5, 6], index=['f', 'g'])

pd.concat([s1, s2, s3])
a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64
pd.concat([s1, s2, s3], axis=0)
a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64
pd.concat([s1, s2, s3], axis=1)
012
a0.0NaNNaN
b1.0NaNNaN
cNaN2.0NaN
dNaN3.0NaN
eNaN4.0NaN
fNaNNaN5.0
gNaNNaN6.0
s4 = pd.concat([s1 * 5, s3])
print(s4)
a    0
b    5
f    5
g    6
dtype: int64
pd.concat([s1, s4], axis=1)
01
a0.00
b1.05
fNaN5
gNaN6
pd.concat([s1, s4], axis=1, join='inner')
01
a00
b15
pd.concat([s1, s4], axis=1, join_axes=[['a', 'c', 'b', 'e']])
01
a0.00.0
cNaNNaN
b1.05.0
eNaNNaN
# 3
result = pd.concat([s1, s1, s3], keys=['one', 'two', 'three'])
print(result)
one    a    0
       b    1
two    a    0
       b    1
three  f    5
       g    6
dtype: int64
result.unstack()
abfg
one0.01.0NaNNaN
two0.01.0NaNNaN
threeNaNNaN5.06.0
# 4
pd.concat([s1, s2, s3], axis=1, keys=['one', 'two', 'three'])
onetwothree
a0.0NaNNaN
b1.0NaNNaN
cNaN2.0NaN
dNaN3.0NaN
eNaN4.0NaN
fNaNNaN5.0
gNaNNaN6.0
df1 = DataFrame(np.arange(6).reshape(3, 2), index=['a', 'b', 'c'],
                columns=['one', 'two'])
df2 = DataFrame(5 + np.arange(4).reshape(2, 2), index=['a', 'c'],
                columns=['three', 'four'])
pd.concat([df1, df2], axis=1, keys=['level1', 'level2'])
level1level2
onetwothreefour
a015.06.0
b23NaNNaN
c457.08.0
pd.concat({'level1': df1, 'level2': df2}, axis=1)
level1level2
onetwothreefour
a015.06.0
b23NaNNaN
c457.08.0
pd.concat([df1, df2], axis=1, keys=['level1', 'level2'],
          names=['upper', 'lower'])
upperlevel1level2
loweronetwothreefour
a015.06.0
b23NaNNaN
c457.08.0
# 5
df1 = DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])
df2 = DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])
print(df1)
print(df2)
          a         b         c         d
0 -0.204708  0.478943 -0.519439 -0.555730
1  1.965781  1.393406  0.092908  0.281746
2  0.769023  1.246435  1.007189 -1.296221
          b         d         a
0  0.274992  0.228913  1.352917
1  0.886429 -2.001637 -0.371843
pd.concat([df1, df2], ignore_index=True)
abcd
0-0.2047080.478943-0.519439-0.555730
11.9657811.3934060.0929080.281746
20.7690231.2464351.007189-1.296221
31.3529170.274992NaN0.228913
4-0.3718430.886429NaN-2.001637

合并重叠数据

# 1
a = Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
           index=['f', 'e', 'd', 'c', 'b', 'a'])
b = Series(np.arange(len(a), dtype=np.float64),
           index=['f', 'e', 'd', 'c', 'b', 'a'])
b[-1] = np.nan

print(a)
print(b)

np.where(pd.isnull(a), b, a)
f    NaN
e    2.5
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64
f    0.0
e    1.0
d    2.0
c    3.0
b    4.0
a    NaN
dtype: float64





array([ 0. ,  2.5,  2. ,  3.5,  4.5,  nan])
# 2
b[:-2].combine_first(a[2:])
a    NaN
b    4.5
c    3.0
d    2.0
e    1.0
f    0.0
dtype: float64
# 3
df1 = DataFrame({'a': [1., np.nan, 5., np.nan],
                 'b': [np.nan, 2., np.nan, 6.],
                 'c': range(2, 18, 4)})
df2 = DataFrame({'a': [5., 4., np.nan, 3., 7.],
                 'b': [np.nan, 3., 4., 6., 8.]})
df1.combine_first(df2)
abc
01.0NaN2.0
14.02.06.0
25.04.010.0
33.06.014.0
47.08.0NaN

重塑层次化索引

# 1
data = DataFrame(np.arange(6).reshape((2, 3)),
                 index=pd.Index(['Ohio', 'Colorado'], name='state'),
                 columns=pd.Index(['one', 'two', 'three'], name='number'))
print(data)
number    one  two  three
state                    
Ohio        0    1      2
Colorado    3    4      5
# stack 把 column 转成 index
result = data.stack()
print(result)
state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int32

在这里插入图片描述

# unstack 把 index 转成 column
result.unstack()
numberonetwothree
state
Ohio012
Colorado345

在这里插入图片描述

result.unstack(0)
stateOhioColorado
number
one03
two14
three25
result.unstack('state')
stateOhioColorado
number
one03
two14
three25
# 2
s1 = Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
s2 = Series([4, 5, 6], index=['c', 'd', 'e'])
data2 = pd.concat([s1, s2], keys=['one', 'two'])
print(data2)
one  a    0
     b    1
     c    2
     d    3
two  c    4
     d    5
     e    6
dtype: int64
# data2.stack() # 没有column 执行会报错
data2.unstack()
abcde
one0.01.02.03.0NaN
twoNaNNaN4.05.06.0

在这里插入图片描述

data2.unstack().stack()
one  a    0.0
     b    1.0
     c    2.0
     d    3.0
two  c    4.0
     d    5.0
     e    6.0
dtype: float64

在这里插入图片描述

data2.unstack().stack(dropna=False)
one  a    0.0
     b    1.0
     c    2.0
     d    3.0
     e    NaN
two  a    NaN
     b    NaN
     c    4.0
     d    5.0
     e    6.0
dtype: float64
# 3
df = DataFrame({'left': result, 'right': result + 5},
               columns=pd.Index(['left', 'right'], name='side'))
print(df)
side             left  right
state    number             
Ohio     one        0      5
         two        1      6
         three      2      7
Colorado one        3      8
         two        4      9
         three      5     10
df.unstack('state')
sideleftright
stateOhioColoradoOhioColorado
number
one0358
two1469
three25710
df.unstack('state').stack('side')
stateColoradoOhio
numberside
oneleft30
right85
twoleft41
right96
threeleft52
right107

长宽格式的转换

data = pd.read_csv('data/macrodata.csv')
print(data)
       year  quarter    realgdp  realcons   realinv  realgovt  realdpi  \
0    1959.0      1.0   2710.349    1707.4   286.898   470.045   1886.9   
1    1959.0      2.0   2778.801    1733.7   310.859   481.301   1919.7   
2    1959.0      3.0   2775.488    1751.8   289.226   491.260   1916.4   
3    1959.0      4.0   2785.204    1753.7   299.356   484.052   1931.3   
4    1960.0      1.0   2847.699    1770.5   331.722   462.199   1955.5   
5    1960.0      2.0   2834.390    1792.9   298.152   460.400   1966.1   
6    1960.0      3.0   2839.022    1785.8   296.375   474.676   1967.8   
7    1960.0      4.0   2802.616    1788.2   259.764   476.434   1966.6   
8    1961.0      1.0   2819.264    1787.7   266.405   475.854   1984.5   
9    1961.0      2.0   2872.005    1814.3   286.246   480.328   2014.4   
10   1961.0      3.0   2918.419    1823.1   310.227   493.828   2041.9   
11   1961.0      4.0   2977.830    1859.6   315.463   502.521   2082.0   
12   1962.0      1.0   3031.241    1879.4   334.271   520.960   2101.7   
13   1962.0      2.0   3064.709    1902.5   331.039   523.066   2125.2   
14   1962.0      3.0   3093.047    1917.9   336.962   538.838   2137.0   
15   1962.0      4.0   3100.563    1945.1   325.650   535.912   2154.6   
16   1963.0      1.0   3141.087    1958.2   343.721   522.917   2172.5   
17   1963.0      2.0   3180.447    1976.9   348.730   518.108   2193.1   
18   1963.0      3.0   3240.332    2003.8   360.102   546.893   2217.9   
19   1963.0      4.0   3264.967    2020.6   364.534   532.383   2254.6   
20   1964.0      1.0   3338.246    2060.5   379.523   529.686   2299.6   
21   1964.0      2.0   3376.587    2096.7   377.778   526.175   2362.1   
22   1964.0      3.0   3422.469    2135.2   386.754   522.008   2392.7   
23   1964.0      4.0   3431.957    2141.2   389.910   514.603   2420.4   
24   1965.0      1.0   3516.251    2188.8   429.145   508.006   2447.4   
25   1965.0      2.0   3563.960    2213.0   429.119   508.931   2474.5   
26   1965.0      3.0   3636.285    2251.0   444.444   529.446   2542.6   
27   1965.0      4.0   3724.014    2314.3   446.493   544.121   2594.1   
28   1966.0      1.0   3815.423    2348.5   484.244   556.593   2618.4   
29   1966.0      2.0   3828.124    2354.5   475.408   571.371   2624.7   
..      ...      ...        ...       ...       ...       ...      ...   
173  2002.0      2.0  11538.770    7997.8  1810.779   774.408   8658.9   
174  2002.0      3.0  11596.430    8052.0  1814.531   786.673   8629.2   
175  2002.0      4.0  11598.824    8080.6  1813.219   799.967   8649.6   
176  2003.0      1.0  11645.819    8122.3  1813.141   800.196   8681.3   
177  2003.0      2.0  11738.706    8197.8  1823.698   838.775   8812.5   
178  2003.0      3.0  11935.461    8312.1  1889.883   839.598   8935.4   
179  2003.0      4.0  12042.817    8358.0  1959.783   845.722   8986.4   
180  2004.0      1.0  12127.623    8437.6  1970.015   856.570   9025.9   
181  2004.0      2.0  12213.818    8483.2  2055.580   861.440   9115.0   
182  2004.0      3.0  12303.533    8555.8  2082.231   876.385   9175.9   
183  2004.0      4.0  12410.282    8654.2  2125.152   865.596   9303.4   
184  2005.0      1.0  12534.113    8719.0  2170.299   869.204   9189.6   
185  2005.0      2.0  12587.535    8802.9  2131.468   870.044   9253.0   
186  2005.0      3.0  12683.153    8865.6  2154.949   890.394   9308.0   
187  2005.0      4.0  12748.699    8888.5  2232.193   875.557   9358.7   
188  2006.0      1.0  12915.938    8986.6  2264.721   900.511   9533.8   
189  2006.0      2.0  12962.462    9035.0  2261.247   892.839   9617.3   
190  2006.0      3.0  12965.916    9090.7  2229.636   892.002   9662.5   
191  2006.0      4.0  13060.679    9181.6  2165.966   894.404   9788.8   
192  2007.0      1.0  13099.901    9265.1  2132.609   882.766   9830.2   
193  2007.0      2.0  13203.977    9291.5  2162.214   898.713   9842.7   
194  2007.0      3.0  13321.109    9335.6  2166.491   918.983   9883.9   
195  2007.0      4.0  13391.249    9363.6  2123.426   925.110   9886.2   
196  2008.0      1.0  13366.865    9349.6  2082.886   943.372   9826.8   
197  2008.0      2.0  13415.266    9351.0  2026.518   961.280  10059.0   
198  2008.0      3.0  13324.600    9267.7  1990.693   991.551   9838.3   
199  2008.0      4.0  13141.920    9195.3  1857.661  1007.273   9920.4   
200  2009.0      1.0  12925.410    9209.2  1558.494   996.287   9926.4   
201  2009.0      2.0  12901.504    9189.0  1456.678  1023.528  10077.5   
202  2009.0      3.0  12990.341    9256.0  1486.398  1044.088  10040.6   

         cpi      m1  tbilrate  unemp      pop  infl  realint  
0     28.980   139.7      2.82    5.8  177.146  0.00     0.00  
1     29.150   141.7      3.08    5.1  177.830  2.34     0.74  
2     29.350   140.5      3.82    5.3  178.657  2.74     1.09  
3     29.370   140.0      4.33    5.6  179.386  0.27     4.06  
4     29.540   139.6      3.50    5.2  180.007  2.31     1.19  
5     29.550   140.2      2.68    5.2  180.671  0.14     2.55  
6     29.750   140.9      2.36    5.6  181.528  2.70    -0.34  
7     29.840   141.1      2.29    6.3  182.287  1.21     1.08  
8     29.810   142.1      2.37    6.8  182.992 -0.40     2.77  
9     29.920   142.9      2.29    7.0  183.691  1.47     0.81  
10    29.980   144.1      2.32    6.8  184.524  0.80     1.52  
11    30.040   145.2      2.60    6.2  185.242  0.80     1.80  
12    30.210   146.4      2.73    5.6  185.874  2.26     0.47  
13    30.220   146.5      2.78    5.5  186.538  0.13     2.65  
14    30.380   146.7      2.78    5.6  187.323  2.11     0.67  
15    30.440   148.3      2.87    5.5  188.013  0.79     2.08  
16    30.480   149.7      2.90    5.8  188.580  0.53     2.38  
17    30.690   151.3      3.03    5.7  189.242  2.75     0.29  
18    30.750   152.6      3.38    5.5  190.028  0.78     2.60  
19    30.940   153.7      3.52    5.6  190.668  2.46     1.06  
20    30.950   154.8      3.51    5.5  191.245  0.13     3.38  
21    31.020   156.8      3.47    5.2  191.889  0.90     2.57  
22    31.120   159.2      3.53    5.0  192.631  1.29     2.25  
23    31.280   160.7      3.76    5.0  193.223  2.05     1.71  
24    31.380   162.0      3.93    4.9  193.709  1.28     2.65  
25    31.580   163.1      3.84    4.7  194.303  2.54     1.30  
26    31.650   166.0      3.93    4.4  194.997  0.89     3.04  
27    31.880   169.1      4.35    4.1  195.539  2.90     1.46  
28    32.280   171.8      4.62    3.9  195.999  4.99    -0.37  
29    32.450   170.3      4.65    3.8  196.560  2.10     2.55  
..       ...     ...       ...    ...      ...   ...      ...  
173  180.000  1199.5      1.70    5.8  288.028  1.56     0.14  
174  181.200  1204.0      1.61    5.7  288.783  2.66    -1.05  
175  182.600  1226.8      1.20    5.8  289.421  3.08    -1.88  
176  183.200  1248.4      1.14    5.9  290.019  1.31    -0.17  
177  183.700  1287.9      0.96    6.2  290.704  1.09    -0.13  
178  184.900  1297.3      0.94    6.1  291.449  2.60    -1.67  
179  186.300  1306.1      0.90    5.8  292.057  3.02    -2.11  
180  187.400  1332.1      0.94    5.7  292.635  2.35    -1.42  
181  189.100  1340.5      1.21    5.6  293.310  3.61    -2.41  
182  190.800  1361.0      1.63    5.4  294.066  3.58    -1.95  
183  191.800  1366.6      2.20    5.4  294.741  2.09     0.11  
184  193.800  1357.8      2.69    5.3  295.308  4.15    -1.46  
185  194.700  1366.6      3.01    5.1  295.994  1.85     1.16  
186  199.200  1375.0      3.52    5.0  296.770  9.14    -5.62  
187  199.400  1380.6      4.00    4.9  297.435  0.40     3.60  
188  200.700  1380.5      4.51    4.7  298.061  2.60     1.91  
189  202.700  1369.2      4.82    4.7  298.766  3.97     0.85  
190  201.900  1369.4      4.90    4.7  299.593 -1.58     6.48  
191  203.574  1373.6      4.92    4.4  300.320  3.30     1.62  
192  205.920  1379.7      4.95    4.5  300.977  4.58     0.36  
193  207.338  1370.0      4.72    4.5  301.714  2.75     1.97  
194  209.133  1379.2      4.00    4.7  302.509  3.45     0.55  
195  212.495  1377.4      3.01    4.8  303.204  6.38    -3.37  
196  213.997  1384.0      1.56    4.9  303.803  2.82    -1.26  
197  218.610  1409.3      1.74    5.4  304.483  8.53    -6.79  
198  216.889  1474.7      1.17    6.0  305.270 -3.16     4.33  
199  212.174  1576.5      0.12    6.9  305.952 -8.79     8.91  
200  212.671  1592.8      0.22    8.1  306.547  0.94    -0.71  
201  214.469  1653.6      0.18    9.2  307.226  3.37    -3.19  
202  216.385  1673.9      0.12    9.6  308.013  3.56    -3.44  

[203 rows x 14 columns]
periods = pd.PeriodIndex(year=data.year, quarter=data.quarter, name='date')
print(periods)
PeriodIndex(['1959Q1', '1959Q2', '1959Q3', '1959Q4', '1960Q1', '1960Q2',
             '1960Q3', '1960Q4', '1961Q1', '1961Q2',
             ...
             '2007Q2', '2007Q3', '2007Q4', '2008Q1', '2008Q2', '2008Q3',
             '2008Q4', '2009Q1', '2009Q2', '2009Q3'],
            dtype='period[Q-DEC]', name='date', length=203, freq='Q-DEC')
data = DataFrame(data.to_records(),
                 columns=pd.Index(['realgdp', 'infl', 'unemp'], name='item'),
                 index=periods.to_timestamp('D', 'end'))
print(data)
item          realgdp  infl  unemp
date                              
1959-03-31   2710.349  0.00    5.8
1959-06-30   2778.801  2.34    5.1
1959-09-30   2775.488  2.74    5.3
1959-12-31   2785.204  0.27    5.6
1960-03-31   2847.699  2.31    5.2
1960-06-30   2834.390  0.14    5.2
1960-09-30   2839.022  2.70    5.6
1960-12-31   2802.616  1.21    6.3
1961-03-31   2819.264 -0.40    6.8
1961-06-30   2872.005  1.47    7.0
1961-09-30   2918.419  0.80    6.8
1961-12-31   2977.830  0.80    6.2
1962-03-31   3031.241  2.26    5.6
1962-06-30   3064.709  0.13    5.5
1962-09-30   3093.047  2.11    5.6
1962-12-31   3100.563  0.79    5.5
1963-03-31   3141.087  0.53    5.8
1963-06-30   3180.447  2.75    5.7
1963-09-30   3240.332  0.78    5.5
1963-12-31   3264.967  2.46    5.6
1964-03-31   3338.246  0.13    5.5
1964-06-30   3376.587  0.90    5.2
1964-09-30   3422.469  1.29    5.0
1964-12-31   3431.957  2.05    5.0
1965-03-31   3516.251  1.28    4.9
1965-06-30   3563.960  2.54    4.7
1965-09-30   3636.285  0.89    4.4
1965-12-31   3724.014  2.90    4.1
1966-03-31   3815.423  4.99    3.9
1966-06-30   3828.124  2.10    3.8
...               ...   ...    ...
2002-06-30  11538.770  1.56    5.8
2002-09-30  11596.430  2.66    5.7
2002-12-31  11598.824  3.08    5.8
2003-03-31  11645.819  1.31    5.9
2003-06-30  11738.706  1.09    6.2
2003-09-30  11935.461  2.60    6.1
2003-12-31  12042.817  3.02    5.8
2004-03-31  12127.623  2.35    5.7
2004-06-30  12213.818  3.61    5.6
2004-09-30  12303.533  3.58    5.4
2004-12-31  12410.282  2.09    5.4
2005-03-31  12534.113  4.15    5.3
2005-06-30  12587.535  1.85    5.1
2005-09-30  12683.153  9.14    5.0
2005-12-31  12748.699  0.40    4.9
2006-03-31  12915.938  2.60    4.7
2006-06-30  12962.462  3.97    4.7
2006-09-30  12965.916 -1.58    4.7
2006-12-31  13060.679  3.30    4.4
2007-03-31  13099.901  4.58    4.5
2007-06-30  13203.977  2.75    4.5
2007-09-30  13321.109  3.45    4.7
2007-12-31  13391.249  6.38    4.8
2008-03-31  13366.865  2.82    4.9
2008-06-30  13415.266  8.53    5.4
2008-09-30  13324.600 -3.16    6.0
2008-12-31  13141.920 -8.79    6.9
2009-03-31  12925.410  0.94    8.1
2009-06-30  12901.504  3.37    9.2
2009-09-30  12990.341  3.56    9.6

[203 rows x 3 columns]
ldata = data.stack().reset_index().rename(columns={0: 'value'})
print(ldata)
          date     item      value
0   1959-03-31  realgdp   2710.349
1   1959-03-31     infl      0.000
2   1959-03-31    unemp      5.800
3   1959-06-30  realgdp   2778.801
4   1959-06-30     infl      2.340
5   1959-06-30    unemp      5.100
6   1959-09-30  realgdp   2775.488
7   1959-09-30     infl      2.740
8   1959-09-30    unemp      5.300
9   1959-12-31  realgdp   2785.204
10  1959-12-31     infl      0.270
11  1959-12-31    unemp      5.600
12  1960-03-31  realgdp   2847.699
13  1960-03-31     infl      2.310
14  1960-03-31    unemp      5.200
15  1960-06-30  realgdp   2834.390
16  1960-06-30     infl      0.140
17  1960-06-30    unemp      5.200
18  1960-09-30  realgdp   2839.022
19  1960-09-30     infl      2.700
20  1960-09-30    unemp      5.600
21  1960-12-31  realgdp   2802.616
22  1960-12-31     infl      1.210
23  1960-12-31    unemp      6.300
24  1961-03-31  realgdp   2819.264
25  1961-03-31     infl     -0.400
26  1961-03-31    unemp      6.800
27  1961-06-30  realgdp   2872.005
28  1961-06-30     infl      1.470
29  1961-06-30    unemp      7.000
..         ...      ...        ...
579 2007-06-30  realgdp  13203.977
580 2007-06-30     infl      2.750
581 2007-06-30    unemp      4.500
582 2007-09-30  realgdp  13321.109
583 2007-09-30     infl      3.450
584 2007-09-30    unemp      4.700
585 2007-12-31  realgdp  13391.249
586 2007-12-31     infl      6.380
587 2007-12-31    unemp      4.800
588 2008-03-31  realgdp  13366.865
589 2008-03-31     infl      2.820
590 2008-03-31    unemp      4.900
591 2008-06-30  realgdp  13415.266
592 2008-06-30     infl      8.530
593 2008-06-30    unemp      5.400
594 2008-09-30  realgdp  13324.600
595 2008-09-30     infl     -3.160
596 2008-09-30    unemp      6.000
597 2008-12-31  realgdp  13141.920
598 2008-12-31     infl     -8.790
599 2008-12-31    unemp      6.900
600 2009-03-31  realgdp  12925.410
601 2009-03-31     infl      0.940
602 2009-03-31    unemp      8.100
603 2009-06-30  realgdp  12901.504
604 2009-06-30     infl      3.370
605 2009-06-30    unemp      9.200
606 2009-09-30  realgdp  12990.341
607 2009-09-30     infl      3.560
608 2009-09-30    unemp      9.600

[609 rows x 3 columns]
wdata = ldata.pivot(index='date',columns= 'item',values= 'value')
print(wdata)
item        infl    realgdp  unemp
date                              
1959-03-31  0.00   2710.349    5.8
1959-06-30  2.34   2778.801    5.1
1959-09-30  2.74   2775.488    5.3
1959-12-31  0.27   2785.204    5.6
1960-03-31  2.31   2847.699    5.2
1960-06-30  0.14   2834.390    5.2
1960-09-30  2.70   2839.022    5.6
1960-12-31  1.21   2802.616    6.3
1961-03-31 -0.40   2819.264    6.8
1961-06-30  1.47   2872.005    7.0
1961-09-30  0.80   2918.419    6.8
1961-12-31  0.80   2977.830    6.2
1962-03-31  2.26   3031.241    5.6
1962-06-30  0.13   3064.709    5.5
1962-09-30  2.11   3093.047    5.6
1962-12-31  0.79   3100.563    5.5
1963-03-31  0.53   3141.087    5.8
1963-06-30  2.75   3180.447    5.7
1963-09-30  0.78   3240.332    5.5
1963-12-31  2.46   3264.967    5.6
1964-03-31  0.13   3338.246    5.5
1964-06-30  0.90   3376.587    5.2
1964-09-30  1.29   3422.469    5.0
1964-12-31  2.05   3431.957    5.0
1965-03-31  1.28   3516.251    4.9
1965-06-30  2.54   3563.960    4.7
1965-09-30  0.89   3636.285    4.4
1965-12-31  2.90   3724.014    4.1
1966-03-31  4.99   3815.423    3.9
1966-06-30  2.10   3828.124    3.8
...          ...        ...    ...
2002-06-30  1.56  11538.770    5.8
2002-09-30  2.66  11596.430    5.7
2002-12-31  3.08  11598.824    5.8
2003-03-31  1.31  11645.819    5.9
2003-06-30  1.09  11738.706    6.2
2003-09-30  2.60  11935.461    6.1
2003-12-31  3.02  12042.817    5.8
2004-03-31  2.35  12127.623    5.7
2004-06-30  3.61  12213.818    5.6
2004-09-30  3.58  12303.533    5.4
2004-12-31  2.09  12410.282    5.4
2005-03-31  4.15  12534.113    5.3
2005-06-30  1.85  12587.535    5.1
2005-09-30  9.14  12683.153    5.0
2005-12-31  0.40  12748.699    4.9
2006-03-31  2.60  12915.938    4.7
2006-06-30  3.97  12962.462    4.7
2006-09-30 -1.58  12965.916    4.7
2006-12-31  3.30  13060.679    4.4
2007-03-31  4.58  13099.901    4.5
2007-06-30  2.75  13203.977    4.5
2007-09-30  3.45  13321.109    4.7
2007-12-31  6.38  13391.249    4.8
2008-03-31  2.82  13366.865    4.9
2008-06-30  8.53  13415.266    5.4
2008-09-30 -3.16  13324.600    6.0
2008-12-31 -8.79  13141.920    6.9
2009-03-31  0.94  12925.410    8.1
2009-06-30  3.37  12901.504    9.2
2009-09-30  3.56  12990.341    9.6

[203 rows x 3 columns]
# 2
ldata[:10]
dateitemvalue
01959-03-31realgdp2710.349
11959-03-31infl0.000
21959-03-31unemp5.800
31959-06-30realgdp2778.801
41959-06-30infl2.340
51959-06-30unemp5.100
61959-09-30realgdp2775.488
71959-09-30infl2.740
81959-09-30unemp5.300
91959-12-31realgdp2785.204
pivoted = ldata.pivot('date', 'item', 'value')
pivoted.head()
iteminflrealgdpunemp
date
1959-03-310.002710.3495.8
1959-06-302.342778.8015.1
1959-09-302.742775.4885.3
1959-12-310.272785.2045.6
1960-03-312.312847.6995.2
ldata['value2'] = np.random.randn(len(ldata))
ldata[:10]
dateitemvaluevalue2
01959-03-31realgdp2710.3491.669025
11959-03-31infl0.000-0.438570
21959-03-31unemp5.800-0.539741
31959-06-30realgdp2778.8010.476985
41959-06-30infl2.3403.248944
51959-06-30unemp5.100-1.021228
61959-09-30realgdp2775.488-0.577087
71959-09-30infl2.7400.124121
81959-09-30unemp5.3000.302614
91959-12-31realgdp2785.2040.523772
pivoted = ldata.pivot('date', 'item')
pivoted[:5]
valuevalue2
iteminflrealgdpunempinflrealgdpunemp
date
1959-03-310.002710.3495.8-0.4385701.669025-0.539741
1959-06-302.342778.8015.13.2489440.476985-1.021228
1959-09-302.742775.4885.30.124121-0.5770870.302614
1959-12-310.272785.2045.60.0009400.5237721.343810
1960-03-312.312847.6995.2-0.831154-0.713544-2.370232
pivoted['value'][:5]
iteminflrealgdpunemp
date
1959-03-310.002710.3495.8
1959-06-302.342778.8015.1
1959-09-302.742775.4885.3
1959-12-310.272785.2045.6
1960-03-312.312847.6995.2
unstacked = ldata.set_index(['date', 'item']).unstack('item')
unstacked[:7]
valuevalue2
iteminflrealgdpunempinflrealgdpunemp
date
1959-03-310.002710.3495.8-0.4385701.669025-0.539741
1959-06-302.342778.8015.13.2489440.476985-1.021228
1959-09-302.742775.4885.30.124121-0.5770870.302614
1959-12-310.272785.2045.60.0009400.5237721.343810
1960-03-312.312847.6995.2-0.831154-0.713544-2.370232
1960-06-300.142834.3905.2-0.860757-1.8607610.560145
1960-09-302.702839.0225.60.119827-1.265934-1.063512

移除重复数据

data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
                  'k2': [1, 1, 2, 3, 3, 4, 4]})
print(data)
    k1  k2
0  one   1
1  one   1
2  one   2
3  two   3
4  two   3
5  two   4
6  two   4
data.duplicated()
0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool
data.drop_duplicates()
k1k2
0one1
2one2
3two3
5two4
data['v1'] = range(7)
print(data)
    k1  k2  v1
0  one   1   0
1  one   1   1
2  one   2   2
3  two   3   3
4  two   3   4
5  two   4   5
6  two   4   6
data.drop_duplicates(['k1'])
k1k2v1
0one10
3two33
data.drop_duplicates(['k1', 'k2'], keep='last')
k1k2v1
1one11
2one22
4two34
6two46

利用函数或映射进行数据转换

# 1
data = DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami',
                           'corned beef', 'Bacon', 'pastrami', 'honey ham',
                           'nova lox'],
                  'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
print(data)
          food  ounces
0        bacon     4.0
1  pulled pork     3.0
2        bacon    12.0
3     Pastrami     6.0
4  corned beef     7.5
5        Bacon     8.0
6     pastrami     3.0
7    honey ham     5.0
8     nova lox     6.0
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon'
}

data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
print(data)
          food  ounces  animal
0        bacon     4.0     pig
1  pulled pork     3.0     pig
2        bacon    12.0     pig
3     Pastrami     6.0     cow
4  corned beef     7.5     cow
5        Bacon     8.0     pig
6     pastrami     3.0     cow
7    honey ham     5.0     pig
8     nova lox     6.0  salmon
data['food'].map(lambda x: meat_to_animal[x.lower()])
0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

数据标准化

datafile = 'data/normalization_data.xls'  # 参数初始化
data = pd.read_excel(datafile, header=None)  # 读取数据
print(data)
     0    1    2     3
0   78  521  602  2863
1  144 -600 -521  2245
2   95 -457  468 -1283
3   69  596  695  1054
4  190  527  691  2051
5  101  403  470  2487
6  146  413  435  2571

最小-最大规范化

(data - data.min()) / (data.max() - data.min())  # 最小-最大规范化
0123
00.0743800.9372910.9235201.000000
10.6198350.0000000.0000000.850941
20.2148760.1195650.8133220.000000
30.0000001.0000001.0000000.563676
41.0000000.9423080.9967110.804149
50.2644630.8386290.8149670.909310
60.6363640.8469900.7861840.929571

零-均值规范化

(data - data.mean()) / data.std()  # 零-均值规范化
0123
0-0.9053830.6358630.4645310.798149
10.604678-1.587675-2.1931670.369390
2-0.516428-1.3040300.147406-2.078279
3-1.1113010.7846280.684625-0.456906
41.6571460.6477650.6751590.234796
5-0.3791500.4018070.1521390.537286
60.6504380.4216420.0693080.595564

小数定标规范化

data / 10 ** np.ceil(np.log10(data.abs().max()))  # 小数定标规范化
0123
00.0780.5210.6020.2863
10.144-0.600-0.5210.2245
20.095-0.4570.468-0.1283
30.0690.5960.6950.1054
40.1900.5270.6910.2051
50.1010.4030.4700.2487
60.1460.4130.4350.2571

替换值

data = Series([1., -999., 2., -999., -1000., 3.])
print(data)
0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64
data.replace(-999, np.nan)
0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64
data.replace([-999, -1000], np.nan)
0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64
data.replace([-999, -1000], [np.nan, 0])
0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64
data.replace({-999: np.nan, -1000: 0})
0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

重命名轴索引

data = DataFrame(np.arange(12).reshape((3, 4)),
                 index=['Ohio', 'Colorado', 'New York'],
                 columns=['one', 'two', 'three', 'four'])
print(data)
          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
New York    8    9     10    11
data.index.map(str.upper)
Index(['OHIO', 'COLORADO', 'NEW YORK'], dtype='object')
data.index = data.index.map(str.upper)
print(data)
          one  two  three  four
OHIO        0    1      2     3
COLORADO    4    5      6     7
NEW YORK    8    9     10    11
data.rename(index=str.title, columns=str.upper)
ONETWOTHREEFOUR
Ohio0123
Colorado4567
New York891011
data.rename(index={'OHIO': 'INDIANA'},
            columns={'three': 'peekaboo'})
onetwopeekaboofour
INDIANA0123
COLORADO4567
NEW YORK891011
# 总是返回DataFrame的引用
data.rename(index={'OHIO': 'INDIANA'}, inplace=True)
print(data)
          one  two  three  four
INDIANA     0    1      2     3
COLORADO    4    5      6     7
NEW YORK    8    9     10    11

离散化与面元划分

# 1
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
print(cats)
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
cats.codes
array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)
cats.categories
IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]
              closed='right',
              dtype='interval[int64]')
pd.value_counts(cats)
(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64
pd.cut(ages, [18, 26, 36, 61, 100], right=False)
[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names)
[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [MiddleAged < Senior < YoungAdult < Youth]
data = np.random.rand(20)
pd.cut(data, 4, precision=2)
[(0.45, 0.67], (0.23, 0.45], (0.0037, 0.23], (0.45, 0.67], (0.67, 0.9], ..., (0.67, 0.9], (0.0037, 0.23], (0.0037, 0.23], (0.23, 0.45], (0.23, 0.45]]
Length: 20
Categories (4, interval[float64]): [(0.0037, 0.23] < (0.23, 0.45] < (0.45, 0.67] < (0.67, 0.9]]
# 2
data = np.random.randn(1000)  # Normally distributed
cats = pd.qcut(data, 4)  # Cut into quartiles
print(cats)
[(-0.022, 0.641], (-3.746, -0.635], (0.641, 3.26], (-3.746, -0.635], (-0.022, 0.641], ..., (-0.022, 0.641], (0.641, 3.26], (-0.635, -0.022], (0.641, 3.26], (-0.635, -0.022]]
Length: 1000
Categories (4, interval[float64]): [(-3.746, -0.635] < (-0.635, -0.022] < (-0.022, 0.641] < (0.641, 3.26]]
pd.value_counts(cats)
(0.641, 3.26]       250
(-0.022, 0.641]     250
(-0.635, -0.022]    250
(-3.746, -0.635]    250
dtype: int64
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])
[(-0.022, 1.302], (-1.266, -0.022], (-0.022, 1.302], (-3.746, -1.266], (-0.022, 1.302], ..., (-0.022, 1.302], (-0.022, 1.302], (-1.266, -0.022], (-0.022, 1.302], (-1.266, -0.022]]
Length: 1000
Categories (4, interval[float64]): [(-3.746, -1.266] < (-1.266, -0.022] < (-0.022, 1.302] < (1.302, 3.26]]

检测和过滤异常值

# 1
np.random.seed(12345)
data = DataFrame(np.random.randn(1000, 4))
data.describe()
0123
count1000.0000001000.0000001000.0000001000.000000
mean-0.0676840.0679240.025598-0.002298
std0.9980350.9921061.0068350.996794
min-3.428254-3.548824-3.184377-3.745356
25%-0.774890-0.591841-0.641675-0.644144
50%-0.1164010.1011430.002073-0.013611
75%0.6163660.7802820.6803910.654328
max3.3666262.6536563.2603833.927528
col = data[3]
col[np.abs(col) > 3]
97     3.927528
305   -3.399312
400   -3.745356
Name: 3, dtype: float64
data[(np.abs(data) > 3).any(1)]
0123
5-0.5397410.4769853.248944-1.021228
97-0.7743630.5529360.1060613.927528
102-0.655054-0.5652303.1768730.959533
305-2.3155550.457246-0.025907-3.399312
3240.0501881.9513123.2603830.963301
4000.1463260.508391-0.196713-3.745356
499-0.293333-0.242459-3.0569901.918403
523-3.428254-0.296336-0.439938-0.867165
5860.2751441.179227-3.1843771.369891
808-0.362528-3.5488241.553205-2.186301
9003.366626-2.3722140.8510101.332846
# 2
data[np.abs(data) > 3] = np.sign(data) * 3
data.describe()
0123
count1000.0000001000.0000001000.0000001000.000000
mean-0.0676230.0684730.025153-0.002081
std0.9954850.9902531.0039770.989736
min-3.000000-3.000000-3.000000-3.000000
25%-0.774890-0.591841-0.641675-0.644144
50%-0.1164010.1011430.002073-0.013611
75%0.6163660.7802820.6803910.654328
max3.0000002.6536563.0000003.000000

排列与随机采样

df = DataFrame(np.arange(5 * 4).reshape((5, 4)))
print(df)
    0   1   2   3
0   0   1   2   3
1   4   5   6   7
2   8   9  10  11
3  12  13  14  15
4  16  17  18  19
sampler = np.random.permutation(5)
print(sampler)
[1 0 2 3 4]
df.take(sampler)
0123
14567
00123
2891011
312131415
416171819
# 2
df.take(np.random.permutation(len(df))[:3])
0123
14567
312131415
416171819
# 3
bag = np.array([5, 7, -1, 6, 4])
sampler = np.random.randint(0, len(bag), size=10)
print(sampler)
[4 4 2 2 2 0 3 0 4 1]
draws = bag.take(sampler)
print(draws)
[ 4  4 -1 -1 -1  5  6  5  4  7]

计算指标与哑变量

# 1
df = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                'data1': range(6)})
print(df)
   data1 key
0      0   b
1      1   b
2      2   a
3      3   c
4      4   a
5      5   b
pd.get_dummies(df['key'])
abc
0010
1010
2100
3001
4100
5010
dummies = pd.get_dummies(df['key'], prefix='key')
print(dummies)
   key_a  key_b  key_c
0      0      1      0
1      0      1      0
2      1      0      0
3      0      0      1
4      1      0      0
5      0      1      0
df_with_dummy = df[['data1']].join(dummies)
print(df_with_dummy)
   data1  key_a  key_b  key_c
0      0      0      1      0
1      1      0      1      0
2      2      1      0      0
3      3      0      0      1
4      4      1      0      0
5      5      0      1      0
# 2
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('data/movies.dat', sep='::', header=None,
                       names=mnames)
movies[:10]
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:4: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
  after removing the cwd from sys.path.
movie_idtitlegenres
01Toy Story (1995)Animation|Children's|Comedy
12Jumanji (1995)Adventure|Children's|Fantasy
23Grumpier Old Men (1995)Comedy|Romance
34Waiting to Exhale (1995)Comedy|Drama
45Father of the Bride Part II (1995)Comedy
56Heat (1995)Action|Crime|Thriller
67Sabrina (1995)Comedy|Romance
78Tom and Huck (1995)Adventure|Children's
89Sudden Death (1995)Action
910GoldenEye (1995)Action|Adventure|Thriller
genre_iter = (set(x.split('|')) for x in movies.genres)
genres = sorted(set.union(*genre_iter))
print(genres)
['Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
dummies = DataFrame(np.zeros((len(movies), len(genres))), columns=genres)
print(dummies)
      Action  Adventure  Animation  Children's  Comedy  Crime  Documentary  \
0        0.0        0.0        0.0         0.0     0.0    0.0          0.0   
1        0.0        0.0        0.0         0.0     0.0    0.0          0.0   
2        0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3        0.0        0.0        0.0         0.0     0.0    0.0          0.0   
4        0.0        0.0        0.0         0.0     0.0    0.0          0.0   
5        0.0        0.0        0.0         0.0     0.0    0.0          0.0   
6        0.0        0.0        0.0         0.0     0.0    0.0          0.0   
7        0.0        0.0        0.0         0.0     0.0    0.0          0.0   
8        0.0        0.0        0.0         0.0     0.0    0.0          0.0   
9        0.0        0.0        0.0         0.0     0.0    0.0          0.0   
10       0.0        0.0        0.0         0.0     0.0    0.0          0.0   
11       0.0        0.0        0.0         0.0     0.0    0.0          0.0   
12       0.0        0.0        0.0         0.0     0.0    0.0          0.0   
13       0.0        0.0        0.0         0.0     0.0    0.0          0.0   
14       0.0        0.0        0.0         0.0     0.0    0.0          0.0   
15       0.0        0.0        0.0         0.0     0.0    0.0          0.0   
16       0.0        0.0        0.0         0.0     0.0    0.0          0.0   
17       0.0        0.0        0.0         0.0     0.0    0.0          0.0   
18       0.0        0.0        0.0         0.0     0.0    0.0          0.0   
19       0.0        0.0        0.0         0.0     0.0    0.0          0.0   
20       0.0        0.0        0.0         0.0     0.0    0.0          0.0   
21       0.0        0.0        0.0         0.0     0.0    0.0          0.0   
22       0.0        0.0        0.0         0.0     0.0    0.0          0.0   
23       0.0        0.0        0.0         0.0     0.0    0.0          0.0   
24       0.0        0.0        0.0         0.0     0.0    0.0          0.0   
25       0.0        0.0        0.0         0.0     0.0    0.0          0.0   
26       0.0        0.0        0.0         0.0     0.0    0.0          0.0   
27       0.0        0.0        0.0         0.0     0.0    0.0          0.0   
28       0.0        0.0        0.0         0.0     0.0    0.0          0.0   
29       0.0        0.0        0.0         0.0     0.0    0.0          0.0   
...      ...        ...        ...         ...     ...    ...          ...   
3853     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3854     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3855     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3856     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3857     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3858     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3859     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3860     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3861     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3862     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3863     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3864     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3865     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3866     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3867     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3868     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3869     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3870     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3871     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3872     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3873     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3874     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3875     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3876     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3877     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3878     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3879     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3880     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3881     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3882     0.0        0.0        0.0         0.0     0.0    0.0          0.0   

      Drama  Fantasy  Film-Noir  Horror  Musical  Mystery  Romance  Sci-Fi  \
0       0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
1       0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
2       0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3       0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
4       0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
5       0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
6       0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
7       0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
8       0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
9       0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
10      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
11      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
12      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
13      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
14      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
15      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
16      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
17      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
18      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
19      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
20      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
21      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
22      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
23      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
24      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
25      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
26      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
27      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
28      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
29      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
...     ...      ...        ...     ...      ...      ...      ...     ...   
3853    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3854    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3855    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3856    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3857    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3858    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3859    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3860    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3861    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3862    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3863    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3864    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3865    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3866    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3867    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3868    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3869    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3870    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3871    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3872    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3873    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3874    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3875    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3876    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3877    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3878    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3879    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3880    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3881    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3882    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   

      Thriller  War  Western  
0          0.0  0.0      0.0  
1          0.0  0.0      0.0  
2          0.0  0.0      0.0  
3          0.0  0.0      0.0  
4          0.0  0.0      0.0  
5          0.0  0.0      0.0  
6          0.0  0.0      0.0  
7          0.0  0.0      0.0  
8          0.0  0.0      0.0  
9          0.0  0.0      0.0  
10         0.0  0.0      0.0  
11         0.0  0.0      0.0  
12         0.0  0.0      0.0  
13         0.0  0.0      0.0  
14         0.0  0.0      0.0  
15         0.0  0.0      0.0  
16         0.0  0.0      0.0  
17         0.0  0.0      0.0  
18         0.0  0.0      0.0  
19         0.0  0.0      0.0  
20         0.0  0.0      0.0  
21         0.0  0.0      0.0  
22         0.0  0.0      0.0  
23         0.0  0.0      0.0  
24         0.0  0.0      0.0  
25         0.0  0.0      0.0  
26         0.0  0.0      0.0  
27         0.0  0.0      0.0  
28         0.0  0.0      0.0  
29         0.0  0.0      0.0  
...        ...  ...      ...  
3853       0.0  0.0      0.0  
3854       0.0  0.0      0.0  
3855       0.0  0.0      0.0  
3856       0.0  0.0      0.0  
3857       0.0  0.0      0.0  
3858       0.0  0.0      0.0  
3859       0.0  0.0      0.0  
3860       0.0  0.0      0.0  
3861       0.0  0.0      0.0  
3862       0.0  0.0      0.0  
3863       0.0  0.0      0.0  
3864       0.0  0.0      0.0  
3865       0.0  0.0      0.0  
3866       0.0  0.0      0.0  
3867       0.0  0.0      0.0  
3868       0.0  0.0      0.0  
3869       0.0  0.0      0.0  
3870       0.0  0.0      0.0  
3871       0.0  0.0      0.0  
3872       0.0  0.0      0.0  
3873       0.0  0.0      0.0  
3874       0.0  0.0      0.0  
3875       0.0  0.0      0.0  
3876       0.0  0.0      0.0  
3877       0.0  0.0      0.0  
3878       0.0  0.0      0.0  
3879       0.0  0.0      0.0  
3880       0.0  0.0      0.0  
3881       0.0  0.0      0.0  
3882       0.0  0.0      0.0  

[3883 rows x 18 columns]
for i, gen in enumerate(movies.genres):
    dummies.loc[i, gen.split('|')] = 1

movies_windic = movies.join(dummies.add_prefix('Genre_'))
print(movies_windic.loc[0])
movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Action                                   0
Genre_Adventure                                0
Genre_Animation                                1
Genre_Children's                               1
Genre_Comedy                                   1
Genre_Crime                                    0
Genre_Documentary                              0
Genre_Drama                                    0
Genre_Fantasy                                  0
Genre_Film-Noir                                0
Genre_Horror                                   0
Genre_Musical                                  0
Genre_Mystery                                  0
Genre_Romance                                  0
Genre_Sci-Fi                                   0
Genre_Thriller                                 0
Genre_War                                      0
Genre_Western                                  0
Name: 0, dtype: object
# 3
np.random.seed(12345)
values = np.random.rand(10)
print(values)
[ 0.92961609  0.31637555  0.18391881  0.20456028  0.56772503  0.5955447
  0.96451452  0.6531771   0.74890664  0.65356987]
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
pd.get_dummies(pd.cut(values, bins))
(0.0, 0.2](0.2, 0.4](0.4, 0.6](0.6, 0.8](0.8, 1.0]
000001
101000
210000
301000
400100
500100
600001
700010
800010
900010

属性构造

# 参数初始化
inputfile = 'data/electricity_data.xls'  # 供入供出电量数据
outputfile = 'data/electricity_data.xls'  # 属性构造后数据文件

data = pd.read_excel(inputfile)  # 读入数据
data[u'线损率'] = (data[u'供入电量'] - data[u'供出电量']) / data[u'供入电量']

data.to_excel(outputfile, index=False)  # 保存结果

字符串对象方法

val = 'a,b,  guido'
val.split(',')
['a', 'b', '  guido']
pieces = [x.strip() for x in val.split(',')]
print(pieces)
['a', 'b', 'guido']
first, second, third = pieces
first + '::' + second + '::' + third
'a::b::guido'
'::'.join(pieces)
'a::b::guido'
'guido' in val
True
val.index(',')
1
val.find(':')
-1
val.index(':')
---------------------------------------------------------------------------

ValueError                                Traceback (most recent call last)

<ipython-input-138-280f8b2856ce> in <module>()
----> 1 val.index(':')


ValueError: substring not found
val.count('a')
1
val.replace(',', '::')
'a::b::  guido'
val.replace(',', '')
'ab  guido'

正则表达式

# 1
import re

text = "foo    bar\t baz  \tqux"
re.split('\s+', text)
['foo', 'bar', 'baz', 'qux']
regex = re.compile('\s+')
regex.split(text)
['foo', 'bar', 'baz', 'qux']
regex.findall(text)
['    ', '\t ', '  \t']
# 2
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

# re.IGNORECASE 的作用是使正则表达式对大小写不敏感
regex = re.compile(pattern, flags=re.IGNORECASE)

regex.findall(text)
['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']
m = regex.search(text)
print(m)
<_sre.SRE_Match object; span=(5, 20), match='dave@google.com'>
text[m.start():m.end()]
'dave@google.com'
print(regex.match(text))
None
print(regex.sub('REDACTED', text))
Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED
# 3
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, flags=re.IGNORECASE)

m = regex.match('wesm@bright.net')
m.groups()
('wesm', 'bright', 'net')
regex.findall(text)
[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))
Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com
# 4
regex = re.compile(r"""
    (?P<username>[A-Z0-9._%+-]+)
    @
    (?P<domain>[A-Z0-9.-]+)
    \.
    (?P<suffix>[A-Z]{2,4})""", flags=re.IGNORECASE | re.VERBOSE)

m = regex.match('wesm@bright.net')
m.groupdict()
{'domain': 'bright', 'suffix': 'net', 'username': 'wesm'}

pandas中矢量化的字符串函数

data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
        'Rob': 'rob@gmail.com', 'Wes': np.nan}
data = Series(data)
print(data)
Dave     dave@google.com
Rob        rob@gmail.com
Steve    steve@gmail.com
Wes                  NaN
dtype: object
data.isnull()
Dave     False
Rob      False
Steve    False
Wes       True
dtype: bool
data.str.contains('gmail')
Dave     False
Rob       True
Steve     True
Wes        NaN
dtype: object
data.str.findall(pattern, flags=re.IGNORECASE)
Dave     [(dave, google, com)]
Rob        [(rob, gmail, com)]
Steve    [(steve, gmail, com)]
Wes                        NaN
dtype: object
matches = data.str.match(pattern, flags=re.IGNORECASE)
print(matches)
Dave     True
Rob      True
Steve    True
Wes       NaN
dtype: object
matches.str.get(1)
Dave    NaN
Rob     NaN
Steve   NaN
Wes     NaN
dtype: float64
matches.str[0]
Dave    NaN
Rob     NaN
Steve   NaN
Wes     NaN
dtype: float64
data.str[:5]
Dave     dave@
Rob      rob@g
Steve    steve
Wes        NaN
dtype: object

示例:USDA食品数据库

'''
{
  "id": 21441,
  "description": "KENTUCKY FRIED CHICKEN, Fried Chicken, EXTRA CRISPY,
Wing, meat and skin with breading",
  "tags": ["KFC"],
  "manufacturer": "Kentucky Fried Chicken",
  "group": "Fast Foods",
  "portions": [
    {
      "amount": 1,
      "unit": "wing, with skin",
      "grams": 68.0
    },

    ...
  ],
  "nutrients": [
    {
      "value": 20.8,
      "units": "g",
      "description": "Protein",
      "group": "Composition"
    },

    ...
  ]
}
'''

import json

db = json.load(open('data/foods-2011-10-03.json'))
len(db)
6636
db[0].keys()
dict_keys(['id', 'description', 'tags', 'manufacturer', 'group', 'portions', 'nutrients'])
db[0]['nutrients'][0]
{'description': 'Protein',
 'group': 'Composition',
 'units': 'g',
 'value': 25.18}
nutrients = DataFrame(db[0]['nutrients'])
nutrients[:7]
descriptiongroupunitsvalue
0ProteinCompositiong25.18
1Total lipid (fat)Compositiong29.20
2Carbohydrate, by differenceCompositiong3.06
3AshOtherg3.28
4EnergyEnergykcal376.00
5WaterCompositiong39.28
6EnergyEnergykJ1573.00
info_keys = ['description', 'group', 'id', 'manufacturer']
info = DataFrame(db, columns=info_keys)

info[:5]
descriptiongroupidmanufacturer
0Cheese, carawayDairy and Egg Products1008
1Cheese, cheddarDairy and Egg Products1009
2Cheese, edamDairy and Egg Products1018
3Cheese, fetaDairy and Egg Products1019
4Cheese, mozzarella, part skim milkDairy and Egg Products1028
pd.value_counts(info.group)[:10]
Vegetables and Vegetable Products    812
Beef Products                        618
Baked Products                       496
Breakfast Cereals                    403
Fast Foods                           365
Legumes and Legume Products          365
Lamb, Veal, and Game Products        345
Sweets                               341
Fruits and Fruit Juices              328
Pork Products                        328
Name: group, dtype: int64
nutrients = []

for rec in db:
    fnuts = DataFrame(rec['nutrients'])
    fnuts['id'] = rec['id']
    nutrients.append(fnuts)

nutrients = pd.concat(nutrients, ignore_index=True)

print(nutrients)
                               description        group    units     value  \
0                                  Protein  Composition        g    25.180   
1                        Total lipid (fat)  Composition        g    29.200   
2              Carbohydrate, by difference  Composition        g     3.060   
3                                      Ash        Other        g     3.280   
4                                   Energy       Energy     kcal   376.000   
5                                    Water  Composition        g    39.280   
6                                   Energy       Energy       kJ  1573.000   
7                     Fiber, total dietary  Composition        g     0.000   
8                              Calcium, Ca     Elements       mg   673.000   
9                                 Iron, Fe     Elements       mg     0.640   
10                           Magnesium, Mg     Elements       mg    22.000   
11                           Phosphorus, P     Elements       mg   490.000   
12                            Potassium, K     Elements       mg    93.000   
13                              Sodium, Na     Elements       mg   690.000   
14                                Zinc, Zn     Elements       mg     2.940   
15                              Copper, Cu     Elements       mg     0.024   
16                           Manganese, Mn     Elements       mg     0.021   
17                            Selenium, Se     Elements      mcg    14.500   
18                           Vitamin A, IU     Vitamins       IU  1054.000   
19                                 Retinol     Vitamins      mcg   262.000   
20                          Vitamin A, RAE     Vitamins  mcg_RAE   271.000   
21          Vitamin C, total ascorbic acid     Vitamins       mg     0.000   
22                                 Thiamin     Vitamins       mg     0.031   
23                              Riboflavin     Vitamins       mg     0.450   
24                                  Niacin     Vitamins       mg     0.180   
25                        Pantothenic acid     Vitamins       mg     0.190   
26                             Vitamin B-6     Vitamins       mg     0.074   
27                           Folate, total     Vitamins      mcg    18.000   
28                            Vitamin B-12     Vitamins      mcg     0.270   
29                              Folic acid     Vitamins      mcg     0.000   
...                                    ...          ...      ...       ...   
389325                        Selenium, Se     Elements      mcg     1.100   
389326                       Vitamin A, IU     Vitamins       IU     5.000   
389327                             Retinol     Vitamins      mcg     0.000   
389328                      Vitamin A, RAE     Vitamins  mcg_RAE     0.000   
389329                      Carotene, beta     Vitamins      mcg     2.000   
389330                     Carotene, alpha     Vitamins      mcg     2.000   
389331        Vitamin E (alpha-tocopherol)     Vitamins       mg     0.250   
389332                           Vitamin D     Vitamins       IU     0.000   
389333                 Vitamin D (D2 + D3)     Vitamins      mcg     0.000   
389334                 Cryptoxanthin, beta     Vitamins      mcg     0.000   
389335                            Lycopene     Vitamins      mcg     0.000   
389336                 Lutein + zeaxanthin     Vitamins      mcg    20.000   
389337      Vitamin C, total ascorbic acid     Vitamins       mg    21.900   
389338                             Thiamin     Vitamins       mg     0.020   
389339                          Riboflavin     Vitamins       mg     0.060   
389340                              Niacin     Vitamins       mg     0.540   
389341                         Vitamin B-6     Vitamins       mg     0.260   
389342                       Folate, total     Vitamins      mcg    17.000   
389343                        Vitamin B-12     Vitamins      mcg     0.000   
389344                      Choline, total     Vitamins       mg     4.100   
389345           Vitamin K (phylloquinone)     Vitamins      mcg     0.500   
389346                          Folic acid     Vitamins      mcg     0.000   
389347                        Folate, food     Vitamins      mcg    17.000   
389348                         Folate, DFE     Vitamins  mcg_DFE    17.000   
389349                    Vitamin E, added     Vitamins       mg     0.000   
389350                 Vitamin B-12, added     Vitamins      mcg     0.000   
389351                         Cholesterol        Other       mg     0.000   
389352        Fatty acids, total saturated        Other        g     0.072   
389353  Fatty acids, total monounsaturated        Other        g     0.028   
389354  Fatty acids, total polyunsaturated        Other        g     0.041   

           id  
0        1008  
1        1008  
2        1008  
3        1008  
4        1008  
5        1008  
6        1008  
7        1008  
8        1008  
9        1008  
10       1008  
11       1008  
12       1008  
13       1008  
14       1008  
15       1008  
16       1008  
17       1008  
18       1008  
19       1008  
20       1008  
21       1008  
22       1008  
23       1008  
24       1008  
25       1008  
26       1008  
27       1008  
28       1008  
29       1008  
...       ...  
389325  43546  
389326  43546  
389327  43546  
389328  43546  
389329  43546  
389330  43546  
389331  43546  
389332  43546  
389333  43546  
389334  43546  
389335  43546  
389336  43546  
389337  43546  
389338  43546  
389339  43546  
389340  43546  
389341  43546  
389342  43546  
389343  43546  
389344  43546  
389345  43546  
389346  43546  
389347  43546  
389348  43546  
389349  43546  
389350  43546  
389351  43546  
389352  43546  
389353  43546  
389354  43546  

[389355 rows x 5 columns]
# 数据中有重复的数据
nutrients.duplicated().sum()
14179
# 去重
nutrients = nutrients.drop_duplicates()
col_mapping = {'description': 'food',
               'group': 'fgroup'}
info = info.rename(columns=col_mapping, copy=False)
print(info)
                                                   food  \
0                                       Cheese, caraway   
1                                       Cheese, cheddar   
2                                          Cheese, edam   
3                                          Cheese, feta   
4                    Cheese, mozzarella, part skim milk   
5      Cheese, mozzarella, part skim milk, low moisture   
6                                        Cheese, romano   
7                                     Cheese, roquefort   
8     Cheese spread, pasteurized process, american, ...   
9                           Cream, fluid, half and half   
10    Sour dressing, non-butterfat, cultured, filled...   
11    Milk, filled, fluid, with blend of hydrogenate...   
12    Cream substitute, liquid, with lauric acid oil...   
13                           Cream substitute, powdered   
14                  Milk, producer, fluid, 3.7% milkfat   
15    Milk, reduced fat, fluid, 2% milkfat, with add...   
16    Milk, reduced fat, fluid, 2% milkfat, with add...   
17    Milk, reduced fat, fluid, 2% milkfat, protein ...   
18    Milk, lowfat, fluid, 1% milkfat, with added vi...   
19    Milk, lowfat, fluid, 1% milkfat, with added no...   
20    Milk, lowfat, fluid, 1% milkfat, protein forti...   
21    Milk, nonfat, fluid, with added vitamin A and ...   
22    Milk, nonfat, fluid, with added nonfat milk so...   
23    Milk, nonfat, fluid, protein fortified, with a...   
24            Milk, buttermilk, fluid, cultured, lowfat   
25                              Milk, low sodium, fluid   
26               Milk, dry, whole, with added vitamin D   
27    Milk, dry, nonfat, regular, without added vita...   
28    Milk, dry, nonfat, instant, with added vitamin...   
29                   Milk, dry, nonfat, calcium reduced   
...                                                 ...   
6606  Beef, tenderloin, steak, separable lean only, ...   
6607  Beef, top sirloin, steak, separable lean only,...   
6608  Beef, short loin, top loin, steak, separable l...   
6609  Beef, chuck, arm pot roast, separable lean onl...   
6610  Beef, brisket, flat half, separable lean only,...   
6611  Beef, chuck, arm pot roast, separable lean onl...   
6612  Beef, brisket, flat half, separable lean only,...   
6613  Beef, round, eye of round, roast, separable le...   
6614  Beef, round, top round, steak, separable lean ...   
6615  Beef, round, bottom round, roast, separable le...   
6616  Beef, rib, small end (ribs 10-12), separable l...   
6617  CAMPBELL Soup Company, CAMPBELL'S Red and Whit...   
6618  CAMPBELL Soup Company, CAMPBELL's Red and Whit...   
6619  CAMPBELL Soup Company, CAMPBELL'S SELECT Soups...   
6620  CAMPBELL Soup Company, CAMPBELL'S SOUP AT HAND...   
6621  CAMPBELL Soup Company, CAMPBELL'S SOUP AT HAND...   
6622  CAMPBELL Soup Company, CAMPBELL'S SELECT Gold ...   
6623  CAMPBELL Soup Company, CAMPBELL'S SELECT Gold ...   
6624  CAMPBELL Soup Company, CAMPBELL'S SELECT Gold ...   
6625  CAMPBELL Soup Company, CAMPBELL'S Red and Whit...   
6626  CAMPBELL Soup Company, V8 Vegetable Juice, Ess...   
6627  CAMPBELL Soup Company, V8 Vegetable Juice, Spi...   
6628  CAMPBELL Soup Company, PACE, Jalapenos Nacho S...   
6629  CAMPBELL Soup Company, V8 60% Vegetable Juice,...   
6630  CAMPBELL Soup Company, V8 Vegetable Juice, Low...   
6631                             Bologna, beef, low fat   
6632  Turkey and pork sausage, fresh, bulk, patty or...   
6633                              Babyfood, juice, pear   
6634         Babyfood, dessert, banana yogurt, strained   
6635              Babyfood, banana no tapioca, strained   

                                 fgroup     id       manufacturer  
0                Dairy and Egg Products   1008                     
1                Dairy and Egg Products   1009                     
2                Dairy and Egg Products   1018                     
3                Dairy and Egg Products   1019                     
4                Dairy and Egg Products   1028                     
5                Dairy and Egg Products   1029                     
6                Dairy and Egg Products   1038                     
7                Dairy and Egg Products   1039                     
8                Dairy and Egg Products   1048                     
9                Dairy and Egg Products   1049                     
10               Dairy and Egg Products   1058                     
11               Dairy and Egg Products   1059                     
12               Dairy and Egg Products   1068                     
13               Dairy and Egg Products   1069                     
14               Dairy and Egg Products   1078                     
15               Dairy and Egg Products   1079               None  
16               Dairy and Egg Products   1080                     
17               Dairy and Egg Products   1081                     
18               Dairy and Egg Products   1082                     
19               Dairy and Egg Products   1083                     
20               Dairy and Egg Products   1084                     
21               Dairy and Egg Products   1085                     
22               Dairy and Egg Products   1086                     
23               Dairy and Egg Products   1087                     
24               Dairy and Egg Products   1088                     
25               Dairy and Egg Products   1089                     
26               Dairy and Egg Products   1090                     
27               Dairy and Egg Products   1091                     
28               Dairy and Egg Products   1092                     
29               Dairy and Egg Products   1093                     
...                                 ...    ...                ...  
6606                      Beef Products  23628                     
6607                      Beef Products  23629                     
6608                      Beef Products  23630                     
6609                      Beef Products  23631                     
6610                      Beef Products  23632                     
6611                      Beef Products  23633                     
6612                      Beef Products  23634                     
6613                      Beef Products  23635                     
6614                      Beef Products  23636                     
6615                      Beef Products  23637                     
6616                      Beef Products  23638                     
6617         Soups, Sauces, and Gravies  27015  Campbell Soup Co.  
6618         Soups, Sauces, and Gravies  27016  Campbell Soup Co.  
6619         Soups, Sauces, and Gravies  27021  Campbell Soup Co.  
6620         Soups, Sauces, and Gravies  27022  Campbell Soup Co.  
6621         Soups, Sauces, and Gravies  27023  Campbell Soup Co.  
6622         Soups, Sauces, and Gravies  27024  Campbell Soup Co.  
6623         Soups, Sauces, and Gravies  27025  Campbell Soup Co.  
6624         Soups, Sauces, and Gravies  27026  Campbell Soup Co.  
6625         Soups, Sauces, and Gravies  27032  Campbell Soup Co.  
6626  Vegetables and Vegetable Products  31010  Campbell Soup Co.  
6627  Vegetables and Vegetable Products  31013  Campbell Soup Co.  
6628  Vegetables and Vegetable Products  31014  Campbell Soup Co.  
6629  Vegetables and Vegetable Products  31016  Campbell Soup Co.  
6630  Vegetables and Vegetable Products  31017  Campbell Soup Co.  
6631        Sausages and Luncheon Meats  42161                     
6632        Sausages and Luncheon Meats  42173                     
6633                         Baby Foods  43408               None  
6634                         Baby Foods  43539               None  
6635                         Baby Foods  43546               None  

[6636 rows x 4 columns]
col_mapping = {'description': 'nutrient',
               'group': 'nutgroup'}
nutrients = nutrients.rename(columns=col_mapping, copy=False)
print(nutrients)
                                  nutrient     nutgroup    units     value  \
0                                  Protein  Composition        g    25.180   
1                        Total lipid (fat)  Composition        g    29.200   
2              Carbohydrate, by difference  Composition        g     3.060   
3                                      Ash        Other        g     3.280   
4                                   Energy       Energy     kcal   376.000   
5                                    Water  Composition        g    39.280   
6                                   Energy       Energy       kJ  1573.000   
7                     Fiber, total dietary  Composition        g     0.000   
8                              Calcium, Ca     Elements       mg   673.000   
9                                 Iron, Fe     Elements       mg     0.640   
10                           Magnesium, Mg     Elements       mg    22.000   
11                           Phosphorus, P     Elements       mg   490.000   
12                            Potassium, K     Elements       mg    93.000   
13                              Sodium, Na     Elements       mg   690.000   
14                                Zinc, Zn     Elements       mg     2.940   
15                              Copper, Cu     Elements       mg     0.024   
16                           Manganese, Mn     Elements       mg     0.021   
17                            Selenium, Se     Elements      mcg    14.500   
18                           Vitamin A, IU     Vitamins       IU  1054.000   
19                                 Retinol     Vitamins      mcg   262.000   
20                          Vitamin A, RAE     Vitamins  mcg_RAE   271.000   
21          Vitamin C, total ascorbic acid     Vitamins       mg     0.000   
22                                 Thiamin     Vitamins       mg     0.031   
23                              Riboflavin     Vitamins       mg     0.450   
24                                  Niacin     Vitamins       mg     0.180   
25                        Pantothenic acid     Vitamins       mg     0.190   
26                             Vitamin B-6     Vitamins       mg     0.074   
27                           Folate, total     Vitamins      mcg    18.000   
28                            Vitamin B-12     Vitamins      mcg     0.270   
29                              Folic acid     Vitamins      mcg     0.000   
...                                    ...          ...      ...       ...   
389325                        Selenium, Se     Elements      mcg     1.100   
389326                       Vitamin A, IU     Vitamins       IU     5.000   
389327                             Retinol     Vitamins      mcg     0.000   
389328                      Vitamin A, RAE     Vitamins  mcg_RAE     0.000   
389329                      Carotene, beta     Vitamins      mcg     2.000   
389330                     Carotene, alpha     Vitamins      mcg     2.000   
389331        Vitamin E (alpha-tocopherol)     Vitamins       mg     0.250   
389332                           Vitamin D     Vitamins       IU     0.000   
389333                 Vitamin D (D2 + D3)     Vitamins      mcg     0.000   
389334                 Cryptoxanthin, beta     Vitamins      mcg     0.000   
389335                            Lycopene     Vitamins      mcg     0.000   
389336                 Lutein + zeaxanthin     Vitamins      mcg    20.000   
389337      Vitamin C, total ascorbic acid     Vitamins       mg    21.900   
389338                             Thiamin     Vitamins       mg     0.020   
389339                          Riboflavin     Vitamins       mg     0.060   
389340                              Niacin     Vitamins       mg     0.540   
389341                         Vitamin B-6     Vitamins       mg     0.260   
389342                       Folate, total     Vitamins      mcg    17.000   
389343                        Vitamin B-12     Vitamins      mcg     0.000   
389344                      Choline, total     Vitamins       mg     4.100   
389345           Vitamin K (phylloquinone)     Vitamins      mcg     0.500   
389346                          Folic acid     Vitamins      mcg     0.000   
389347                        Folate, food     Vitamins      mcg    17.000   
389348                         Folate, DFE     Vitamins  mcg_DFE    17.000   
389349                    Vitamin E, added     Vitamins       mg     0.000   
389350                 Vitamin B-12, added     Vitamins      mcg     0.000   
389351                         Cholesterol        Other       mg     0.000   
389352        Fatty acids, total saturated        Other        g     0.072   
389353  Fatty acids, total monounsaturated        Other        g     0.028   
389354  Fatty acids, total polyunsaturated        Other        g     0.041   

           id  
0        1008  
1        1008  
2        1008  
3        1008  
4        1008  
5        1008  
6        1008  
7        1008  
8        1008  
9        1008  
10       1008  
11       1008  
12       1008  
13       1008  
14       1008  
15       1008  
16       1008  
17       1008  
18       1008  
19       1008  
20       1008  
21       1008  
22       1008  
23       1008  
24       1008  
25       1008  
26       1008  
27       1008  
28       1008  
29       1008  
...       ...  
389325  43546  
389326  43546  
389327  43546  
389328  43546  
389329  43546  
389330  43546  
389331  43546  
389332  43546  
389333  43546  
389334  43546  
389335  43546  
389336  43546  
389337  43546  
389338  43546  
389339  43546  
389340  43546  
389341  43546  
389342  43546  
389343  43546  
389344  43546  
389345  43546  
389346  43546  
389347  43546  
389348  43546  
389349  43546  
389350  43546  
389351  43546  
389352  43546  
389353  43546  
389354  43546  

[375176 rows x 5 columns]
ndata = pd.merge(nutrients, info, on='id', how='outer')
print(ndata)
                                  nutrient     nutgroup    units     value  \
0                                  Protein  Composition        g    25.180   
1                        Total lipid (fat)  Composition        g    29.200   
2              Carbohydrate, by difference  Composition        g     3.060   
3                                      Ash        Other        g     3.280   
4                                   Energy       Energy     kcal   376.000   
5                                    Water  Composition        g    39.280   
6                                   Energy       Energy       kJ  1573.000   
7                     Fiber, total dietary  Composition        g     0.000   
8                              Calcium, Ca     Elements       mg   673.000   
9                                 Iron, Fe     Elements       mg     0.640   
10                           Magnesium, Mg     Elements       mg    22.000   
11                           Phosphorus, P     Elements       mg   490.000   
12                            Potassium, K     Elements       mg    93.000   
13                              Sodium, Na     Elements       mg   690.000   
14                                Zinc, Zn     Elements       mg     2.940   
15                              Copper, Cu     Elements       mg     0.024   
16                           Manganese, Mn     Elements       mg     0.021   
17                            Selenium, Se     Elements      mcg    14.500   
18                           Vitamin A, IU     Vitamins       IU  1054.000   
19                                 Retinol     Vitamins      mcg   262.000   
20                          Vitamin A, RAE     Vitamins  mcg_RAE   271.000   
21          Vitamin C, total ascorbic acid     Vitamins       mg     0.000   
22                                 Thiamin     Vitamins       mg     0.031   
23                              Riboflavin     Vitamins       mg     0.450   
24                                  Niacin     Vitamins       mg     0.180   
25                        Pantothenic acid     Vitamins       mg     0.190   
26                             Vitamin B-6     Vitamins       mg     0.074   
27                           Folate, total     Vitamins      mcg    18.000   
28                            Vitamin B-12     Vitamins      mcg     0.270   
29                              Folic acid     Vitamins      mcg     0.000   
...                                    ...          ...      ...       ...   
375146                        Selenium, Se     Elements      mcg     1.100   
375147                       Vitamin A, IU     Vitamins       IU     5.000   
375148                             Retinol     Vitamins      mcg     0.000   
375149                      Vitamin A, RAE     Vitamins  mcg_RAE     0.000   
375150                      Carotene, beta     Vitamins      mcg     2.000   
375151                     Carotene, alpha     Vitamins      mcg     2.000   
375152        Vitamin E (alpha-tocopherol)     Vitamins       mg     0.250   
375153                           Vitamin D     Vitamins       IU     0.000   
375154                 Vitamin D (D2 + D3)     Vitamins      mcg     0.000   
375155                 Cryptoxanthin, beta     Vitamins      mcg     0.000   
375156                            Lycopene     Vitamins      mcg     0.000   
375157                 Lutein + zeaxanthin     Vitamins      mcg    20.000   
375158      Vitamin C, total ascorbic acid     Vitamins       mg    21.900   
375159                             Thiamin     Vitamins       mg     0.020   
375160                          Riboflavin     Vitamins       mg     0.060   
375161                              Niacin     Vitamins       mg     0.540   
375162                         Vitamin B-6     Vitamins       mg     0.260   
375163                       Folate, total     Vitamins      mcg    17.000   
375164                        Vitamin B-12     Vitamins      mcg     0.000   
375165                      Choline, total     Vitamins       mg     4.100   
375166           Vitamin K (phylloquinone)     Vitamins      mcg     0.500   
375167                          Folic acid     Vitamins      mcg     0.000   
375168                        Folate, food     Vitamins      mcg    17.000   
375169                         Folate, DFE     Vitamins  mcg_DFE    17.000   
375170                    Vitamin E, added     Vitamins       mg     0.000   
375171                 Vitamin B-12, added     Vitamins      mcg     0.000   
375172                         Cholesterol        Other       mg     0.000   
375173        Fatty acids, total saturated        Other        g     0.072   
375174  Fatty acids, total monounsaturated        Other        g     0.028   
375175  Fatty acids, total polyunsaturated        Other        g     0.041   

           id                                   food                  fgroup  \
0        1008                        Cheese, caraway  Dairy and Egg Products   
1        1008                        Cheese, caraway  Dairy and Egg Products   
2        1008                        Cheese, caraway  Dairy and Egg Products   
3        1008                        Cheese, caraway  Dairy and Egg Products   
4        1008                        Cheese, caraway  Dairy and Egg Products   
5        1008                        Cheese, caraway  Dairy and Egg Products   
6        1008                        Cheese, caraway  Dairy and Egg Products   
7        1008                        Cheese, caraway  Dairy and Egg Products   
8        1008                        Cheese, caraway  Dairy and Egg Products   
9        1008                        Cheese, caraway  Dairy and Egg Products   
10       1008                        Cheese, caraway  Dairy and Egg Products   
11       1008                        Cheese, caraway  Dairy and Egg Products   
12       1008                        Cheese, caraway  Dairy and Egg Products   
13       1008                        Cheese, caraway  Dairy and Egg Products   
14       1008                        Cheese, caraway  Dairy and Egg Products   
15       1008                        Cheese, caraway  Dairy and Egg Products   
16       1008                        Cheese, caraway  Dairy and Egg Products   
17       1008                        Cheese, caraway  Dairy and Egg Products   
18       1008                        Cheese, caraway  Dairy and Egg Products   
19       1008                        Cheese, caraway  Dairy and Egg Products   
20       1008                        Cheese, caraway  Dairy and Egg Products   
21       1008                        Cheese, caraway  Dairy and Egg Products   
22       1008                        Cheese, caraway  Dairy and Egg Products   
23       1008                        Cheese, caraway  Dairy and Egg Products   
24       1008                        Cheese, caraway  Dairy and Egg Products   
25       1008                        Cheese, caraway  Dairy and Egg Products   
26       1008                        Cheese, caraway  Dairy and Egg Products   
27       1008                        Cheese, caraway  Dairy and Egg Products   
28       1008                        Cheese, caraway  Dairy and Egg Products   
29       1008                        Cheese, caraway  Dairy and Egg Products   
...       ...                                    ...                     ...   
375146  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375147  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375148  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375149  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375150  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375151  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375152  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375153  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375154  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375155  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375156  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375157  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375158  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375159  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375160  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375161  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375162  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375163  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375164  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375165  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375166  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375167  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375168  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375169  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375170  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375171  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375172  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375173  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375174  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375175  43546  Babyfood, banana no tapioca, strained              Baby Foods   

       manufacturer  
0                    
1                    
2                    
3                    
4                    
5                    
6                    
7                    
8                    
9                    
10                   
11                   
12                   
13                   
14                   
15                   
16                   
17                   
18                   
19                   
20                   
21                   
22                   
23                   
24                   
25                   
26                   
27                   
28                   
29                   
...             ...  
375146         None  
375147         None  
375148         None  
375149         None  
375150         None  
375151         None  
375152         None  
375153         None  
375154         None  
375155         None  
375156         None  
375157         None  
375158         None  
375159         None  
375160         None  
375161         None  
375162         None  
375163         None  
375164         None  
375165         None  
375166         None  
375167         None  
375168         None  
375169         None  
375170         None  
375171         None  
375172         None  
375173         None  
375174         None  
375175         None  

[375176 rows x 8 columns]
ndata.loc[30000]
nutrient                                       Glycine
nutgroup                                   Amino Acids
units                                                g
value                                             0.04
id                                                6158
food            Soup, tomato bisque, canned, condensed
fgroup                      Soups, Sauces, and Gravies
manufacturer                                          
Name: 30000, dtype: object
result = ndata.groupby(['nutrient', 'fgroup'])['value'].quantile(0.5)
print(result)
nutrient          fgroup                           
Adjusted Protein  Sweets                               12.9000
                  Vegetables and Vegetable Products     2.1800
Alanine           Baby Foods                            0.0850
                  Baked Products                        0.2480
                  Beef Products                         1.5500
                  Beverages                             0.0030
                  Breakfast Cereals                     0.3110
                  Cereal Grains and Pasta               0.3730
                  Dairy and Egg Products                0.2710
                  Ethnic Foods                          1.2900
                  Fast Foods                            0.5140
                  Fats and Oils                         0.0000
                  Finfish and Shellfish Products        1.2180
                  Fruits and Fruit Juices               0.0270
                  Lamb, Veal, and Game Products         1.4080
                  Legumes and Legume Products           0.4100
                  Meals, Entrees, and Sidedishes        0.3270
                  Nut and Seed Products                 0.7345
                  Pork Products                         1.3070
                  Poultry Products                      1.3940
                  Restaurant Foods                      0.4650
                  Sausages and Luncheon Meats           0.9420
                  Snacks                                0.4335
                  Soups, Sauces, and Gravies            0.0650
                  Spices and Herbs                      0.5550
                  Sweets                                0.1020
                  Vegetables and Vegetable Products     0.0840
Alcohol, ethyl    Baby Foods                            0.0000
                  Baked Products                        0.0000
                  Beef Products                         0.0000
                                                        ...   
Water             Snacks                                3.5200
                  Soups, Sauces, and Gravies           85.9000
                  Spices and Herbs                     43.6700
                  Sweets                                9.0500
                  Vegetables and Vegetable Products    89.1950
Zinc, Zn          Baby Foods                            0.5900
                  Baked Products                        0.6600
                  Beef Products                         5.3900
                  Beverages                             0.0400
                  Breakfast Cereals                     2.8850
                  Cereal Grains and Pasta               1.0900
                  Dairy and Egg Products                1.3900
                  Ethnic Foods                          1.0450
                  Fast Foods                            1.2500
                  Fats and Oils                         0.0200
                  Finfish and Shellfish Products        0.6700
                  Fruits and Fruit Juices               0.1000
                  Lamb, Veal, and Game Products         3.9400
                  Legumes and Legume Products           1.1400
                  Meals, Entrees, and Sidedishes        0.6300
                  Nut and Seed Products                 3.2900
                  Pork Products                         2.3200
                  Poultry Products                      2.5000
                  Restaurant Foods                      0.8000
                  Sausages and Luncheon Meats           2.1300
                  Snacks                                1.4700
                  Soups, Sauces, and Gravies            0.2000
                  Spices and Herbs                      2.7500
                  Sweets                                0.3600
                  Vegetables and Vegetable Products     0.3300
Name: value, Length: 2246, dtype: float64
# result['Zinc, Zn'].order().plot(kind='barh') #AttributeError: 'Series' object has no attribute 'order'
result['Zinc, Zn'].sort_values().plot(kind='barh')
<matplotlib.axes._subplots.AxesSubplot at 0x2156ce3c748>
by_nutrient = ndata.groupby(['nutgroup', 'nutrient'])

get_maximum = lambda x: x.xs(x.value.idxmax())
get_minimum = lambda x: x.xs(x.value.idxmin())

max_foods = by_nutrient.apply(get_maximum)[['value', 'food']]
print(max_foods)
                                                 value  \
nutgroup    nutrient                                     
Amino Acids Alanine                              8.009   
            Arginine                             7.436   
            Aspartic acid                       10.203   
            Cystine                              1.307   
            Glutamic acid                       17.452   
            Glycine                             19.049   
            Histidine                            2.999   
            Hydroxyproline                       0.803   
            Isoleucine                           4.300   
            Leucine                              7.200   
            Lysine                               6.690   
            Methionine                           1.859   
            Phenylalanine                        4.600   
            Proline                             12.295   
            Serine                               4.600   
            Threonine                            3.300   
            Tryptophan                           1.600   
            Tyrosine                             3.300   
            Valine                               4.500   
Composition Adjusted Protein                    12.900   
            Carbohydrate, by difference        100.000   
            Fiber, total dietary                79.000   
            Protein                             88.320   
            Sugars, total                       99.800   
            Total lipid (fat)                  100.000   
            Water                              100.000   
Elements    Calcium, Ca                       7364.000   
            Copper, Cu                          15.050   
            Fluoride, F                        584.000   
            Iron, Fe                            87.470   
...                                                ...   
Vitamins    Cryptoxanthin, beta               6186.000   
            Dihydrophylloquinone               103.800   
            Folate, DFE                       2630.000   
            Folate, food                      2340.000   
            Folate, total                     2340.000   
            Folic acid                        1538.000   
            Lutein + zeaxanthin              39550.000   
            Lycopene                         46260.000   
            Menaquinone-4                       33.200   
            Niacin                              97.000   
            Pantothenic acid                    35.000   
            Retinol                          30000.000   
            Riboflavin                          14.300   
            Thiamin                             20.000   
            Tocopherol, beta                     6.490   
            Tocopherol, delta                   30.880   
            Tocopherol, gamma                  100.880   
            Vitamin A, IU                   100000.000   
            Vitamin A, RAE                   30000.000   
            Vitamin B-12                        98.890   
            Vitamin B-12, added                 24.000   
            Vitamin B-6                         12.000   
            Vitamin C, total ascorbic acid    2400.000   
            Vitamin D                        10000.000   
            Vitamin D (D2 + D3)                250.000   
            Vitamin D2 (ergocalciferol)         28.100   
            Vitamin D3 (cholecalciferol)        27.400   
            Vitamin E (alpha-tocopherol)       149.400   
            Vitamin E, added                    46.550   
            Vitamin K (phylloquinone)         1714.500   

                                                                                         food  
nutgroup    nutrient                                                                           
Amino Acids Alanine                                         Gelatins, dry powder, unsweetened  
            Arginine                                             Seeds, sesame flour, low-fat  
            Aspartic acid                                                 Soy protein isolate  
            Cystine                              Seeds, cottonseed flour, low fat (glandless)  
            Glutamic acid                                                 Soy protein isolate  
            Glycine                                         Gelatins, dry powder, unsweetened  
            Histidine                              Whale, beluga, meat, dried (Alaska Native)  
            Hydroxyproline                  KENTUCKY FRIED CHICKEN, Fried Chicken, ORIGINA...  
            Isoleucine                      Soy protein isolate, PROTEIN TECHNOLOGIES INTE...  
            Leucine                         Soy protein isolate, PROTEIN TECHNOLOGIES INTE...  
            Lysine                          Seal, bearded (Oogruk), meat, dried (Alaska Na...  
            Methionine                                  Fish, cod, Atlantic, dried and salted  
            Phenylalanine                   Soy protein isolate, PROTEIN TECHNOLOGIES INTE...  
            Proline                                         Gelatins, dry powder, unsweetened  
            Serine                          Soy protein isolate, PROTEIN TECHNOLOGIES INTE...  
            Threonine                       Soy protein isolate, PROTEIN TECHNOLOGIES INTE...  
            Tryptophan                       Sea lion, Steller, meat with fat (Alaska Native)  
            Tyrosine                        Soy protein isolate, PROTEIN TECHNOLOGIES INTE...  
            Valine                          Soy protein isolate, PROTEIN TECHNOLOGIES INTE...  
Composition Adjusted Protein                           Baking chocolate, unsweetened, squares  
            Carbohydrate, by difference           Sweeteners, tabletop, fructose, dry, powder  
            Fiber, total dietary                                             Corn bran, crude  
            Protein                         Soy protein isolate, potassium type, crude pro...  
            Sugars, total                                                  Sugars, granulated  
            Total lipid (fat)                                                 Oil, wheat germ  
            Water                                               Water, bottled, POLAND SPRING  
Elements    Calcium, Ca                     Leavening agents, baking powder, double-acting...  
            Copper, Cu                      Veal, variety meats and by-products, liver, co...  
            Fluoride, F                     Tea, instant, sweetened with sugar, lemon-flav...  
            Iron, Fe                            Salad dressing, russian dressing, low calorie  
...                                                                                       ...  
Vitamins    Cryptoxanthin, beta                                               Spices, paprika  
            Dihydrophylloquinone            Margarine, 80% fat, stick, includes regular an...  
            Folate, DFE                            Cereals ready-to-eat, QUAKER, CAP'N CRUNCH  
            Folate, food                         Leavening agents, yeast, baker's, active dry  
            Folate, total                        Leavening agents, yeast, baker's, active dry  
            Folic acid                             Cereals ready-to-eat, QUAKER, CAP'N CRUNCH  
            Lutein + zeaxanthin                                                     Kale, raw  
            Lycopene                                                            Tomato powder  
            Menaquinone-4                   Chicken, broilers or fryers, drumstick, meat a...  
            Niacin                                                       Yeast extract spread  
            Pantothenic acid                Cereals ready-to-eat, KELLOGG, KELLOGG'S Compl...  
            Retinol                                                       Fish oil, cod liver  
            Riboflavin                                                   Yeast extract spread  
            Thiamin                         MORNINGSTAR FARMS Hot and Spicy Veggie Sausage...  
            Tocopherol, beta                Yellow pond lily, Wocas, dried seeds (Pacific ...  
            Tocopherol, delta                 Oil, cooking and salad, ENOVA, 80% diglycerides  
            Tocopherol, gamma                 Oil, cooking and salad, ENOVA, 80% diglycerides  
            Vitamin A, IU                                                 Fish oil, cod liver  
            Vitamin A, RAE                                                Fish oil, cod liver  
            Vitamin B-12                    Mollusks, clam, mixed species, cooked, moist heat  
            Vitamin B-12, added             Cereals ready-to-eat, KELLOGG, KELLOGG'S ALL-B...  
            Vitamin B-6                     Cereals ready-to-eat, KELLOGG, KELLOGG'S ALL-B...  
            Vitamin C, total ascorbic acid  Orange-flavor drink, breakfast type, low calor...  
            Vitamin D                                                     Fish oil, cod liver  
            Vitamin D (D2 + D3)                                           Fish oil, cod liver  
            Vitamin D2 (ergocalciferol)                               Mushrooms, maitake, raw  
            Vitamin D3 (cholecalciferol)                        Fish, halibut, Greenland, raw  
            Vitamin E (alpha-tocopherol)                                      Oil, wheat germ  
            Vitamin E, added                Cereals ready-to-eat, GENERAL MILLS, Multi-Gra...  
            Vitamin K (phylloquinone)                                    Spices, sage, ground  

[94 rows x 2 columns]
max_foods.loc['Amino Acids']['food']
nutrient
Alanine                           Gelatins, dry powder, unsweetened
Arginine                               Seeds, sesame flour, low-fat
Aspartic acid                                   Soy protein isolate
Cystine                Seeds, cottonseed flour, low fat (glandless)
Glutamic acid                                   Soy protein isolate
Glycine                           Gelatins, dry powder, unsweetened
Histidine                Whale, beluga, meat, dried (Alaska Native)
Hydroxyproline    KENTUCKY FRIED CHICKEN, Fried Chicken, ORIGINA...
Isoleucine        Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Leucine           Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Lysine            Seal, bearded (Oogruk), meat, dried (Alaska Na...
Methionine                    Fish, cod, Atlantic, dried and salted
Phenylalanine     Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Proline                           Gelatins, dry powder, unsweetened
Serine            Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Threonine         Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Tryptophan         Sea lion, Steller, meat with fat (Alaska Native)
Tyrosine          Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Valine            Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Name: food, dtype: object

参考资料:炼数成金Python数据分析课程

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值