文章目录
1、离散化
1.1、等宽离散化
1.1.1、pandas实现
pandas.cut(x, bins, right=True, labels=None)
import pandas as pd

# Equal-width binning of the 'Age' column with pandas.cut.
df = pd.DataFrame({
    'Name': ['car', 'gun', 'cat'],
    'Age': [5, 2.5, 0.5],
})
bins = list(range(0, 10, 2))  # bin edges: [0, 2, 4, 6, 8]
# One label per interval, so there are len(bins) - 1 labels. The labels are
# only used with right=False below, hence the left-closed / right-open
# "[a,b)" notation (the "(a,b]" form would misdescribe those bins).
labels = ['[%d,%d)' % (low, high) for low, high in zip(bins, bins[1:])]
# Discretize the data
s1 = pd.cut(df['Age'], bins)  # [(0, 2] < (2, 4] < (4, 6] < (6, 8]]
s2 = pd.cut(df['Age'], bins, right=False)  # [[0, 2) < [2, 4) < [4, 6) < [6, 8)]
s3 = pd.cut(x=df['Age'], bins=bins, right=False, labels=labels)
print(s1, s2, s3, sep='\n\n')
| 参数 | 解析 |
| --- | --- |
| x | 被分组的数据 |
| bins | 分组方式 |
| right | True(默认):左开右闭;False:左闭右开 |
| labels | 自定义分组的标签 |
1.1.2、手写实现
# Sample data for the hand-written equal-width binning example:
# floats ranging from .0 up to and including 1.0.
x=[.02,.03,.05,.91,.05,.06,.01,.06,.39,.02,.13,.19,.27,.95,.02,.06,.13,.82,.89,.34,.2,.95,.02,.13,.96,.03,.05,.23,.24,
.96,.98,.03,.98,.08,.15,.2,.93,.09,.29,.97,.03,.03,.0,.95,.33,.05,.06,.86,.07,.21,.44,.67,.5,.04,.33,.05,.08,.02,.46,.49
,.68,.71,.03,.36,.38,.41,.71,.02,.98,.0,.42,.01,.16,.01,.09,.03,.36,.17,.03,.34,.94,.07,.42,.89,.04,.87,.05,.45,.9,.02,
.03,.04,.08,.81,.87,.11,.95,.06,.26,.06,.87,.04,.04,.09,.09,.91,.95,.06,.07,.09,.95,.98,.1,.84,.03,.14,.07,.03,.17,.88,
.03,.64,.03,.98,.05,.94,.02,.23,.87,.11,.83,.91,.06,.98,.04,.21,.29,.24,.83,.11,.27,.11,.41,.87,.01,.02,.17,.88,.03,.08,
.32,.11,.96,.03,.13,.18,.27,.94,.03,.18,.1,.39,.07,.27,.93,.07,.1,.82,.05,.25,.07,.04,.79,.8,.05,.06,.05,.57,.61,.63,
.04,.02,.54,.58,.62,.04,.78,.79,.01,.71,.07,.03,.02,.67,.03,.0,.02,.04,.05,.04,.0,.2,.85,.03,.13,.93,.74,.12,.9,.04,.05,
.3,.1,.93,.04,.99,.06,.05,.88,.92,.02,.16,.94,.03,.03,.18,.22,.26,.9,.02,.05,.45,.03,.03,.98,.04,.06,.91,.04,.04,.03,.5,
.89,.26,.84,.05,.12,.19,.2,.93,.06,.11,.44,.88,.9,.2,.9,.06,.95,.13,.54,.96,.01,.09,.04,.05,.05,.07,.07,.08,.03,.02,.02,
.1,.02,.27,.29,.92,.03,.24,.28,.91,.09,.64,.95,.03,.95,.02,.04,.02,.21,.15,.29,.59,.03,.35,.62,.04,.27,.31,.85,.86,.08,
.02,.99,.01,.01,.86,.04,.83,.03,.99,.01,.02,.4,.59,.64,.66,.56,.98,.05,.58,.89,.03,.96,.98,.02,.97,.01,.03,.68,.94,.05,
.99,.04,.79,.8,.22,.0,.65,.04,.11,.95,.04,.99,.46,.99,.04,.22,.48,.59,.03,.06,.08,.02,.19,.83,.04,.02,.15,.94,1.0,.05]
def discretize(x, n=20):
    """Count how many values of *x* fall into each of *n* equal-width bins on [0, 1].

    Args:
        x: Iterable of numbers, each within the closed range [0, 1].
        n: Number of bins; bin k covers [k / n, (k + 1) / n).

    Returns:
        A dict mapping each bin label, formatted as '[low,high)' with two
        decimals, to the number of values counted in that bin. Every bin is
        present, including empty ones.

    Raises:
        ValueError: If a value lies outside [0, 1].
    """
    bins = ['[%.2f,%.2f)' % (i / n, i / n + 1 / n) for i in range(n)]
    dt = {label: 0 for label in bins}
    for value in x:
        if not 0 <= value <= 1:
            # A negative index would silently count from the last bin backwards.
            raise ValueError('value %r is outside the range [0, 1]' % (value,))
        # Floor-dividing by the inexact float 1 / n drops edge values into the
        # bin below (0.15 // 0.05 == 2.0). Multiplying by n, with a tiny
        # tolerance for rounding error, keeps them in the bin they open.
        # The top edge 1.0 has no bin of its own, so it joins the last bin.
        j = min(int(value * n + 1e-9), n - 1)
        dt[bins[j]] += 1
    return dt
# Text histogram: one line per bin, highest interval first.
dt = discretize(x, n=20)
for label in sorted(dt, reverse=True):
    count = dt[label]
    print(label, '#' * count, count)

# Swarm plot of the raw values for a visual view of the same data.
import matplotlib.pyplot as mp
import seaborn
seaborn.swarmplot(y=x)
mp.show()
1.2、等频离散化
import pandas as pd

# Equal-frequency binning: pandas.qcut derives the bin edges from quantiles.
s0 = pd.Series([1, 1, 1, 2, 2, 2, 5, 5, 5, 9, 9, 9])
q = 4  # number of bins
s1 = pd.qcut(x=s0, q=q)
print(s1)
# Sample output (column spacing and dtype text may differ between pandas versions):
# 0     (0.999, 1.75]
# 1     (0.999, 1.75]
# 2     (0.999, 1.75]
# 3       (1.75, 3.5]
# 4       (1.75, 3.5]
# 5       (1.75, 3.5]
# 6        (3.5, 6.0]
# 7        (3.5, 6.0]
# 8        (3.5, 6.0]
# 9        (6.0, 9.0]
# 10       (6.0, 9.0]
# 11       (6.0, 9.0]
# dtype: category
# Categories (4, interval[float64]): [(0.999, 1.75] < (1.75, 3.5] < (3.5, 6.0] < (6.0, 9.0]]
1.3、聚类
https://blog.csdn.net/Yellow_python/article/details/81461056
2、逆离散化
2.1、取 min 和 max
import pandas as pd

df = pd.DataFrame({
    'name': ['cat', 'dog', 'tiger'],
    'price_min': [100, 10, 200],
    'price_max': [180, 90, 220],
})
print(df)
# Detach the two bound columns; pop() also removes them from df itself.
row_min = df.pop('price_min')
row_max = df.pop('price_max')
# Rebuild one frame per bound, storing the bound in a shared 'price' column.
df_min = df.assign(price=row_min)
df_max = df.assign(price=row_max)
# Stack both frames: every original row now appears once per bound.
df_mm = pd.concat([df_min, df_max])
print(df_mm)
2.2、取 min、max 和 mean
import pandas as pd

df = pd.DataFrame({
    'name': ['cat', 'dog', 'tiger'],
    'section': ['100~180', '10~90', '200~220'],
})
# Split each "low~high" string into its two endpoints.
values = df['section'].str.split('~')
# Lower bound, upper bound and midpoint of every section.
row_min = [float(low) for low, _ in values]
row_max = [float(high) for _, high in values]
row_mean = [(low + high) / 2 for low, high in zip(row_min, row_max)]
# One copy of df per statistic, with the statistic in a 'value' column.
df_min = df.assign(value=row_min)
df_max = df.assign(value=row_max)
df_mean = df.assign(value=row_mean)
# Stack the three copies, then order by name and value.
df_mmm = pd.concat([df_min, df_mean, df_max]).sort_values(
    by=['name', 'value'], axis=0, ascending=True)
print(df_mmm)
2.3、等距均匀升采样
import pandas as pd

df = pd.DataFrame({
    'name': ['cat', 'dog', 'tiger'],
    'section': ['100~190', '0~90', '200~230'],
})
# Split each "low~high" string into its two endpoints.
values = df['section'].str.split('~')
length = len(values)  # number of rows
row_min = [float(pair[0]) for pair in values]
row_max = [float(pair[1]) for pair in values]
# Upsample (n = 3 as an example): both endpoints come first, followed by
# the n - 1 evenly spaced interior points of every section.
n = 3
row_ls = [row_min, row_max] + [
    [low + (high - low) * step / n for low, high in zip(row_min, row_max)]
    for step in range(1, n)
]
# One copy of df per generated column of values.
df_ls = [df.assign(value=row) for row in row_ls]
# Stack the copies, sort ascending, and renumber the index.
df_mm = (
    pd.concat(df_ls)
    .sort_values(by=['name', 'value'], axis=0, ascending=True)
    .reset_index(drop=True)
)
print(df_mm)
2.4、随机均匀升采样
import numpy as np
import pandas as pd

df = pd.DataFrame({
    'name': ['cat', 'tiger'],
    'section': ['0~90', '180~200'],
})
# Split each "low~high" string into its two endpoints.
min_max = df['section'].str.split('~')
# Upsample (n = 10 as an example): n uniform draws inside every section.
n = 10
samples = [
    np.random.uniform(int(low), int(high), size=n)
    for low, high in min_max
]
# zip(*samples) regroups the draws so that the k-th tuple holds the k-th
# draw of every row; each tuple becomes the 'value' column of a df copy.
df_ls = [df.assign(value=list(draw)) for draw in zip(*samples)]
# Stack the copies, sort ascending, and renumber the index.
df_mm = (
    pd.concat(df_ls)
    .sort_values(by=['name', 'value'], axis=0, ascending=True)
    .reset_index(drop=True)
)
print(df_mm)
【均匀分布】某次随机结果
2.5、随机非均匀升采样
场景:
招聘网站有不少【5~10k】的岗位
实际大部分人的offer是【5~6k】
甚至被HR压到【小于5k】😠 💢
很少人能【达到10k】
因此实际分布是不均匀的
对此可用非均匀分布,例如:正态分布、三角形分布、泊松分布、卡方分布……
import numpy as np
import pandas as pd

df = pd.DataFrame({
    'name': ['cat', 'tiger'],
    'section': ['5~10', '5~7'],
})
# Parse each "low~high" string into an integer (low, high) pair.
min_max = [(int(i0), int(i1)) for i0, i1 in df['section'].str.split('~')]
# Upsample: squaring uniform draws from [0, 1) pushes them towards 0, so
# after scaling most samples sit near the lower bound of the section.
n = 15
samples = []
for low, high in min_max:
    width = high - low
    squared = np.random.uniform(size=n) ** 2
    samples.append(squared * width + low)
# The k-th tuple of zip(*samples) holds the k-th draw of every row; each
# tuple becomes the 'value' column of a df copy.
df_ls = [df.assign(value=list(draw)) for draw in zip(*samples)]
# Stack the copies, sort ascending, and renumber the index.
df_mm = (
    pd.concat(df_ls)
    .sort_values(by=['name', 'value'], axis=0, ascending=True)
    .reset_index(drop=True)
)
print(df_mm)
【近似三角形的偏态分布】某次随机结果(对均匀随机数取平方,碰巧得到近似三角形的形状;严格的三角形分布可用 np.random.triangular)
import numpy as np
import pandas as pd

df = pd.DataFrame({
    'name': ['cat', 'tiger'],
    'section': ['5~10', '5~7'],
})
# Split each "low~high" string into its two endpoints.
min_max = df['section'].str.split('~')
# Upsample with a normal distribution per section.
n = 15  # number of samples drawn per section
samples = []
for bounds in min_max:
    low, high = int(bounds[0]), int(bounds[1])
    # Centre lies 20% of the way from low to high; the spread is 20% of the
    # section width. Both are floored at 0.
    centre = max(0., low * .8 + high * .2)
    spread = max(0., (high - low) * .2)
    samples.append(np.random.normal(centre, spread, size=n))
# The k-th tuple of zip(*samples) holds the k-th draw of every row; each
# tuple becomes the 'value' column of a df copy.
df_ls = [df.assign(value=list(draw)) for draw in zip(*samples)]
# Stack the copies, sort ascending, and renumber the index.
df_mm = (
    pd.concat(df_ls)
    .sort_values(by=['name', 'value'], axis=0, ascending=True)
    .reset_index(drop=True)
)
print(df_mm)
【正态分布】某次随机结果