一、奶茶店订单数据集(奶茶订单.csv)
场景:2 周奶茶店真实订单(含促销活动)
字段:日期、星期、商品、销量、客单价、优惠券(是否使用)、天气
import pandas as pd
import random
def build_milk_tea_orders(dates, products, rng=random):
    """Build one simulated milk-tea order row per (date, product) pair.

    Args:
        dates: Iterable of date-like objects exposing ``weekday()`` and
            ``strftime()`` (e.g. the result of ``pd.date_range``).
        products: Iterable of product names.
        rng: Source of randomness exposing ``random()``, ``randint()`` and
            ``choice()``. Defaults to the ``random`` module; pass a seeded
            ``random.Random`` instance for reproducible output.

    Returns:
        A list of dicts keyed by the CSV column names.
    """
    weekday_names = ["周一", "周二", "周三", "周四", "周五", "周六", "周日"]
    rows = []
    for date in dates:
        is_weekend = date.weekday() >= 5
        # Weather belongs to the day, not to the product: draw it once per
        # date so every row of the same day agrees. Weekends are always sunny.
        weather = "晴" if is_weekend else rng.choice(["晴", "多云", "雨"])
        for product in products:
            # Weekend sales are doubled.
            base_sales = 40 if is_weekend else 20
            # Lemon tea sells better on rainy days; the boost keys on the
            # recorded weather so it is discoverable from the data itself.
            if product == "柠檬茶" and weather == "雨":
                base_sales += 15
            # Roughly 10% of rows carry a deliberately invalid negative
            # sales figure. It is NOT clamped, so learners have real dirty
            # data to clean.
            if rng.random() > 0.1:
                sales = base_sales + rng.randint(-10, 20)
            else:
                sales = -5
            rows.append({
                "日期": date.strftime("%Y-%m-%d"),
                "星期": weekday_names[date.weekday()],
                "商品": product,
                "销量": sales,
                "客单价": 15 if product == "珍珠奶茶" else rng.randint(12, 28),
                # Coupons are always used on weekends, random on weekdays.
                "优惠券": True if is_weekend else rng.choice([True, False]),
                "天气": weather,
            })
    return rows


dates = pd.date_range(start="2024-03-01", end="2024-03-14")
products = ["珍珠奶茶", "芋泥波波", "杨枝甘露", "柠檬茶", "青稞奶茶"]
data = build_milk_tea_orders(dates, products)
df = pd.DataFrame(data)
df.to_csv("奶茶订单.csv", index=False)
print("奶茶订单数据集已生成!含周末爆款、雨天促销、脏数据,该数据集对应0基础也能懂系列不想关注公众号的宝子。适合练习清洗与分析~")
运行上述代码即可生成奶茶订单.csv 数据集噢。
二、超市小票数据集(超市小票.csv)
场景:家庭超市购物记录(含会员等级、促销商品)
字段:日期、商品名称、分类、价格、数量、会员等级(普通 / VIP)、是否促销
import pandas as pd
import random
from datetime import datetime, timedelta
# Product catalogue: each entry carries a name, a category and a unit price.
products = [
    {"name": name, "category": category, "price": price}
    for name, category, price in (
        ("纯牛奶", "生鲜", 65),
        ("辣条", "零食", 5),
        ("洗衣液", "日用品", 39),
        ("冰淇淋", "冷冻", 18),
        ("大米", "粮油", 89),
        ("奥特曼玩具", "母婴", 129),
        ("酸奶", "生鲜", 45),
        ("卫生纸", "日用品", 29),
    )
]

# Every calendar day from start_date to end_date, both ends included.
start_date = datetime(2024, 3, 1)
end_date = datetime(2024, 3, 31)
span_days = (end_date - start_date).days
date_list = [start_date + timedelta(days=offset) for offset in range(span_days + 1)]

# One row per purchase; each day gets between 1 and 5 purchases.
data = []
for day in date_list:
    day_label = day.strftime("%Y-%m-%d")
    for _ in range(random.randint(1, 5)):
        item = random.choice(products)
        qty = random.randint(1, 5)
        level = random.choice(["普通", "VIP"])
        promo = random.choice([True, False])
        data.append({
            "日期": day_label,
            "商品名称": item["name"],
            "分类": item["category"],
            "价格": item["price"],
            "数量": qty,
            "会员等级": level,
            "是否促销": promo,
        })

# Materialise the rows as a DataFrame and write them out as CSV.
df = pd.DataFrame(data)
df.to_csv("超市小票.csv", index=False)
print("超市小票数据集已生成:超市小票.csv")
同理,运行上述代码可以生成超市小票.csv的数据集噢。
三、电商复购数据集(电商复购数据.csv)
场景:3 个月美妆电商用户行为(含首次购买与复购)
字段:用户 ID、首次购买时间、首次商品、复购次数、最近复购时间、总消费金额
import pandas as pd
import random
from datetime import datetime, timedelta
def build_repurchase_records(num_users, start_date, end_date, products, rng=random):
    """Simulate first-purchase and repurchase behaviour for a set of users.

    Each user makes a first purchase on a random day inside
    ``[start_date, end_date]`` and then attempts a random number (0-5) of
    repurchases spaced 1-30 days apart. Only repurchases that fall inside
    the window are counted, dated and charged.

    Args:
        num_users: Number of user rows to generate.
        start_date: First day of the observation window (``datetime``).
        end_date: Last day of the observation window (``datetime``).
        products: Sequence of product names to pick the first product from.
        rng: Source of randomness exposing ``randint()`` and ``choice()``.
            Defaults to the ``random`` module; pass a seeded
            ``random.Random`` instance for reproducible output.

    Returns:
        A list of dicts keyed by the CSV column names.
    """
    window_days = (end_date - start_date).days
    records = []
    for user_id in range(1, num_users + 1):
        first_purchase_date = start_date + timedelta(days=rng.randint(0, window_days))
        first_product = rng.choice(products)
        # The first order is always paid for, so the total starts with it
        # whether or not the user ever comes back.
        total_consumption = rng.randint(100, 500)
        repurchase_count = 0
        last_repurchase_date = None
        candidate_date = first_purchase_date
        for _ in range(rng.randint(0, 5)):
            candidate_date = candidate_date + timedelta(days=rng.randint(1, 30))
            if candidate_date > end_date:
                # Outside the observation window: this order never happened,
                # so it must not affect the count, the date or the total.
                break
            repurchase_count += 1
            last_repurchase_date = candidate_date
            total_consumption += rng.randint(100, 500)
        records.append({
            "用户ID": f"U{user_id:03d}",
            "首次购买时间": first_purchase_date.strftime("%Y-%m-%d"),
            "首次商品": first_product,
            "复购次数": repurchase_count,
            "最近复购时间": last_repurchase_date.strftime("%Y-%m-%d") if last_repurchase_date else "-",
            "总消费金额": total_consumption,
        })
    return records


# Number of users to simulate.
num_users = 100
# Observation window.
start_date = datetime(2024, 1, 1)
end_date = datetime(2024, 3, 31)
# Candidate products.
products = ["口红", "面膜", "粉底液", "卸妆油", "防晒霜"]
data = build_repurchase_records(num_users, start_date, end_date, products)
# Build the DataFrame and save it as a CSV file.
df = pd.DataFrame(data)
df.to_csv("电商复购数据.csv", index=False)
print("电商复购数据集已生成:电商复购数据.csv")
还是老话,运行上述代码可以生成电商复购数据.csv的数据集噢。
练习点(注意!):
- 计算用户生命周期价值(LTV):总消费金额 / 复购次数(复购次数为 0 的用户需先单独处理,避免除以零)
- 绘制复购周期分布:用最近复购时间 - 首次购买时间分析粘性
- 找出「沉睡用户」:首次购买后超 60 天未复购(如 U002)
四、数据集设计心机(新手友好型)
- 脏数据埋点:故意加入负数销量、异常日期,强制练习 dropna() / replace()
- 业务逻辑:奶茶店周末销量高、超市促销影响购买决策,贴近真实场景
- 中文注释:所有字段名均为中文,避免 user_id 等英文干扰(如用「用户ID」而非 user_id)
- 小而美:每个文件≤100 行,用 pd.read_csv() 秒开,避免电脑卡顿
使用建议:
- 用 Jupyter 打开数据集,先运行 df.head() 观察数据结构
- 尝试回答:「奶茶店下雨天哪款饮品卖得最好?」(需关联「天气」和「销量」)
- 进阶挑战:用超市数据预测「买了牛奶的用户是否会买面包」(关联规则分析;注意当前商品列表里没有面包,数据中也没有小票号或用户 ID,可先按日期分组,用「纯牛奶 → 酸奶」等已有商品练习)
(所有数据集已打包,请在这篇文章底部领取噢!)