kaggle 如何利用API下载数据集

artificiali

已于 2025-06-19 15:14:52 修改

阅读量689

点赞数 3

CC 4.0 BY-SA版权

分类专栏： kaggle比賽文章标签： python 人工智能机器学习

于 2024-11-12 20:08:10 首次发布

本文链接：https://blog.youkuaiyun.com/artificiali/article/details/143722741

kaggle比賽专栏收录该内容

5 篇文章

订阅专栏

your_project/
├── kaggle/ # Kaggle 专用目录
│ ├── input/ # 数据集（自动映射到 Kaggle Kernels 的 /kaggle/input）
│ └── working/ # 临时输出（映射到 Kaggle Kernels 的 /kaggle/working）
├── src/ # 核心代码存放位置（重点！）
│ ├── utils/ # 工具函数
│ ├── models/ # 模型定义
│ ├── config.py # 路径配置
│ └── main.py # 主程序
├── notebooks/ # Jupyter Notebook 文件
├── requirements.txt # 依赖库列表
└── README.md # 项目说明

第一步API 密钥：

首先上传在 kaggle 官网生成的 API 密钥文件 kaggle.json，放到该代码的同一目录下，再运行以下代码。

'''
Upload the kaggle.json file first, then run this code to install it as
the Kaggle API credential file.
'''
import os
import shutil

# Create the Kaggle config directory. exist_ok=True replaces the separate
# existence check, which could race with another process creating it.
kaggle_dir = os.path.expanduser('~/.kaggle')
os.makedirs(kaggle_dir, exist_ok=True)

# Look for kaggle.json in the current working directory.
src_file = 'kaggle.json'
if os.path.exists(src_file):
    # Move the credential file into place and restrict its permissions.
    dst_file = os.path.join(kaggle_dir, 'kaggle.json')
    shutil.move(src_file, dst_file)
    os.chmod(dst_file, 0o600)  # Kaggle expects the file to be mode 600
    print("文件已成功移动并设置权限！")
else:
    print(f"错误：当前目录下未找到 {src_file}，请确保文件存在。")

拉取笔记本

!pip install kaggle
try: 
    import kaggle
except: 
    !pip install kaggle
!kaggle kernels pull brendanartley/hgnet-v2-starter

注：只需要把 'brendanartley/hgnet-v2-starter' 替换成目标笔记本的标识即可；每个笔记本页面都提供了可以直接复制的 API 命令。

拉取数据

注：

如果使用 kagglehub 下载，默认下载目录是主目录下的 ~/.cache/kagglehub/；若要下载到其他位置，必须通过环境变量 KAGGLEHUB_CACHE 修改：
import os
import time

# Redirect the kagglehub cache to a custom directory.
# Shell equivalent: export KAGGLEHUB_CACHE=/your/custom/path
os.environ["KAGGLEHUB_CACHE"] = "/usropt2190"  # change this to your own path

import kagglehub

# Try the download a few times, since a single attempt may fail transiently.
max_attempts = 3
last_error = None
for i in range(max_attempts):
    try:
        kagglehub.dataset_download("egortrushin/open-wfi-test")
        break
    except Exception as e:
        last_error = e
        print(f"下载失败，第{i+1}次重试，错误：{e}")
        # Wait before the next attempt; sleeping after the final one is wasted time.
        if i + 1 < max_attempts:
            time.sleep(5)
else:
    # Every attempt failed: stop here instead of continuing without the data.
    raise RuntimeError(f"数据集下载失败：已尝试 {max_attempts} 次") from last_error
只需要修改 dataset_download 中的数据集名称（例如 "egortrushin/open-wfi-test"），就可以下载你指定的数据集。

jupyter文件运行

# Make sure the destination directory exists
mkdir -p your_project/kaggle/input

# Download the dataset archive into that directory
kaggle datasets download egortrushin/open-wfi-test --path your_project/kaggle/input


import os
import zipfile

# Archive to unpack (relative to the current working directory) and the
# directory that receives its contents.
# NOTE(review): confirm 'archive.zip' is the actual name of the downloaded
# file; archives fetched with the Kaggle CLI may be named differently.
zip_file_path = 'archive.zip'
extract_to_dir = os.path.join(*('kaggle', 'input', 'openfwi-preprocessed-72x72'))

# Create the target directory (and any missing parents) when it is absent.
os.makedirs(extract_to_dir, exist_ok=True)

# Unpack every member of the archive into the target directory.
with zipfile.ZipFile(zip_file_path, mode='r') as zip_ref:
    zip_ref.extractall(path=extract_to_dir)

print(f"ZIP文件已成功解压到 {extract_to_dir}")

总代码

'''
Upload the kaggle.json file first, then run this code to install it as
the Kaggle API credential file.
'''
import os
import shutil

# Create the Kaggle config directory. exist_ok=True replaces the separate
# existence check, which could race with another process creating it.
kaggle_dir = os.path.expanduser('~/.kaggle')
os.makedirs(kaggle_dir, exist_ok=True)

# Look for kaggle.json in the current working directory.
src_file = 'kaggle.json'
if os.path.exists(src_file):
    # Move the credential file into place and restrict its permissions.
    dst_file = os.path.join(kaggle_dir, 'kaggle.json')
    shutil.move(src_file, dst_file)
    os.chmod(dst_file, 0o600)  # Kaggle expects the file to be mode 600
    print("文件已成功移动并设置权限！")
else:
    print(f"错误：当前目录下未找到 {src_file}，请确保文件存在。")

# 下载竞赛数据 -p 下载地址
!kaggle competitions download -c child-mind-institute-problematic-internet-use -p ./data

# 检查下载的文件
import os
print(os.listdir('./data'))

# 解压下载的 ZIP 文件
import zipfile

with zipfile.ZipFile('./data/child-mind-institute-problematic-internet-use.zip', 'r') as zip_ref:
    zip_ref.extractall('./data')

# 加载数据
import pandas as pd

# 假设下载的是 train.csv 文件
data = pd.read_csv('./data/train.csv')

# 查看数据的前几行
data.head()

环境检测

输出路径

import os

# Directory the kernel is currently running in.
working_dir = os.getcwd()
# Notebook name taken from the JPY_NOTEBOOK_NAME environment variable;
# falls back to an empty string when the variable is not set.
notebook_name = os.environ.get("JPY_NOTEBOOK_NAME", "")
notebook_path = os.path.join(working_dir, notebook_name)
print("当前 Notebook 的路径:", notebook_path)

查看空间：df -h

输出显卡信息

 nvidia-smi

# Run inside a Python session to release GPU memory cached by PyTorch.
# (This snippet only touches PyTorch; it does nothing for TensorFlow.)
import gc

import torch

if torch.cuda.is_available():
    # Collect garbage first so unreferenced tensors are destroyed; only then
    # can empty_cache() release the blocks they occupied.
    gc.collect()
    torch.cuda.empty_cache()

wandb

try: 
    import wandb
except: 
    !pip install --no-deps wandb -q

!wandb login apixxxxxxxxxxx

benchmark.py

import torch

# Report the PyTorch build and the CUDA version it was compiled against.
print('Pytorch version\t:', torch.__version__)
print('CUDA version\t:', torch.version.cuda)
# get_device_name() raises when no CUDA device is usable, so report that
# case explicitly instead of aborting the environment summary.
if torch.cuda.is_available():
    print('GPU\t\t:', torch.cuda.get_device_name())
else:
    print('GPU\t\t:', 'N/A (CUDA unavailable)')
import inspect
from collections import defaultdict
import pandas as pd
from torch.utils import benchmark

# Limit pandas' floating-point display precision to 3 when printing results.
pd.options.display.precision = 3

def var_dict(*args):
    """Map each argument back to the name it is bound to in the caller.

    The caller's local namespace is searched by object identity; for every
    argument the first matching (name, value) pair is used.

    Args:
        *args: Objects that are bound to names in the calling frame.

    Returns:
        A dict of ``{name: object}`` with one entry per argument.

    Raises:
        IndexError: If an argument is not bound to any name in the caller.
    """
    caller_scope = inspect.currentframe().f_back.f_locals.items()
    found = {}
    for target in args:
        # Identity (not equality) decides the match, so equal-but-distinct
        # objects are never confused with each other.
        matches = [(label, obj) for label, obj in caller_scope if obj is target]
        label, obj = matches[0]
        found[label] = obj
    return found

def walltime(stmt, arg_dict, duration=3):
    """Time `stmt` with torch's benchmark Timer and return the median.

    Args:
        stmt: Code snippet to time, given as a string.
        arg_dict: Names made available to `stmt` (passed as the Timer's globals).
        duration: Minimum total run time handed to ``blocked_autorange``.

    Returns:
        The ``median`` attribute of the resulting measurement.
    """
    timer = benchmark.Timer(stmt=stmt, globals=arg_dict)
    measurement = timer.blocked_autorange(min_run_time=duration)
    return measurement.median
        
# Measured matmul throughput in TFLOPS, keyed by matrix size and then dtype.
matmul_tflops = defaultdict(dict)
for n in [128, 512, 2048, 8192, 8192*2]:
    for dtype in (torch.float32, torch.float16):
        # Allocate directly on the GPU; building the matrices on the CPU and
        # copying them over is slow and memory-hungry for the largest sizes.
        a = torch.randn(n, n, dtype=dtype, device='cuda')
        b = torch.randn(n, n, dtype=dtype, device='cuda')
        t = walltime('a @ b', var_dict(a, b))
        # An n x n by n x n matmul is counted as 2*n**3 floating-point operations.
        matmul_tflops[f'n={n}'][dtype] = 2*n**3 / t / 1e12
        # Drop the references before the next (possibly larger) allocation.
        del a, b

print(pd.DataFrame(matmul_tflops))

Hugging Face 镜像加速

import os

# 设置 Hugging Face 镜像源（中国大陆用户）
#export HF_ENDPOINT=https://hf-mirror.com
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

# 安装高速传输库
!pip install hf-transfer
# 设置环境变量启用加速
# export HF_HUB_ENABLE_HF_TRANSFER=1
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '1'