1.安装docker
(https://docs.docker.com/engine/install/ubuntu/)
按照上面官方文档完成 Docker 的安装后，再安装 nvidia toolkit
# 移除现有的Docker软件包及其依赖项:
sudo apt-get remove docker docker-engine docker.io containerd runc
# 安装依赖项并添加Docker官方GPG密钥:
sudo apt-get update
sudo apt-get install \
apt-transport-https \
ca-certificates \
curl \
gnupg \
lsb-release
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg
# 添加Docker官方软件包仓库:
echo \
"deb [arch=amd64 signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu \
$(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
# 更新软件包列表并安装Docker:
sudo apt-get update
sudo apt-get install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
# 重启docker
sudo systemctl restart docker
# 安装nvidia toolkit
distribution=$(. /etc/os-release;echo $ID$VERSION_ID) \
&& curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
&& curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list | \
sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
sudo apt-get update
sudo apt-get install -y nvidia-container-toolkit
2. 把docker挂载到磁盘空间较大的目录
2.1 改法1
/data/docker/
# 停止Docker运行
sudo service docker stop
# 挂载到磁盘空间较大的文件夹下面
sudo dockerd --data-root /data/docker
2.2 改法2
# 查看路径
docker info | grep 'Docker Root Dir'
# 更改文件
sudo mkdir /data/docker
sudo vim /etc/docker/daemon.json
# 添加
{
"data-root": "/data/docker"
}
# 重启docker
sudo systemctl restart docker
3. 安装pytorch的docker
官方镜像的地址: (https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch/tags)
我在服务器挂载的磁盘上新建了一个文件夹pyt_env_1.12, 用来存放挂载到pytorch容器中的文件, 方便用git管理
/data/pyt_env_1.12/
docker run --gpus all -it --name env_pyt_1.12 -v /data/pyt_env_1.12:/app nvcr.io/nvidia/pytorch:22.03-py3
4. 检查容器内是否成功安装上pytorch
# 容器内部检查pytorch可用性
$ python
>>> import torch
>>> torch.__version__
>>> print(torch.cuda.is_available())
True
5. 打开container新建一个python文件
import torch
import torchvision
print("PyTorch version:", torch.__version__)
print("Torchvision version:", torchvision.__version__)
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("CUDA is available on", device)
else:
    print("CUDA is not available")
PyTorch version: 1.12.0a0+2c916ef
Torchvision version: 0.13.0a0
CUDA is available on cuda
成功!
6. 安装TensorRT的docker
我新建了一个名为trt_env的文件夹, 用来挂载TensorRT的文件
docker run --gpus all -it --name env_trt -v /data/trt_env:/app nvcr.io/nvidia/tensorrt:22.08-py3
7. 创建自定义镜像
# 启动容器
docker start env_trt
# 查看当前正在使用的容器
docker ps
# 停止使用容器
docker stop env_trt
# 创建自定义镜像
docker commit env_trt env_trt_img
# 未映射端口
docker run --gpus all -it --name env_trt -v $(pwd):/app nvcr.io/nvidia/tensorrt:22.08-py3
8. 腾讯云小问题
需要按照以下步骤添加 Docker 仓库的 GPG 密钥，否则无法正常执行 sudo apt-get update。
首先,尝试使用以下命令导入 GPG 密钥:
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg
然后,将 Docker 仓库添加到您的系统源列表中:
echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
sudo apt-get update