### Installing Helm
Reference: https://www.cnblogs.com/ericnie/p/8463127.html
Download the release tarball for the desired version from:
https://github.com/helm/helm/releases
Extract it and move the binary onto your PATH:
tar -xvzf $HELM.tar.gz
mv linux-amd64/helm /usr/local/bin/helm
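A quick sanity check that the client binary is on the PATH (Tiller is not deployed yet, so only the client version is queried):
```bash
# Print the Helm v2 client version without contacting Tiller
helm version --client
```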
Create a service account for Tiller and bind it to the cluster-admin role:
tiller.yaml
```yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: tiller
  namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRoleBinding
metadata:
  name: tiller
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: cluster-admin
subjects:
- kind: ServiceAccount
  name: tiller
  namespace: kube-system
```
$ kubectl create -f tiller.yaml
$ helm init --service-account tiller --upgrade -i registry.cn-hangzhou.aliyuncs.com/google_containers/tiller:v2.12.3 --skip-refresh
Alternatively, if Tiller is already deployed, create the service account and binding directly and patch the existing deployment to use it:
$ kubectl create serviceaccount --namespace kube-system tiller
$ kubectl create clusterrolebinding tiller-cluster-rule --clusterrole=cluster-admin --serviceaccount=kube-system:tiller
$ kubectl patch deploy --namespace kube-system tiller-deploy -p '{"spec":{"template":{"spec":{"serviceAccount":"tiller"}}}}'
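Once `helm init` finishes, Tiller should come up in kube-system; a minimal check, assuming the standard Helm v2 labels on the Tiller deployment:
```bash
# The Tiller pod should reach Running state
kubectl get pods --namespace kube-system -l app=helm,name=tiller
# Prints both client and server versions once Tiller is reachable
helm version
```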
### Deploying FfDL
$ kubectl config set-context $(kubectl config current-context) --namespace=ivdai
export VM_TYPE=none
export PUBLIC_IP=<Cluster Public IP>
export NAMESPACE=ivdai
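Note: `kubectl config set-context` above assumes the ivdai namespace already exists; if it does not, create it first:
```bash
# Create the target namespace before switching the kubectl context to it
kubectl create namespace ivdai
```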
Create an NFS export to back the persistent volume:
# Create the shared directory
$ sudo mkdir -p /data-nfs
# Install the NFS kernel server
$ sudo apt update
$ sudo apt install -y nfs-kernel-server
# Update /etc/exports (sudo on echo is unnecessary; tee performs the privileged write)
$ echo "/data-nfs *(rw,no_root_squash,no_subtree_check)" | sudo tee -a /etc/exports
# Restart the NFS kernel server to pick up the new export
$ sudo service nfs-kernel-server restart
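Before pointing Kubernetes at the share, it is worth confirming the export is live; a quick check with the standard NFS utilities:
```bash
# List the directories this server currently exports
sudo exportfs -v
# Query the server as a client would (showmount ships with nfs-common)
showmount -e localhost
```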
Define the PersistentVolume in test_pv.yaml (point `server` at your NFS host's IP):
```yaml
apiVersion: v1
kind: PersistentVolume
metadata:
  name: pv0001
  labels:
    type: dlaas-static-volume
spec:
  capacity:
    storage: 200Gi
  accessModes:
  - ReadWriteMany
  nfs:
    path: /data-nfs
    server: 192.168.8.110
```
$ kubectl create -f test_pv.yaml
From the directory containing the FfDL Helm chart, install it and check the pods:
$ helm install .
$ kubectl get pod
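Before submitting work, check that the volume bound and the FfDL pods settled:
```bash
# The PV should show STATUS Bound once FfDL's claim selects it
kubectl get pv pv0001
# Watch until every FfDL pod reports Running (Ctrl-C to stop)
kubectl get pods --watch
```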
node_ip=$PUBLIC_IP
grafana_port=$(kubectl get service grafana -o jsonpath='{.spec.ports[0].nodePort}')
ui_port=$(kubectl get service ffdl-ui -o jsonpath='{.spec.ports[0].nodePort}')
restapi_port=$(kubectl get service ffdl-restapi -o jsonpath='{.spec.ports[0].nodePort}')
s3_port=$(kubectl get service s3 -o jsonpath='{.spec.ports[0].nodePort}')
echo "Monitoring dashboard: http://$node_ip:$grafana_port/ (login: admin/admin)"
echo "Web UI: http://$node_ip:$ui_port/#/login?endpoint=$node_ip:$restapi_port&username=test-user"
Using the FfDL-local S3-based object storage:
s3_url=http://$node_ip:$s3_port
export AWS_ACCESS_KEY_ID=admin; export AWS_SECRET_ACCESS_KEY=password; export AWS_DEFAULT_REGION=us-east-1;
s3cmd="aws --endpoint-url=$s3_url s3"
$s3cmd mb s3://trainingdata
$s3cmd mb s3://trainedmodel
$s3cmd mb s3://mnist_lmdb_data
$s3cmd mb s3://dlaas-trained-models
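A quick check that the buckets exist on the local object store, reusing the $s3cmd alias defined above:
```bash
# List all buckets on the FfDL-local S3 endpoint
$s3cmd ls
```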
mkdir -p tmp
for file in t10k-images-idx3-ubyte.gz t10k-labels-idx1-ubyte.gz train-images-idx3-ubyte.gz train-labels-idx1-ubyte.gz;
do
test -e tmp/$file || wget -q -O tmp/$file http://yann.lecun.com/exdb/mnist/$file
$s3cmd cp tmp/$file s3://trainingdata/$file
done
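After the loop finishes, the four MNIST archives should be visible in the bucket:
```bash
# Confirm all four .gz files were uploaded
$s3cmd ls s3://trainingdata/
```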
Point the FfDL CLI at the REST API, then rewrite the example manifest to use the node's S3 endpoint:
export DLAAS_URL=http://$node_ip:$restapi_port; export DLAAS_USERNAME=test-user; export DLAAS_PASSWORD=test;
if [ "$(uname)" = "Darwin" ]; then
sed -i '' s/s3.default.svc.cluster.local/$node_ip:$s3_port/ etc/examples/tf-model/manifest.yml
else
sed -i s/s3.default.svc.cluster.local/$node_ip:$s3_port/ etc/examples/tf-model/manifest.yml
fi
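To confirm the substitution took effect, check that the in-cluster hostname is gone from the manifest:
```bash
# grep finds nothing once the endpoint has been rewritten
grep -n "s3.default.svc.cluster.local" etc/examples/tf-model/manifest.yml || echo "endpoint rewritten"
```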
CLI_CMD=$(pwd)/cli/bin/ffdl-$(if [ "$(uname)" = "Darwin" ]; then echo 'osx'; else echo 'linux'; fi)
$CLI_CMD train etc/examples/tf-model/manifest.yml etc/examples/tf-model
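Once the job is submitted, progress can be followed from the same CLI; `list` and `logs` are the usual FfDL CLI subcommands (exact flags may vary with the CLI version):
```bash
# List submitted training jobs and their status
$CLI_CMD list
# Stream the logs for a job, substituting the model ID printed by `train`
$CLI_CMD logs <model-id>
```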