Gaiagpu
部署GaiaGPU
前置工作
- 配置好GPU环境
- 配置好k8s集群环境
gpu-admission
部署deployment
创建文件gpu-quota-admission.yaml
---
apiVersion:
apiVersion: v1
kind: ServiceAccount
metadata:
name: gpu-admission
namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: gpu-admission-as-kube-scheduler
subjects:
- kind: ServiceAccount
name: gpu-admission
namespace: kube-system
roleRef:
kind: ClusterRole
name: system:kube-scheduler
apiGroup: rbac.authorization.k8s.io
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: gpu-admission-as-volume-scheduler
subjects:
- kind: ServiceAccount
name: gpu-admission
namespace: kube-system
roleRef:
kind: ClusterRole
name: system:volume-scheduler
apiGroup: rbac.authorization.k8s.io
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: gpu-admission-as-daemon-set-controller
subjects:
- kind: ServiceAccount
name: gpu-admission
namespace: kube-system
roleRef:
kind: ClusterRole
name: system:controller:daemon-set-controller
apiGroup: rbac.authorization.k8s.io
---
# 创建控制器
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
k8s-app: gpu-quota-admission
name: gpu-quota-admission
namespace: kube-system
spec:
replicas: 1
selector:
matchLabels:
k8s-app: gpu-quota-admission
template:
metadata:
labels:
k8s-app: gpu-quota-admission
namespace: kube-system
spec:
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- preference:
matchExpressions:
- key: node-role.kubernetes.io/master
operator: Exists
weight: 1
containers:
- env:
- name: LOG_LEVEL
value: "4"
- name: EXTRA_FLAGS
value: --incluster-mode=true
image: ccr.ccs.tencentyun.com/tkeimages/gpu-quota-admission:latest
imagePullPolicy: IfNotPresent
name: gpu-quota-admission
ports:
- containerPort: 3456
protocol: TCP
resources:
limits:
cpu: "2"
memory: 2Gi
requests:
cpu: "1"
memory: 1Gi
volumeMounts:
- mountPath: /root/gpu-quota-admission/
name: config
dnsPolicy: ClusterFirstWithHostNet
initContainers:
- command:
- sh
- -c
- ' mkdir -p /etc/kubernetes/ && cp /root/gpu-quota-admission/gpu-quota-admission.config
/etc/kubernetes/'
image: busybox
imagePullPolicy: Always
name: init-kube-config
securityContext:
privileged: true
volumeMounts:
- mountPath: /root/gpu-quota-admission/
name: config
priority: 2000000000
priorityClassName: system-cluster-critical
restartPolicy: Always
serviceAccount: gpu-admission
terminationGracePeriodSeconds: 30
tolerations:
- effect: NoSchedule
key: node-role.kubernetes.io/master
volumes:
- configMap:
defaultMode: 420
name: gpu-quota-admission
name: config
---
# 创建configMap
apiVersion: v1
kind: ConfigMap
metadata:
name: gpu-quota-admission
namespace: kube-system
data:
gpu-quota-admission.config: |
{
"QuotaConfigMapName": "gpuquota",
"QuotaConfigMapNamespace": "kube-system",
"GPUModelLabel": "gaia.tencent.com/gpu-model",
"GPUPoolLabel": "gaia.tencent.com/gpu-pool"
}
---
# 创建service
apiVersion: v1
kind: Service
metadata:
name: gpu-quota-admission
namespace: kube-system
spec:
ports:
- port: 3456
protocol: TCP
targetPort: 3456
selector:
k8s-app: gpu-quota-admission
type: ClusterIP
kubectl apply -f gpu-quota-admission.yaml
创建自定义调度文件
创建文件/etc/kubernetes/scheduler-policy-config.json