# kube-cascade/node-init/nvidia-init.yaml

---
# DaemonSet that prepares NVIDIA GPU nodes.
#
# On every node an init container enters the host's mount namespace and, if a
# PCI device with an NVIDIA vendor ID is present, registers the CUDA and
# nvidia-container-toolkit apt repositories on the host and installs the
# driver, toolkit and CUDA packages there. A pause container then keeps the
# pod in the Running state once the init step has finished.
#
# The keyrings and repository URLs are read from environment variables
# (CUDA_KEYRING, CUDA_REPO, NCT_KEYRING, NCT_REPO) injected from the
# `nvidia-init-artifacts` ConfigMap.
#
# https://kubernetes.io/docs/concepts/workloads/controllers/daemonset/
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-init
  namespace: kube-system
  labels:
    root-init.k8s.exa.fi/component-name: nvidia-init
spec:
  selector:
    matchLabels:
      root-init.k8s.exa.fi/component-name: nvidia-init
  template:
    metadata:
      labels:
        root-init.k8s.exa.fi/component-name: nvidia-init
    spec:
      tolerations:
        # these tolerations are to have the daemonset runnable on master /
        # control-plane nodes; remove them if your masters can't run pods.
        # `master` is the legacy taint name, `control-plane` its replacement.
        - key: node-role.kubernetes.io/master
          operator: Exists
          effect: NoSchedule
        - key: node-role.kubernetes.io/control-plane
          operator: Exists
          effect: NoSchedule
      initContainers:
        # this is implemented as an inline script so that the pod template
        # changes (and the pods are recreated) whenever the script changes.
        # this is intended to eventually be moved into a helm chart to use in
        # argocd where this will make more sense
        - name: install-packages
          image: alpine:3.7
          # nsenter switches into the mount namespace of PID 1; with
          # hostPID enabled below that is the node's init process, so the
          # script runs against the host filesystem using the host's tools.
          command: ["nsenter", "--mount=/proc/1/ns/mnt", "--", "sh", "-c"]
          args:
            - |-
              # abort on the first failing command or unset variable so a
              # half-configured host is reported as a failed init container
              set -eu

              # lower-case hex PCI vendor IDs to look for (10de = NVIDIA)
              PCI_VENDORS="10de"
              VEN_FOUND=no
              for VEN in $PCI_VENDORS; do
                VEN=$(echo "$VEN" | tr 'A-Z' 'a-z')
                # each line is "<bus/devfn><whitespace><vendor><device>...";
                # require a non-empty first field and a separator so the
                # vendor ID can only match at the start of the second field
                if grep -Eq "^[0-9a-f]+[[:space:]]+${VEN}" \
                  /proc/bus/pci/devices; then
                  VEN_FOUND=yes
                fi
              done
              if [ "$VEN_FOUND" = no ]; then
                echo "no nvidia devices seen. skipping package init" >&2
                exit 0
              fi

              # fail with a clear message if the ConfigMap is incomplete
              : "${CUDA_KEYRING:?CUDA_KEYRING is not set}"
              : "${CUDA_REPO:?CUDA_REPO is not set}"
              : "${NCT_KEYRING:?NCT_KEYRING is not set}"
              : "${NCT_REPO:?NCT_REPO is not set}"

              # CUDA apt repository: keyring (base64 in env) + sources entry
              CUDA_KEYRING_PATH=/usr/share/keyrings/cuda-archive-keyring.gpg
              CUDA_LIST_PATH=/etc/apt/sources.list.d/cuda.list
              echo "$CUDA_KEYRING" | base64 -d > "$CUDA_KEYRING_PATH"
              echo "deb [signed-by=$CUDA_KEYRING_PATH] $CUDA_REPO /" > "$CUDA_LIST_PATH"

              # nvidia-container-toolkit apt repository, same scheme
              NCT_KEYRING_PATH=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
              NCT_LIST_PATH=/etc/apt/sources.list.d/nvidia-container-toolkit.list
              echo "$NCT_KEYRING" | base64 -d > "$NCT_KEYRING_PATH"
              echo "deb [signed-by=$NCT_KEYRING_PATH] $NCT_REPO /" > "$NCT_LIST_PATH"

              # there is no terminal here; never wait on a debconf prompt
              export DEBIAN_FRONTEND=noninteractive
              apt-get update
              apt-get install -y nvidia-kernel-dkms nvidia-container-toolkit cuda
          securityContext:
            # required for nsenter to join the host mount namespace
            privileged: true
          envFrom:
            - configMapRef:
                name: nvidia-init-artifacts
      containers:
        # does nothing; exists so the pod has a long-running main container
        - name: finished-sleep-forever
          image: registry.k8s.io/pause:3.1
          securityContext:
            privileged: false
      terminationGracePeriodSeconds: 0
      # NOTE(review): presumably so apt-get uses the node's own network
      # configuration - confirm whether this is still needed
      hostNetwork: true
      # makes /proc/1 inside the pod refer to the node's init process
      hostPID: true