---
# https://kubernetes.io/docs/concepts/workloads/controllers/daemonset/
#
# Installs NVIDIA kernel/driver packages and the container toolkit on the
# HOST of every node that has an NVIDIA PCI device. The init container uses
# nsenter (via hostPID + privileged) to run the install script in the host's
# mount namespace, so apt-get operates on the host OS, not the alpine image.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-init
  namespace: kube-system
  labels:
    root-init.k8s.exa.fi/component-name: nvidia-init
spec:
  selector:
    matchLabels:
      root-init.k8s.exa.fi/component-name: nvidia-init
  template:
    metadata:
      labels:
        root-init.k8s.exa.fi/component-name: nvidia-init
    spec:
      tolerations:
        # this toleration is to have the daemonset runnable on master nodes
        # remove it if your masters can't run pods
        - key: node-role.kubernetes.io/master
          effect: NoSchedule
      initContainers:
        # this is implemented dorkily like this to cause it to be recreated
        # whenever there's a change. this is intended to eventually be moved
        # into a helm chart to use in argocd where this will make more sense
        - name: install-packages
          image: alpine:3.7
          # nsenter into PID 1's mount namespace: everything in the script
          # below runs against the host filesystem, not the alpine image.
          command: ["nsenter", "--mount=/proc/1/ns/mnt", "--", "sh", "-c"]
          args:
            - |-
              # skip entirely on nodes without an NVIDIA (vendor 10de) PCI device
              PCI_VENDORS="10de"
              VEN_FOUND=no
              for VEN in $PCI_VENDORS;do
                VEN=$(echo "$VEN" | tr A-Z a-z)
                if grep -Eq '^[0-9a-f]*\s*'"$VEN" /proc/bus/pci/devices;then
                  VEN_FOUND=yes
                fi
              done
              if [ $VEN_FOUND = no ];then
                1>&2 echo "no nvidia devices seen. skipping package init"
                exit 0
              fi
              # CUDA_KEYRING / CUDA_REPO / NCT_KEYRING / NCT_REPO come from the
              # nvidia-init-artifacts ConfigMap (see envFrom below); keyrings
              # are base64-encoded GPG keyrings decoded onto the host.
              CUDA_KEYRING_PATH=/usr/share/keyrings/cuda-archive-keyring.gpg
              CUDA_LIST_PATH=/etc/apt/sources.list.d/cuda.list
              echo "$CUDA_KEYRING" | base64 -d > "$CUDA_KEYRING_PATH"
              echo "deb [signed-by=$CUDA_KEYRING_PATH] $CUDA_REPO /" > "$CUDA_LIST_PATH"
              NCT_KEYRING_PATH=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
              NCT_LIST_PATH=/etc/apt/sources.list.d/nvidia-container-toolkit.list
              echo "$NCT_KEYRING" | base64 -d > "$NCT_KEYRING_PATH"
              echo "deb [signed-by=$NCT_KEYRING_PATH] $NCT_REPO /" > "$NCT_LIST_PATH"
              apt-get update
              apt-get install -y nvidia-kernel-dkms nvidia-container-toolkit cuda
          securityContext:
            # privileged is required for nsenter into the host mount namespace
            privileged: true
          envFrom:
            - configMapRef:
                name: nvidia-init-artifacts
      containers:
        # the real work happens in the init container; this pause container
        # just keeps the pod "Running" so the DaemonSet stays satisfied
        - name: finished-sleep-forever
          image: k8s.gcr.io/pause:3.1
          securityContext:
            privileged: false
      terminationGracePeriodSeconds: 0
      hostNetwork: true
      # hostPID exposes /proc/1, which nsenter needs to reach the host's
      # mount namespace
      hostPID: true