---
# https://kubernetes.io/docs/concepts/workloads/controllers/daemonset/
#
# Installs NVIDIA kernel/driver packages and the container toolkit on the
# HOST of every node that has an NVIDIA PCI device. The init container uses
# nsenter (via hostPID + privileged) to run the install script in the host's
# mount namespace, so apt-get operates on the host OS, not the alpine image.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-init
  namespace: kube-system
  labels:
    root-init.k8s.exa.fi/component-name: nvidia-init
spec:
  selector:
    matchLabels:
      root-init.k8s.exa.fi/component-name: nvidia-init
  template:
    metadata:
      labels:
        root-init.k8s.exa.fi/component-name: nvidia-init
    spec:
      tolerations:
        # this toleration is to have the daemonset runnable on master nodes
        # remove it if your masters can't run pods
        - key: node-role.kubernetes.io/master
          effect: NoSchedule
      initContainers:
        # this is implemented dorkily like this to cause it to be recreated
        # whenever there's a change. this is intended to eventually be moved
        # into a helm chart to use in argocd where this will make more sense
        - name: install-packages
          image: alpine:3.7
          # nsenter into PID 1's mount namespace: everything in the script
          # below runs against the host filesystem, not the alpine image.
          command: ["nsenter", "--mount=/proc/1/ns/mnt", "--", "sh", "-c"]
          args:
            - |-
              # skip entirely on nodes without an NVIDIA (vendor 10de) PCI device
              PCI_VENDORS="10de"
              VEN_FOUND=no
              for VEN in $PCI_VENDORS;do
                VEN=$(echo "$VEN" | tr A-Z a-z)
                if grep -Eq '^[0-9a-f]*\s*'"$VEN" /proc/bus/pci/devices;then
                  VEN_FOUND=yes
                fi
              done
              if [ $VEN_FOUND = no ];then
                1>&2 echo "no nvidia devices seen. skipping package init"
                exit 0
              fi
              # CUDA_KEYRING / CUDA_REPO / NCT_KEYRING / NCT_REPO come from the
              # nvidia-init-artifacts ConfigMap (see envFrom below); keyrings
              # are base64-encoded GPG keyrings decoded onto the host.
              CUDA_KEYRING_PATH=/usr/share/keyrings/cuda-archive-keyring.gpg
              CUDA_LIST_PATH=/etc/apt/sources.list.d/cuda.list
              echo "$CUDA_KEYRING" | base64 -d > "$CUDA_KEYRING_PATH"
              echo "deb [signed-by=$CUDA_KEYRING_PATH] $CUDA_REPO /" > "$CUDA_LIST_PATH"
              NCT_KEYRING_PATH=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
              NCT_LIST_PATH=/etc/apt/sources.list.d/nvidia-container-toolkit.list
              echo "$NCT_KEYRING" | base64 -d > "$NCT_KEYRING_PATH"
              echo "deb [signed-by=$NCT_KEYRING_PATH] $NCT_REPO /" > "$NCT_LIST_PATH"
              apt-get update
              apt-get install -y nvidia-kernel-dkms nvidia-container-toolkit cuda
          securityContext:
            # privileged is required for nsenter into the host mount namespace
            privileged: true
          envFrom:
            - configMapRef:
                name: nvidia-init-artifacts
      containers:
        # the real work happens in the init container; this pause container
        # just keeps the pod "Running" so the DaemonSet stays satisfied
        - name: finished-sleep-forever
          image: k8s.gcr.io/pause:3.1
          securityContext:
            privileged: false
      terminationGracePeriodSeconds: 0
      hostNetwork: true
      # hostPID exposes /proc/1, which nsenter needs to reach the host's
      # mount namespace
      hostPID: true