diff --git a/node-init/nvidia-init-artifacts.yaml b/node-init/nvidia-init-artifacts.yaml
new file mode 100644
index 0000000..d8a994c
--- /dev/null
+++ b/node-init/nvidia-init-artifacts.yaml
@@ -0,0 +1,77 @@
+---
+# Inputs for the nvidia-init DaemonSet: its install-packages init container
+# loads every key below as an environment variable (envFrom).
+# *_REPO values are apt repository URLs; *_KEYRING values are base64 data
+# that the init script decodes with `base64 -d` into /usr/share/keyrings and
+# references through signed-by in the matching apt source entry.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: nvidia-init-artifacts
+  namespace: kube-system
+data:
+  CUDA_REPO: https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/
+  NCT_REPO: https://nvidia.github.io/libnvidia-container/stable/deb/amd64
+  NCT_KEYRING: |
+    mQINBFnNWDEBEACiX68rxIWvqH3h2GykO25oK9BAqV8fDtb6lXEbw3eKx4g87BRzM3DQBA0S0Ifk
+    Q72ovJ33H50+gVTXuu+Zme5muWk72m3pApccZVDLqdzYlpWPruNbMC+IlWr70yo8Jw8Zr1ihbWjF
+    vMbDJTkgqPt2djNq3xxvdiKoZlgnpLRKIpSu9iBQlNoZLHxTQKFH4219L77prRogv2QV1ckBL5lD
+    VOERJuHo4jHE8mm9/NZ6v3m2HGuuAEZ7T9nWlPGiAIP8Pww4ZRTJcBANcI2EFKPLdfP61HTH6w0k
+    VMkoAaGlemadTDl3ZcLpUpTFLc+ko/2uQ1qVPx9QYyoMrorS3kUmlXrhsA7FvcB09aIcb+JX6SVk
+    cbO5A5+baCa3owwUtFBXMHM5hqpLv4P3/GsuW6283YwLZCf53dJY4lJZePqzPGsvs/wSvhnZrFvb
+    61i/Aqm0hjhVh7h6VNxUiE8geMcjxy29LtzajoyS0EPVxes4xZu0VbS78LQyCNHSpS7TFmtVUQmb
+    XqDN7cpiyr9+yutr0lZOMc7NYQt0nP/3RtYkWEob6wXarVImHas1OYzlZymdO1uAnqkediS61E2v
+    SD1OEq37/375FB/Q3AYXuNkQzDjYoJJz9wsv7Xp0bdPzQ/daLdIFNQXo5MmVIirsWM07JvbZaJhD
+    OiJxGn0MPf11/QARAQABtEBOVklESUEgQ09SUE9SQVRJT04gKE9wZW4gU291cmNlIFByb2plY3Rz
+    KSA8Y3VkYXRvb2xzQG52aWRpYS5jb20+iQI4BBMBCgAiBQJZzVgxAhsPBgsJCggHAwUVCgkICwUW
+    AgMBAAIeAQIXgAAKCRDdyuBE95bssAh6EACgUCww2sr8sOztEHKhvdCsonXuTHYbel3YlWmVDPbh
+    4dA31xoRXlvSJptJzPi/zlTc9fkVSFGbEZbFRR4JjnwYTMLDElMh5YRMYAoPVYhWGKIO4earu32G
+    hFuPjfr6h+0xNaQeDPIbr7bPe/AEhLSdJMzIOuAifr7UaC65A6YlxfeaSqyt0HthYujoQ12cWxP9
+    98C5jkc0IN2tyLs/OD7HLHht+lafqDSylykx63cw7jvsV/15rqZwVwjhkcxZyrKET32MTjXF3cxn
+    7+TGpKS8B1k4a/EI7uXnncfSoma0dAT9bZM9JZbXQmSzCPDHHuVtnQ/3uh8VyenpigTFnrb20LCy
+    6WzJd3O9lAZXLhvwF/By3a07WLzRtTZNaUpt37Anb0js2syr3lohbmK9i3xvuqZNzhGPbqu9IV+v
+    FgSGyTHRJUSBlHKDGiCdOOHc20MLPW1yRCXbx0F4eS9TWchYyJkJNNczD5DnEl/gsvL4NCRxa+oU
+    yUhhJ1HpJ6YNmTsy6nAAKIC+6248o164GiavaR3z03RfaQayGHAUrBKi+PJBY7efgsZeYT8f+hyY
+    rIC04MO8poBKS/GvSUL2QtVtj59Nq+95gIptW2mZM8KRpt2huLH+QQ8SKr1vAECbpKJOwseqKmVy
+    xX02iaSE8ifLE+tXFE8YgS3CZjWwy5PD0LkBDQRdgpCQAQgAx1oxX9tFlv3CIva0CJ0dsZyNF7mg
+    HPgNszccUYLu0chyWYvwiVU/OlCzivytNX56wgeBgIVV1QzeBuTkrJSgzJ+dSgfrmyg5RwIDhvH+
+    Dcut0++6+di1LyH9gXQcYPrN3pf4yR8nlRbm6K0Vsp0Z4+br18QelURerfAkRordag26aB+MzVLv
+    loHHu3Z6/v321uTGMdFd8CVCjovec5+EdcIAam3U/MmZe2mr2M/x6F3st30cE7umq9Bb6UCqc6L8
+    bQcoloxR3rwFzL1u9wUBUzQlaMNmxbe0BfezkmSQeC8JN4Fku+DtHEpS9uP5JEYNEEQ66K4mJDTM
+    r0whBv1fKQARAQABiQNbBBgBCgAmAhsCFiEEyVsyG2HojBgJxPdZ3crgRPeW7LAFAl7oD1gFCQNG
+    skgBKcBdIAQZAQoABgUCXYKQkAAKCRBu2RyjrBFgzZ/WB/9TuD2qzaBO7HlPDWRUTpFlvFgyDc3X
+    yfTAC/ISeYbIcPcq5kmVHgpsMdbN9Vvmot5GuT7VWzhHc9sJCmHgL330glBtNtSRflKzlBYnbiSW
+    xLFYZtu2BtNOk8Ylbw8qw1E6W/iFBrqAwgeZvs2VOcPU3203Mqfi1JbS+YHC/bgs6cNq0zs/WJra
+    YxiuleclKYExxLt9tRd0058n58GAph+Ki7mRInO6kxuKpsQannSn1Ku/DiaQcSF2L2TMSo0N9zwv
+    YEZR+hgsKVqyRKT+DkZhusHJHYGv96YHSTwo016ZhwYS9t0MLXY9/PgJysuO41Ya4Ii43D3UK1wO
+    HTmyHZHTCRDdyuBE95bssDpwD/4jV9Pin3vAKa4hhn5GD4e478FNKRD58Q7qF3AhVTBNPIl1m4EF
+    X7sqI6cXUDG4BjpS70ZRWF2x51ZTiq7DLTV/gGw2okfVjoWjzQY0ebrLd4IoNs80lIHmXxa+JdwB
+    6WupCUzKCKLcPsX/yPAmswPNGAuIMAv+PWhUUSMVtzOZldnlogGMhbJ9UD2txFGGh9WoYc2vgX9K
+    AaKryXcC6QMabv7JJU24HEJJDgbJEvtFM5PS8QMFbXIZsYgICWpQXVChBbduXo9sD2TUDWYAniNa
+    aw4LKxPRG+Ix4HAqkh1oNOLojO30DO3r1/62FKE5/ykg3iSMTDR0iOES/leXCCIO9fRJT8+eucxy
+    OQoY5ti7tjt1wm3HnTB+Rz3E/E2qeLs2PN82aseccm1G06pmsMCUiWtmSV6HjdO2XufYprrGLSu0
+    RrT3sz5WHGUOY2iO40xHhSiXg3TcLZRpv30DQzxoUrx9Ff//rXLFznh+MksuvVD2roURBGz/en31
+    FxAcBoex9nNraeOekbFen5b7Xrq9wnzM5xZvJN2QYB3vS0khz/ZgFyy5444ALa9gwb29FZCfA4m5
+    9S2QoB8uPQGM+8gnusE6J8y4fvI59ugafidIkt86dZ3mFsEME5XNmBGdNEo2flRVFfpG1IWds2Ba
+    3IsdbYd9nzmbBW7/n0InVRDrIg==
+  CUDA_KEYRING: |
+    mQINBGJYmlEBEAC6nJmeqByeReM+MSy4palACCnfOg4pOxffrrkldxz4jrDOZNK4q8KG+ZbXrkdP
+    0e9qTFRvZzN+A6Jw3ySfoiKXRBw5l2Zp81AYkghV641OpWNjZOyLsyKEtST9LR1ttHv1ZI71pj8N
+    VG/EnpimZPOblEJ1OpibJJCXLrbn+qcJ8JNuGTSK6v2aLBmhR8VR/aSJpmkg7fFjcGklweTI8+Ib
+    j72HuY9JRD/+dtUoSh7z037mWo56ee02lPFRD0pHOEAlLSXxFO/SDqRVMhcgHk0a8roCF+9h5Ni7
+    ZUyxlGK/uHkqN7ED/U/ATpGKgvk4t23eTpdRC8FXAlBZQyf/xnhQXsyF/z7+RV5CL0o1zk1LKgo+
+    5K325ka5uZb6JSIrEPUaCPEMXu6EEY8zSFnCrRS/Vjkfvc9ViYZWzJ387WTjAhMdS7wdPmdDWw2A
+    SGUP4FrfCireSZiFX+ZAOspKpZdh0P5iR5XSx14XDt3jNK2EQQboaJADuqksItatOEYNu4JsCbc2
+    4roJvJtGhpjTnq1/dyoy6K433afU0DS2ZPLthLpGqeyKMKNY7a2WjxhRmCSu5Zok/fGKcO62XF8a
+    3eSj4NzCRv8LM6mG1Oekz6Zz+tdxHg19ufHO0et7AKE5q+5VjE438Xpl4UWbM/Voj6VPJ9uzywDc
+    nZXpeOqeTQh2pQARAQABtCBjdWRhdG9vbHMgPGN1ZGF0b29sc0BudmlkaWEuY29tPokCOQQTAQIA
+    IwUCYliaUQIbAwcLCQgHAwIBBhUIAgkKCwQWAgMBAh4BAheAAAoJEKS0aZY7+GPM1y4QALKhBqSo
+    zrYbe341Qu7SyxHQgjRCGi4YhI3bHCMj5F6vEOHnwiFH6YmFkxCYtqcGjca6iw7cCYMow/hgKLAP
+    wkwSJ84EYpGLWx62+20rMM4OuZwauSUcY/kE2WgnQ74zbh3+MHs56zntJFfJ9G+NYidvwDWeZn5H
+    IzR4CtxaxRgpiykg0s3ps6X0U+vuVcLnutBF7r81astvlVQERFbce/6KqHK+yj843Qrhb3JEolUo
+    OETK06nD25bVtnAxe0QEyA909MpRNLfR6BdjPpxqhphDcMOhJfyubAroQUxG/7S+Yw+mtEqHrL/d
+    z9iEYqodYiSozfi0b+HFI59sRkTfOBDBwb3kcARExwnvLJmqijiVqWkoJ3H67oA0XJN2nelucw+A
+    Hb+Jt9BWjyzKWlLFDnVHdGicyRJ0I8yqi32w8hGeXmu3tU58VWJrkXEXadBftmcipemb6oZ/r5SC
+    kW6kxr2PsNWcJoebUdynyOQGbVwpMtJAnjOYp0ObKOANbcIg+tsikyCIO5TiY3ADbBDPCeZK8xdc
+    ugXoW5WFwACGC0z+Cn0mtw8z3VGIPAMSCYmLusgWt2+EpikwrP2inNp5Pc+YdczRAsa4s30Jpyv/
+    UHEG5P9GKnvofaxJgnU56lJIRPzFiCUGy6cVI0Fq777X/ME1K6A/bzZ4vRYNx8rUmVE5
diff --git a/node-init/nvidia-init.yaml b/node-init/nvidia-init.yaml
new file mode 100644
index 0000000..7fabc46
--- /dev/null
+++ b/node-init/nvidia-init.yaml
@@ -0,0 +1,86 @@
+---
+# https://kubernetes.io/docs/concepts/workloads/controllers/daemonset/
+#
+# Prepares GPU nodes: the init container enters the mount namespace of PID 1
+# (the host's, because hostPID is true) and installs the NVIDIA packages with
+# apt-get; the pause container then keeps the pod running.
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: nvidia-init
+  namespace: kube-system
+  labels:
+    root-init.k8s.exa.fi/component-name: nvidia-init
+spec:
+  selector:
+    matchLabels:
+      root-init.k8s.exa.fi/component-name: nvidia-init
+  template:
+    metadata:
+      labels:
+        root-init.k8s.exa.fi/component-name: nvidia-init
+    spec:
+      tolerations:
+        # these tolerations are to have the daemonset runnable on control-plane
+        # nodes; remove them if yours can't run pods. "master" is the legacy
+        # name of the same taint, kept so older clusters still match
+        - key: node-role.kubernetes.io/control-plane
+          operator: Exists
+          effect: NoSchedule
+        - key: node-role.kubernetes.io/master
+          operator: Exists
+          effect: NoSchedule
+      initContainers:
+        # this is implemented dorkily like this to cause it to be recreated
+        # whenever there's a change. this is intended to eventually be moved
+        # into a helm chart to use in argocd where this will make more sense
+        - name: install-packages
+          image: alpine:3.7
+          command: ["nsenter", "--mount=/proc/1/ns/mnt", "--", "sh", "-c"]
+          args:
+            - |-
+              # fail the init container on the first error or unset variable
+              # instead of reporting a half-finished install as success
+              set -eu
+              # PCI vendor ids that count as an nvidia device, space separated
+              PCI_VENDORS="10de"
+              VEN_FOUND=no
+              for VEN in $PCI_VENDORS;do
+                VEN=$(echo "$VEN" | tr A-Z a-z)
+                # the vendor id must open the second field; requiring the
+                # separator keeps a slot id such as "10de" from matching
+                if grep -Eq '^[0-9a-f]+[[:space:]]+'"$VEN" /proc/bus/pci/devices;then
+                  VEN_FOUND=yes
+                fi
+              done
+              if [ "$VEN_FOUND" = no ];then
+                1>&2 echo "no nvidia devices seen. skipping package init"
+                exit 0
+              fi
+              CUDA_KEYRING_PATH=/usr/share/keyrings/cuda-archive-keyring.gpg
+              CUDA_LIST_PATH=/etc/apt/sources.list.d/cuda.list
+              echo "$CUDA_KEYRING" | base64 -d > "$CUDA_KEYRING_PATH"
+              echo "deb [signed-by=$CUDA_KEYRING_PATH] $CUDA_REPO /" > "$CUDA_LIST_PATH"
+
+              NCT_KEYRING_PATH=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
+              NCT_LIST_PATH=/etc/apt/sources.list.d/nvidia-container-toolkit.list
+              echo "$NCT_KEYRING" | base64 -d > "$NCT_KEYRING_PATH"
+              echo "deb [signed-by=$NCT_KEYRING_PATH] $NCT_REPO /" > "$NCT_LIST_PATH"
+
+              # no terminal is attached, so keep debconf from prompting
+              export DEBIAN_FRONTEND=noninteractive
+              apt-get update
+              apt-get install -y nvidia-kernel-dkms nvidia-container-toolkit cuda
+          securityContext:
+            privileged: true
+          envFrom:
+            - configMapRef:
+                name: nvidia-init-artifacts
+      containers:
+        - name: finished-sleep-forever
+          image: registry.k8s.io/pause:3.1
+          securityContext:
+            privileged: false
+      terminationGracePeriodSeconds: 0
+      hostNetwork: true
+      hostPID: true