# File-viewer banner captured during extraction: 70 lines, 2.5 KiB, YAML
---
# https://kubernetes.io/docs/concepts/workloads/controllers/daemonset/
#
# nvidia-init: DaemonSet whose init container enters the host's mount
# namespace and, when an NVIDIA PCI device is present, registers the CUDA and
# nvidia-container-toolkit apt repositories and installs the driver/toolkit
# packages on the node. A pause container then keeps the pod alive.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-init
  namespace: kube-system
  labels:
    root-init.k8s.exa.fi/component-name: nvidia-init
spec:
  selector:
    matchLabels:
      root-init.k8s.exa.fi/component-name: nvidia-init
  template:
    metadata:
      labels:
        root-init.k8s.exa.fi/component-name: nvidia-init
    spec:
      tolerations:
        # these tolerations are to have the daemonset runnable on master /
        # control-plane nodes; remove them if your masters can't run pods
        - key: node-role.kubernetes.io/master
          operator: Exists
          effect: NoSchedule
        # newer clusters taint with control-plane instead of master
        - key: node-role.kubernetes.io/control-plane
          operator: Exists
          effect: NoSchedule
      initContainers:
        # this is implemented dorkily like this to cause it to be recreated
        # whenever there's a change. this is intended to eventually be moved
        # into a helm chart to use in argocd where this will make more sense
        - name: install-packages
          image: alpine:3.7
          # nsenter joins the mount namespace of PID 1; together with
          # hostPID below, the script therefore runs against the node's
          # filesystem and uses the node's own sh/apt-get.
          command: ["nsenter", "--mount=/proc/1/ns/mnt", "--", "sh", "-c"]
          args:
            - |-
              # Abort on the first failing command or unset variable so a
              # broken keyring/repo never reaches apt.
              set -eu

              # Space-separated PCI vendor IDs to look for (10de = NVIDIA).
              PCI_VENDORS="10de"
              VEN_FOUND=no
              for VEN in $PCI_VENDORS; do
                VEN=$(echo "$VEN" | tr A-Z a-z)
                # Each line is "<bus+devfn><ws><vendor><device>...". Require
                # a non-empty first field and real whitespace so the vendor
                # ID can only match at the start of the second field.
                if grep -Eq '^[0-9a-f]+[[:space:]]+'"$VEN" /proc/bus/pci/devices; then
                  VEN_FOUND=yes
                fi
              done
              if [ "$VEN_FOUND" = no ]; then
                1>&2 echo "no nvidia devices seen. skipping package init"
                exit 0
              fi

              # CUDA_KEYRING / CUDA_REPO / NCT_KEYRING / NCT_REPO are read
              # from the environment (presumably supplied by the
              # nvidia-init-artifacts ConfigMap -- verify its keys).
              CUDA_KEYRING_PATH=/usr/share/keyrings/cuda-archive-keyring.gpg
              CUDA_LIST_PATH=/etc/apt/sources.list.d/cuda.list
              echo "$CUDA_KEYRING" | base64 -d > "$CUDA_KEYRING_PATH"
              echo "deb [signed-by=$CUDA_KEYRING_PATH] $CUDA_REPO /" > "$CUDA_LIST_PATH"

              NCT_KEYRING_PATH=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
              NCT_LIST_PATH=/etc/apt/sources.list.d/nvidia-container-toolkit.list
              echo "$NCT_KEYRING" | base64 -d > "$NCT_KEYRING_PATH"
              echo "deb [signed-by=$NCT_KEYRING_PATH] $NCT_REPO /" > "$NCT_LIST_PATH"

              # No TTY in an init container: never wait on a debconf prompt.
              export DEBIAN_FRONTEND=noninteractive
              apt-get update
              apt-get install -y nvidia-kernel-dkms nvidia-container-toolkit cuda
          securityContext:
            privileged: true
          envFrom:
            - configMapRef:
                name: nvidia-init-artifacts
      containers:
        # does nothing; keeps the pod Running once the init container is done
        # NOTE(review): k8s.gcr.io is a frozen registry -- consider moving to
        # registry.k8s.io/pause after confirming the tag is available there.
        - name: finished-sleep-forever
          image: k8s.gcr.io/pause:3.1
          securityContext:
            privileged: false
      terminationGracePeriodSeconds: 0
      hostNetwork: true
      # required so /proc/1 in the init container refers to the node's PID 1
      hostPID: true