diff --git a/guix/import/cuda.scm b/guix/import/cuda.scm
new file mode 100644
index 0000000..55cf0ed
--- /dev/null
+++ b/guix/import/cuda.scm
@@ -0,0 +1,189 @@
+;;; SPDX-License-Identifier: GPL-3.0-or-later
+;;; Copyright © 2025 Nicolas Graves <ngraves@ngraves.fr>
+
+;;; This file is not part of GNU Guix but requires this naming scheme
+;;; so that the %cuda-updater is properly read when using
+;;; `guix refresh -L$(pwd) cuda-cccl' in nonguix root.
+
+(define-module (guix import cuda)
+  #:use-module (gcrypt hash)
+  #:use-module (gnu packages)
+  #:use-module (guix base16)
+  #:use-module (guix base32)
+  #:use-module (guix http-client)
+  #:use-module (guix import json)
+  #:use-module (guix import utils)
+  #:use-module (guix memoization)
+  #:use-module (guix packages)
+  #:use-module (guix records)
+  #:use-module (guix upstream)
+  #:use-module (guix utils)
+  #:use-module (ice-9 match)
+  #:use-module (ice-9 regex)
+  #:use-module (ice-9 textual-ports)
+  #:use-module (json)
+  #:use-module (nonguix build-system cuda)
+  #:use-module (srfi srfi-1)
+  #:export (%cuda-updater))
+
+(define %cuda-repository-url
+  "https://developer.download.nvidia.com/compute/cuda/redist/")
+
+(define (cuda-system->guix-system system)
+  (match system
+    ("linux-x86_64" "x86_64-linux")
+    ("linux-aarch64" "aarch64-linux")
+    ("linux-ppc64le" "powerpc64le-linux")
+    (_ #f)))
+
+(define (valid-version? version-string)
+  (false-if-exception (version-prefix version-string 3)))
+
+(define-record-type* <cuda-package>
+  cuda-package make-cuda-package
+  cuda-package? this-cuda-package
+  (name cuda-package-name)
+  (version cuda-package-version valid-version?)
+  (hash-info cuda-package-hash-info cuda-hash-info?))
+
+(define-record-type* <cuda-hash-info>
+  cuda-hash-info make-cuda-hash-info
+  cuda-hash-info? this-cuda-hash-info
+  (system cuda-hash-info-system)
+  (sha256 cuda-hash-info-sha256))
+
+(define (cuda-toolkit-latest-version)
+  (let* ((url (string-append %cuda-repository-url "index.html"))
+         (port (http-fetch url #:text? #t))  ; FIXME no error management
+         (html (get-string-all port))
+         (regex "redistrib_[0-9.]*.json")
+         (file-string
+          (fold-matches regex html ""
+                        (lambda (matching void)
+                          (match:substring matching))))
+         (version-string
+          (string-drop-right
+           (string-drop file-string (string-length "redistrib_"))
+           (string-length ".json"))))
+    (close-port port)
+    version-string))
+
+(define (cuda-json-pkg-alist->cuda-package cuda-pkg-alist)
+  (make-cuda-package
+   (snake-case (first cuda-pkg-alist))
+   (assoc-ref cuda-pkg-alist "version")
+   (filter
+    identity
+    (map (lambda (system)
+           (let ((inner-alist (assoc-ref cuda-pkg-alist system)))
+             (if inner-alist
+                 (make-cuda-hash-info
+                  (cuda-system->guix-system system)
+                  (bytevector->nix-base32-string
+                   (base16-string->bytevector
+                    (assoc-ref inner-alist "sha256"))))
+                 #f)))
+         (list "linux-x86_64" "linux-aarch64" "linux-ppc64le")))))
+
+(define cuda-db-fetch
+  (memoize
+   (lambda (toolkit-version)
+     (map
+      cuda-json-pkg-alist->cuda-package
+      (filter list?
+              (json-fetch
+               (string-append %cuda-repository-url
+                              "redistrib_" toolkit-version ".json")))))))
+
+(define (cuda-fetch name tk-version)
+  (let ((candidates (filter
+                     (lambda (pkg) (equal? (cuda-package-name pkg) name))
+                     (cuda-db-fetch tk-version))))
+    (and (not (null? candidates)) (car candidates))))
+
+(define* (latest-release package #:key (version #f))
+  "Return an <upstream-source> for the latest-release of PACKAGE."
+  (let* ((name (package-name package))
+         (version (or version (cuda-toolkit-latest-version)))
+         (package (cuda-fetch name version))
+         (version (and=> package cuda-package-version)))
+    (and version
+         (upstream-source
+          (package name)
+          (version version)
+          (urls (list (cuda-module-url name version)))))))
+
+(define (make-cuda-sexp cuda-package)
+  `(define-public ,(string->symbol (cuda-package-name cuda-package))
+     (package
+       (name ,(cuda-package-name cuda-package))
+       (version ,(cuda-package-version cuda-package))
+       (source
+        (origin
+          (method url-fetch)
+          (uri (cuda-module-url name version))
+          (sha256
+           (base32
+            (match (or (%current-target-system) (%current-system))
+              ,@(map
+                 (lambda (info)
+                   (list (cuda-hash-info-system info)
+                         (cuda-hash-info-sha256 info)))
+                 (cuda-package-hash-info cuda-package)))))))
+       (build-system cuda-build-system)
+       (synopsis #f)
+       (description #f)
+       (home-page "https://developer.nvidia.com/cuda-toolkit")
+       (license (cuda-license name)))))
+
+(define (guix-name->cuda-name package)
+  (string-join (string-split package  #\-) "_"))
+
+(define (cuda-package? package)
+  "Return true if PACKAGE is a CUDA Toolkit package."
+  (eq? (package-build-system package) cuda-build-system))
+
+(define %cuda-updater
+  (upstream-updater
+   (name 'cuda)
+   (description "Updater for Cuda packages")
+   (pred cuda-package?)
+   (import latest-release)))
+
+;; The issue with guix native importer is that it will only update
+;; the x64_86 hash, but we do have different sources based on
+;; (%current-target-system).
+
+;; To update all hashes of a package, use:
+;; (make-cuda-sexp (cuda-fetch "cuda-profiler-api" "12.1.1"))
+
+;; To update all hashes of all packages, use:
+;; (use-modules (ice-9 pretty-print))
+;; (for-each
+;;  (lambda (name)
+;;    (pretty-print (make-cuda-sexp (cuda-fetch name "12.1.1"))))
+;;  '("cuda-cccl"
+;;    "cuda-cudart"
+;;    "cuda-cuobjdump"
+;;    "cuda-cuxxfilt"
+;;    "cuda-cupti"
+;;    "cuda-gdb"
+;;    "cuda-nvcc"
+;;    "cuda-nvml-dev"
+;;    "cuda-nvdisasm"
+;;    "cuda-nvprune"
+;;    "cuda-nvrtc"
+;;    "cuda-nvtx"
+;;    "cuda-opencl"
+;;    "cuda-sanitizer-api"
+;;    "libcublas"
+;;    "libcufft"
+;;    "libcurand"
+;;    "libcusolver"
+;;    "libcusparse"
+;;    ;; "libnvfatbin"
+;;    "libnvjitlink"
+;;    "libnvjpeg"
+;;    "libnpp"))
+
+;; cuda.scm ends here.
diff --git a/nongnu/packages/machine-learning.scm b/nongnu/packages/machine-learning.scm
new file mode 100644
index 0000000..9c411fd
--- /dev/null
+++ b/nongnu/packages/machine-learning.scm
@@ -0,0 +1,232 @@
+;;; SPDX-License-Identifier: GPL-3.0-or-later
+;;; Copyright © 2024 Nicolas Graves <ngraves@ngraves.fr>
+
+(define-module (nongnu packages machine-learning)
+  #:use-module ((guix licenses) #:prefix license:)
+  #:use-module (guix gexp)
+  #:use-module (guix packages)
+  #:use-module (guix utils)
+  #:use-module (guix build-system cmake)
+  #:use-module (guix build-system copy)
+  #:use-module (guix build-system gnu)
+  #:use-module (guix build-system python)
+  #:use-module (guix git-download)
+  #:use-module (gnu packages)
+  #:use-module (gnu packages check)
+  #:use-module (gnu packages cpp)
+  #:use-module (gnu packages libevent)
+  #:use-module (gnu packages machine-learning)
+  #:use-module (gnu packages pkg-config)
+  #:use-module (gnu packages python-xyz)
+  #:use-module (gnu packages serialization)
+  #:use-module (nongnu packages nvidia)
+  #:use-module (ice-9 match))
+
+(define-public gloo-cuda
+  (let ((version "0.0.0")                         ; no proper version tag
+        (commit "e6d509b527712a143996f2f59a10480efa804f8b")
+        (revision "2"))
+    (package
+      (name "gloo-cuda")
+      (version (git-version version revision commit))
+      (source
+       (origin
+         (method git-fetch)
+         (uri (git-reference
+               (url "https://github.com/facebookincubator/gloo")
+               (commit commit)))
+         (file-name (git-file-name name version))
+         (sha256
+          (base32
+           "11ywsn1lrd1cpzm1iwvin2c5l962zib5bd852vl54bp12a0w6klj"))))
+      (build-system cmake-build-system)
+      (native-inputs
+       (list googletest))
+      (inputs
+       (modify-inputs (package-inputs gloo)
+         (append cuda-toolkit nvidia-nccl)))
+      (arguments
+       (substitute-keyword-arguments (package-arguments gloo)
+         ((#:configure-flags flags ''())
+          #~(cons "-DUSE_CUDA=ON" #$flags))))
+      (synopsis "Collective communications library")
+      (description
+       "Gloo is a collective communications library.  It comes with a
+number of collective algorithms useful for machine learning applications.
+These include a barrier, broadcast, and allreduce.
+
+Note: This package provides NVIDIA GPU support.")
+      (home-page "https://github.com/facebookincubator/gloo")
+      (license license:bsd-3))))
+
+(define %python-pytorch-version "2.4.0")
+
+(define %python-pytorch-src
+  (origin
+    (method git-fetch)
+    (uri (git-reference
+          (url "https://github.com/pytorch/pytorch")
+          (commit (string-append "v" %python-pytorch-version))))
+    (file-name (git-file-name "python-pytorch" %python-pytorch-version))
+    (sha256
+     (base32
+      "18hdhzr12brj0b7ppyiscax0dbra30207qx0cckw78midfkcn7cn"))
+    (patches (search-patches "python-pytorch-system-libraries.patch"
+                             "python-pytorch-runpath.patch"
+                             "python-pytorch-without-kineto.patch"
+                             ;; Some autogeneration scripts depend on the
+                             ;; compile PyTorch library. Therefore, we create
+                             ;; dummy versions which are regenerated later.
+                             "python-pytorch-fix-codegen.patch"))
+    (modules '((guix build utils)))
+    (snippet
+     '(begin
+        ;; Bundled or unused code
+        (for-each
+         (lambda (dir)
+           (when (file-exists? dir)
+             (delete-file-recursively dir)))
+         '("android"
+           ;; "aten/src/ATen/native/cuda/cutlass_extensions"
+           "aten/src/ATen/native/quantized/cpu/qnnpack"
+           "caffe2/mobile/contrib/libopencl-stub"
+           "caffe2/mobile/contrib/libvulkan-stub"
+           "third_party"))
+
+        ;; Autogenerated files
+        (for-each
+         delete-file
+         '("aten/src/ATen/nnapi/nnapi_wrapper.cpp"
+           "aten/src/ATen/nnapi/nnapi_wrapper.h"
+           ;; These files contain just lists of floating point values and
+           ;; might be as well hand-written.
+           ;; "test/cpp/api/init_baseline.h"
+           ;; "test/cpp/api/optim_baseline.h"
+           "test/mobile/test_upgrader_bytecode_table_example.cpp"
+           "torch/csrc/jit/mobile/upgrader_mobile.cpp"
+           "torch/csrc/jit/runtime/decomposition_registry_util.cpp"
+           "torch/csrc/jit/runtime/serialized_shape_function_registry.cpp"
+           "torch/csrc/jit/tensorexpr/external_functions_codegen.cpp"
+           "torch/csrc/jit/serialization/mobile_bytecode_generated.h"))
+        (delete-file-recursively ".github")
+        ;; These files are needed for CUDA.
+        ;; (for-each
+        ;;  (lambda (dir)
+        ;;    (for-each
+        ;;     delete-file
+        ;;     (find-files dir "\\.cu$")))
+        ;;  '("aten/src/ATen/native/transformers/cuda/flash_attn/kernels"
+        ;;    "aten/src/ATen/native/transformers/cuda/mem_eff_attention/kernels"))
+        ))))
+
+(define-public python-pytorch-cuda
+  (package
+    (name "python-pytorch-cuda")
+    (version %python-pytorch-version)
+    (source %python-pytorch-src)
+    (build-system python-build-system)
+    (arguments
+     (substitute-keyword-arguments (package-arguments python-pytorch)
+       ((#:phases phases)
+        #~(modify-phases #$phases
+            (add-after 'cmake-patches 'cuda-cmake-patches
+              (lambda _
+                ;; XXX: Currently nvidia-cudnn-frontend doesn't install CMake
+                ;; configuration files, we must add unbundled nlohmann-json.
+                ;; Additionally, it won't work without CUDNN_INCLUDE_DIR.
+                (substitute* "cmake/Dependencies.cmake"
+                  (("set\\(CUDNN_FRONTEND_INCLUDE_DIR.*$")
+                   (format #f "set(CUDNN_FRONTEND_INCLUDE_DIR ~a/include)
+  target_include_directories(torch::cudnn INTERFACE
+      ${CUDNN_INCLUDE_DIR} ${~a/include}
+  )~%"
+                           #$(this-package-input "nvidia-cudnn-frontend")
+                           #$(this-package-input "nlohmann-json"))))
+                ;; XXX: Link the right include dir for cutlass.
+                (substitute* "aten/src/ATen/CMakeLists.txt"
+                  (("\
+\\$\\{CMAKE_CURRENT_SOURCE_DIR\\}/\\.\\./\\.\\./\\.\\./third_party/cutlass")
+                   #$(this-package-input "nvidia-cutlass")))
+                ;; XXX: Not linking gtest+gtest_main breaks compilation
+                (substitute* '("c10/cuda/test/CMakeLists.txt"
+                               "caffe2/CMakeLists.txt")
+                  (("target_link_libraries\\((.* gtest_main)\\)" all content)
+                   (format #f "target_link_libraries(~a gtest)"
+                           content)))))
+            (add-after 'use-system-libraries 'use-cuda-libraries
+              (lambda _
+                (setenv "USE_CUDA" "1")
+                (setenv "CUDA_HOME" #$(this-package-input "cuda-dev"))
+                (setenv "CUDA_TOOLKIT_ROOT_DIR"
+                        #$(this-package-input "cuda-dev"))
+                (setenv "CUDA_USE_STATIC_CUDA_RUNTIME" "0")
+                (setenv "CUDA_PROPAGATE_HOST_FLAGS" "0")
+                (setenv "CUSPARSELT_LIBRARY"
+                        #$(file-append
+                           (this-package-input "cuda-dev") "/lib"))
+                (setenv "CUSPARSELT_INCLUDE_DIR"
+                        #$(file-append
+                           (this-package-input "cuda-dev") "/include"))
+                (setenv "USE_CUDNN" "1")
+                (setenv "CUDNN_LIB_DIR"
+                        #$(file-append
+                           (this-package-input "nvidia-cudnn") "/lib"))
+                (setenv "CUDNN_INCLUDE_DIR"
+                        #$(file-append
+                           (this-package-input "nvidia-cudnn") "/include"))
+                ;; XXX: 3.5, 5.0 and 9.0a break tests compilation
+                ;; See https://github.com/pytorch/pytorch/issues/113948
+                (setenv "TORCH_CUDA_ARCH_LIST" "8.0 8.6 8.9 9.0")
+                ;; XXX: Current cutlass package doesn't have necessary
+                ;; headers to enable this option.
+                (setenv "USE_ROCM" "0")))))))
+    (native-inputs (package-native-inputs python-pytorch))
+    (inputs
+     (modify-inputs (package-inputs python-pytorch)
+       (replace "tensorpipe" tensorpipe-cuda)
+       (replace "gloo" gloo-cuda)
+       (append nvidia-cudnn
+               nvidia-cudnn-frontend
+               cuda-dev
+               nlohmann-json
+               nvidia-cutlass
+               nvidia-nccl)))
+    (propagated-inputs (package-propagated-inputs python-pytorch))
+    (home-page "https://pytorch.org/")
+    (synopsis "Python library for tensor computation and deep neural networks")
+    (description
+     "PyTorch is a Python package that provides two high-level features:
+
+@itemize
+@item tensor computation (like NumPy) with strong GPU acceleration;
+@item deep neural networks (DNNs) built on a tape-based autograd system.
+@end itemize
+
+You can reuse Python packages such as NumPy, SciPy, and Cython to extend
+PyTorch when needed.
+
+Note: This package provides NVIDIA GPU support.")
+    (license license:bsd-3)))
+
+(define-public tensorpipe-cuda
+  (package
+    (name "tensorpipe-cuda")
+    (version (package-version tensorpipe))
+    (source (package-source tensorpipe))
+    (build-system cmake-build-system)
+    (arguments
+     (list
+      #:configure-flags
+      ''("-DBUILD_SHARED_LIBS=ON" "-DTP_USE_CUDA=1")
+        ;; There are no tests
+        #:tests? #f))
+    (inputs (list cuda-nvml-dev cuda-toolkit libuv))
+    (native-inputs (list googletest pkg-config pybind11 libnop))
+    (home-page "https://github.com/pytorch/tensorpipe")
+    (synopsis "Tensor-aware point-to-point communication primitive for
+machine learning")
+    (description "TensorPipe provides a tensor-aware channel to transfer
+rich objects from one process to another while using the fastest transport for
+the tensors contained therein.
+Note: This version includes NVIDIA CUDA API and headers.")
+    (license license:bsd-3)))
diff --git a/nongnu/packages/nvidia.scm b/nongnu/packages/nvidia.scm
index bbdebed..959a274 100644
--- a/nongnu/packages/nvidia.scm
+++ b/nongnu/packages/nvidia.scm
@@ -7,6 +7,7 @@
 ;;; Copyright © 2022, 2023 Petr Hodina <phodina@protonmail.com>
 ;;; Copyright © 2022 Alexey Abramov <levenson@mmer.org>
 ;;; Copyright © 2022, 2023, 2024 Hilton Chain <hako@ultrarare.space>
+;;; Copyright © 2024 Nicolas Graves <ngraves@ngraves.fr>
 
 (define-module (nongnu packages nvidia)
   #:use-module (guix packages)
@@ -14,6 +15,7 @@
   #:use-module (guix download)
   #:use-module (guix gexp)
   #:use-module (guix git-download)
+  #:use-module (guix build utils)
   #:use-module (guix utils)
   #:use-module ((guix licenses) #:prefix license-gnu:)
   #:use-module ((nonguix licenses) #:prefix license:)
@@ -22,24 +24,36 @@
   #:use-module (guix build-system copy)
   #:use-module (guix build-system gnu)
   #:use-module (guix build-system meson)
+  #:use-module (guix build-system pyproject)
   #:use-module (guix build-system python)
   #:use-module (guix build-system trivial)
+  #:use-module (nonguix build-system cuda)
+  #:use-module (gnu packages)
   #:use-module (gnu packages base)
   #:use-module (gnu packages bash)
   #:use-module (gnu packages bootstrap)
   #:use-module (gnu packages check)
+  #:use-module (gnu packages cmake)
   #:use-module (gnu packages compression)
+  #:use-module (gnu packages cpp)
   #:use-module (gnu packages elf)
   #:use-module (gnu packages freedesktop)
   #:use-module (gnu packages gawk)
   #:use-module (gnu packages gcc)
   #:use-module (gnu packages gl)
   #:use-module (gnu packages glib)
+  #:use-module (gnu packages graphviz)
   #:use-module (gnu packages gtk)
   #:use-module (gnu packages linux)
   #:use-module (gnu packages m4)
   #:use-module (gnu packages lsof)
+  #:use-module (gnu packages machine-learning)
+  #:use-module (gnu packages multiprecision)
   #:use-module (gnu packages pkg-config)
+  #:use-module (gnu packages python)
+  #:use-module (gnu packages python-build)
+  #:use-module (gnu packages python-check)
+  #:use-module (gnu packages python-science)
   #:use-module (gnu packages python-xyz)
   #:use-module (gnu packages qt)
   #:use-module (gnu packages tls)
@@ -50,7 +64,8 @@
   #:use-module (gnu packages xml)
   #:use-module (gnu packages xorg)
   #:use-module (nongnu packages linux)
-  #:use-module (ice-9 match))
+  #:use-module (ice-9 match)
+  #:use-module (gcrypt base16))
 
 (define-public %nvidia-environment-variable-regexps
   '("^__GL_"                            ; NVIDIA OpenGL settings.
@@ -856,6 +871,1310 @@ variables @code{__GLX_VENDOR_LIBRARY_NAME=nvidia} and
 (define-public replace-mesa
   (package-input-rewriting `((,mesa . ,mesa/fake))))
 
+
+
+;;;
+;;; CUDA packages
+;;;
+
+
+(define-public cuda-cccl
+  (package
+    (name "cuda-cccl")
+    (version "12.1.109")
+    (source
+     (origin
+       (method url-fetch)
+       (uri (cuda-module-url name version))
+       (sha256
+        (base32
+         (match (or (%current-target-system) (%current-system))
+           ("x86_64-linux"
+            "1ahvk632nh05m3mmjk8mhkxgkmry1ipq89dycw98kd617png6kmq")
+           ("aarch64-linux"
+            "1yc5irxn35ii0qal1qi8v6gq25ws4a7axjnmc5b20g0ypzxdlc2n")
+           ("powerpc64le-linux"
+            "0s6zidp5ajsqh519x3c38ihip4m1hkdzhrsdq04pybk8sfjh7z2l"))))))
+    (build-system cuda-build-system)
+    (arguments
+     (list #:install-plan ''(("include" "include")
+                             ("lib" "lib"))))
+    (synopsis
+     "C++ Core Compute Libraries for the CUDA language")
+    (description
+     "This package provides the CUDA C++ developers with building blocks that
+make it easier to write safe and efficient code.  It unifies three essential former
+CUDA C++ libraries into a single repository:
+@itemize
+@item Thrust (former repo)
+@item CUB (former repo)
+@item libcudacxx (former repo)
+@end itemize")
+    (home-page "https://developer.nvidia.com/cuda-toolkit")
+    (license (cuda-license name))))
+
+(define-public cuda-cudart
+  (package
+    (name "cuda-cudart")
+    (version "12.1.105")
+    (source
+     (origin
+       (method url-fetch)
+       (uri (cuda-module-url name version))
+       (sha256
+        (base32
+         (match (or (%current-target-system) (%current-system))
+           ("x86_64-linux"
+            "1nbbmd3x0dm3qpyr99cdmbw2gwffvvr9qvlwsdc34i4cij3yr5k0")
+           ("aarch64-linux"
+            "1q8mrsvj5w4v81w7fs73jq1z0ilishkfg5pq5ncb85yjg345hwya")
+           ("powerpc64le-linux"
+            "1ffqr6d28rpwzx9swmwj8s6p8llfvwrzpnnjcgjgskqygf5lfl2y"))))))
+    (build-system cuda-build-system)
+    (arguments
+     (list #:install-plan ''(("include" "include")
+                             ("lib" "lib")
+                             ("pkg-config" "share/pkg-config"))
+           #:phases
+           #~(modify-phases %standard-phases
+               (delete 'install-static)
+               (add-after 'install 'add-symlink
+                 (lambda _
+                   (with-directory-excursion
+                       (string-append #$output "/lib/stubs")
+                     (symlink "libcuda.so" "libcuda.so.1")))))))
+    (inputs (list cuda-nvrtc `(,gcc "lib") glibc))
+    (synopsis "CUDA runtime")
+    (description
+     "This package provides the CUDA run-time support libraries for NVIDIA
+GPUs, all of which are proprietary.")
+    (home-page "https://developer.nvidia.com/cuda-toolkit")
+    (license (cuda-license name))))
+
+(define-public cuda-cuobjdump
+  (package
+    (name "cuda-cuobjdump")
+    (version "12.1.111")
+    (source
+     (origin
+       (method url-fetch)
+       (uri (cuda-module-url name version))
+       (sha256
+        (base32
+         (match (or (%current-target-system) (%current-system))
+           ("x86_64-linux"
+            "0lnsmz06kim978lcfbyl1n58883wq76wjri7kazrdr1bmj6vb60h")
+           ("aarch64-linux"
+            "0dqis4m2wlplp5hzjn92q65vs8gshn4nc7200gyvdr7midqcw0xz")
+           ("powerpc64le-linux"
+            "118ipzj28i4668jpr3svnzw5r3hgmwvg618s6y3axfn5picv4f4q"))))))
+    (build-system cuda-build-system)
+    (arguments
+     (list #:install-plan ''(("bin" "bin"))))
+    (synopsis "Extract information from CUDA binary files")
+    (description
+     "This binary extracts information from CUDA binary files (both standalone
+and those embedded in host binaries) and presents them in human readable
+format.  The output of @code{cuobjdump} includes CUDA assembly code for each
+kernel, CUDA ELF section headers, string tables, relocators and other CUDA
+specific sections.  It also extracts embedded ptx text from host binaries.")
+    (home-page "https://docs.nvidia.com/cuda/\
+cuda-binary-utilities/index.html#cuobjdump")
+    (license (cuda-license name))))
+
+(define-public cuda-cuxxfilt
+  (package
+    (name "cuda-cuxxfilt")
+    (version "12.1.105")
+    (source
+     (origin
+       (method url-fetch)
+       (uri (cuda-module-url name version))
+       (sha256
+        (base32
+         (match (or (%current-target-system) (%current-system))
+           ("x86_64-linux"
+            "0va13gfay4as0fnc23n0gqhnylyhykp5cmmxjhlminfi735zki0x")
+           ("aarch64-linux"
+            "15jbqssx0nzi8l411m41393jpzc8kbd2qa0jri22cp5c4cnls9bz")
+           ("powerpc64le-linux"
+            "0m3nmsl59r2apd1dpm3a8ch788kq2krrl1x50agqk3z2wl8zhy1p"))))))
+    (build-system cuda-build-system)
+    (arguments
+     (list #:install-plan ''(("bin" "bin")
+                             ("include" "include")
+                             ("lib" "lib"))))
+    (synopsis "Decodes low-level CUDA C++ identifiers into readable names")
+    (description
+     "This package decodes (demangles) low-level identifiers that have been
+mangled by CUDA C++ into user readable names.  For every input alphanumeric
+word, the output of cu++filt is either the demangled name if the name decodes
+to a CUDA C++ name, or the original name itself.")
+    (home-page "https://docs.nvidia.com/cuda/\
+cuda-binary-utilities/index.html#cu-filt")
+    (license (cuda-license name))))
+
+(define-public cuda-cupti
+  (package
+    (name "cuda-cupti")
+    (version "12.1.105")
+    (source
+     (origin
+       (method url-fetch)
+       (uri (cuda-module-url name version))
+       (sha256
+        (base32
+         (match (or (%current-target-system) (%current-system))
+           ("x86_64-linux"
+            "0qy3pvqkvr16xp2l0jb202xxvgq1pxdwkqfrpm4ag6k102i98x9r")
+           ("aarch64-linux"
+            "14j7kb6izvvgmla92lxyhlw482v7hxqsfpcl4gvpg6nspa0p6vbs")
+           ("powerpc64le-linux"
+            "0rfkvvv0i8450bpmanbq72cg98grpskxdrwswj7zch9gwkh4qyhr"))))))
+    (build-system cuda-build-system)
+    (arguments
+     (list #:install-plan ''(("include" "include")
+                             ("doc" "share/doc")
+                             ("lib" "lib")
+                             ("samples" "share/samples"))))
+    (inputs (list `(,gcc "lib") glibc))
+    (outputs (list "out" "static"))
+    (synopsis "CUDA Profiling Tools Interface")
+    (description
+     "This package enables the creation of profiling and tracing tools that
+target CUDA applications and give insight into the CPU and GPU behavior of
+CUDA applications.  It provides the following APIs:
+@itemize
+@item the Activity API,
+@item the Callback API,
+@item the Event API,
+@item the Metric API,
+@item the Profiling API,
+@item the PC Sampling API,
+@item the Checkpoint API.
+@end itemize")
+    (home-page "https://docs.nvidia.com/cuda/cupti/index.html")
+    (license (cuda-license name))))
+
+(define-public cuda-gdb
+  (package
+    (name "cuda-gdb")
+    (version "12.1.105")
+    (source
+     (origin
+       (method url-fetch)
+       (uri (cuda-module-url name version))
+       (sha256
+        (base32
+         (match (or (%current-target-system) (%current-system))
+           ("x86_64-linux"
+            "0205f2ix06ry404l0ymrwx23k3nsnvhm1clg52hsnxmzqplfmgn4")
+           ("aarch64-linux"
+            "1v8cprz20yqjy8g1s9rbrvly1dr5icfam7c8rzqvzs25l8dcynjw")
+           ("powerpc64le-linux"
+            "1l2gl6pcvmdqcvd45513in915ij9cf9ljii5vfgh1y13apnk8ykz"))))))
+    (build-system cuda-build-system)
+    (arguments
+     (list #:install-plan
+           ``(("bin" "bin")
+              ("extras/Debugger/include" "include")
+              ("extras/Debugger/lib64" "lib")
+              ("share/gdb/python"
+               ,,(string-append "lib/python"
+                                (version-major+minor (package-version python))
+                                "/site-packages/gdb")))
+           #:strip-binaries? #f  ; FIXME breaks 'validate-runpath
+           #:patchelf-inputs ''("gcc" "glibc" "gmp")))
+    (inputs (list `(,gcc "lib") glibc gmp))
+    (synopsis "Tool for debugging CUDA applications")
+    (description
+     "This package provides the NVIDIA tool for debugging CUDA applications
+running.  CUDA-GDB is an extension to GDB, the GNU Project debugger.  The tool
+provides developers with a mechanism for debugging CUDA applications running
+on actual hardware.  This enables developers to debug applications without the
+potential variations introduced by simulation and emulation environments.")
+    (home-page "https://docs.nvidia.com/cuda/cuda-gdb/index.html")
+    (license (cuda-license name))))
+
+;; This package must be defined before cuda-nvcc for inheritance.
+(define-public libnvvm
+  (package
+    (name "libnvvm")
+    (version "12.1.105")
+    (source
+     (origin
+       (method url-fetch)
+       (uri (cuda-module-url "cuda-nvcc" version))
+       (sha256
+        (base32
+         (match (or (%current-target-system) (%current-system))
+           ("x86_64-linux"
+            "0fq8w5jq2drckjwn2i30m7arybnffhy4j2qb2yysp23pw7pgg18b")
+           ("aarch64-linux"
+            "0di51rdd08fwg6as1fqixkw7g052qv3sx9f9y06dkdbq0i563y0n")
+           ("powerpc64le-linux"
+            "1830cvqpmjsv83wk1lfjpjlc8j3wdpaiyvvc03crqh241v4c9qp6"))))))
+    (build-system cuda-build-system)
+    (arguments
+     (list
+      #:strip-binaries? #f  ; XXX: breaks 'validate-runpath phase
+      #:install-plan ''(("nvvm/bin" "/bin")
+                        ("nvvm/include" "/include")
+                        ("nvvm/lib64" "/lib")
+                        ;; nvvm prefix is necessary for cmake
+                        ("nvvm/libdevice" "nvvm/libdevice"))))
+    (inputs (list cuda-cudart `(,gcc-12 "lib") glibc))
+    (synopsis
+     "Generate CUDA PTX code from binary or text inputs")
+    (description
+     "This package provides an interface for generating PTX code from both
+binary and text NVVM IR inputs.")
+    (home-page "https://docs.nvidia.com/cuda/libnvvm-api/index.html")
+    (license (cuda-license name))))
+
+(define-public cuda-nvcc
+  (package
+    (inherit libnvvm)
+    (name "cuda-nvcc")
+    (arguments
+     (list
+      #:strip-binaries? #f  ; XXX: breaks 'validate-runpath phase
+      #:patchelf-inputs ''("gcc" "glibc" "libnvvm")
+      #:install-plan ''(("bin" "bin")
+                        ("include" "include")
+                        ("lib" "lib"))
+      #:phases
+      #~(modify-phases %standard-phases
+          (add-after 'unpack 'patch-nvcc.profile
+            (lambda _
+              (define (append-to-file name body)
+                (let ((file (open-file name "a")))
+                  (display body file)
+                  (close-port file)))
+
+              (substitute* "bin/nvcc.profile"
+                (("\\$\\(TOP\\)/\\$\\(_NVVM_BRANCH_\\)")
+                 #$(this-package-input "libnvvm"))
+                (("\\$\\(TOP\\)/lib")
+                 (string-append #$output "/lib"))
+                (("\\$\\(TOP\\)/nvvm")
+                 (string-append #$output "/nvvm"))
+                (("\\$\\(TOP\\)/\\$\\(_TARGET_DIR_\\)/include")
+                 (string-append #$output "/include")))
+              (append-to-file
+               "bin/nvcc.profile"
+               (string-join
+                (list
+                 (string-append "PATH += " #$(this-package-input "gcc") "/bin")
+                 (string-append
+                  "LIBRARIES =+ -L"
+                  #$(this-package-input "cuda-cudart") "/lib -L"
+                  #$(this-package-input "cuda-cudart") "/lib/stubs -L"
+                  #$(this-package-input "libnvvm") "/lib")
+                 (string-append
+                  "INCLUDES =+ -I"
+                  #$(this-package-input "cuda-cudart") "/include -I"
+                  #$(this-package-input "libnvvm") "/include\n"))
+                "\n")))))))
+    (inputs (list cuda-cudart `(,gcc "lib") glibc libnvvm))
+    (synopsis
+     "Compiler for the CUDA language and associated run-time support")
+    (description
+     "This package provides the CUDA compiler and the CUDA run-time support
+libraries for NVIDIA GPUs, all of which are proprietary.")
+    (home-page "https://docs.nvidia.com/cuda/\
+cuda-compiler-driver-nvcc/index.html")
+    (license (cuda-license name))))
+
+(define-public cuda-nvml-dev
+  (package
+    (name "cuda-nvml-dev")
+    (version "12.1.105")
+    (source
+     (origin
+       (method url-fetch)
+       (uri (cuda-module-url name version))
+       (sha256
+        (base32
+         (match (or (%current-target-system) (%current-system))
+           ("x86_64-linux"
+            "0zyp4c4zf4kjjdw0dzjncclyamazlg5z4lncl7y0g8bq3idpgbi0")
+           ("aarch64-linux"
+            "0wal0bjvhd9wr4cnvr4s9m330awj2mqqvpq0rh6wzaykas40zmcx")
+           ("powerpc64le-linux"
+            "1zjh6mmp5nl3s5wm5jwfzh9bazzhl2vr76c9cdfrjjryyd2pkr92"))))))
+    (build-system cuda-build-system)
+    (arguments
+     (list #:install-plan ''(("include" "include")
+                             ("lib" "lib")
+                             ("nvml/example" "share/example")
+                             ("pkg-config" "share/pkg-config"))))
+    (inputs (list `(,gcc "lib") glibc))
+    (outputs (list "out" "static"))
+    (synopsis "NVIDIA Management Library Headers")
+    (description
+     "The NVIDIA Management Library Headers (NVML) is a C-based API for
+monitoring and managing various states of the NVIDIA GPU devices. It provides
+a direct access to the queries and commands exposed via @code{nvidia-smi}.")
+    (home-page "https://developer.nvidia.com/management-library-nvml")
+    (license (cuda-license name))))
+
+(define-public cuda-nvdisasm
+  (package
+   (name "cuda-nvdisasm")
+   (version "12.1.105")
+   (source
+    (origin
+     (method url-fetch)
+     (uri (cuda-module-url name version))
+     (sha256
+      (base32
+       (match (or (%current-target-system) (%current-system))
+              ("x86_64-linux"
+               "1sd9wqf5y4xvz70yh58mdxxddwnkyfjfaj6nrykpvqrry79vyz7l")
+              ("aarch64-linux"
+               "0pnk1x1c7msz93r5kgkb218akf02ymjar2dz8s3sx08hicaslff2")
+              ("powerpc64le-linux"
+               "04xjcjj055ffs58gkf86jzryyzxia8c995g8xpj5nf2zhaw030hw"))))))
+   (build-system cuda-build-system)
+   (arguments
+    (list #:install-plan ''(("bin" "bin"))))
+   (synopsis "Extract information from CUDA cubin files")
+   (description "This binary extracts information from standalone cubin files
+and presents them in human readable format.  The output of @code{nvdisasm}
+includes CUDA assembly code for each kernel, listing of ELF data sections and
+other CUDA specific sections.  Output style and options are controlled through
+nvdisasm command-line options.  @code{nvdisasm} also does control flow
+analysis to annotate jump/branch targets and makes the output easier to
+read.")
+   (home-page "https://docs.nvidia.com/cuda/\
+cuda-binary-utilities/index.html#nvdisasm")
+   (license (cuda-license name))))
+
+(define-public cuda-nvprof
+  (package
+   (name "cuda-nvprof")
+   (version "12.1.105")
+   (source
+    (origin
+     (method url-fetch)
+     (uri (cuda-module-url name version))
+     (sha256
+      (base32
+       (match (or (%current-target-system) (%current-system))
+              ("x86_64-linux"
+               "18z522w0rnrqbqymigsd88rscz29z9fg3bf5w6ri4yjr8a1ycdg9")
+              ("powerpc64le-linux"
+               "1sd9wbb2zdc29jx7m3m5qs29s67ww71g659228y2045nr340qjc4"))))))
+   (build-system cuda-build-system)
+   (arguments
+    (list
+     #:strip-binaries? #f  ; XXX: breaks 'validate-runpath phase
+     #:install-plan ''(("bin" "bin")
+                       ("lib" "lib")
+                       ("pkg-config" "share/pkg-config"))
+     #:patchelf-inputs
+     ''(("cuda-cudart" "/lib/stubs") "cuda-cupti" "gcc" "glibc")))
+   (inputs (list cuda-cudart cuda-cupti `(,gcc "lib") glibc))
+   (synopsis "Command-line NVIDIA GPU profiler")
+   (description
+    "This package provides a command-line tool to profile CUDA kernels.  It
+enables the collection of a timeline of CUDA-related activities on both CPU
+and GPU, including kernel execution, memory transfers, memory set and CUDA API
+calls and events or metrics for CUDA kernels.")
+   (home-page "https://developer.nvidia.com/cuda-toolkit")
+   (license (cuda-license name))))
+
+(define-public cuda-nvprune
+  (package
+   (name "cuda-nvprune")
+   (version "12.1.105")
+   (source
+    (origin
+     (method url-fetch)
+     (uri (cuda-module-url name version))
+     (sha256
+      (base32
+       (match (or (%current-target-system) (%current-system))
+              ("x86_64-linux"
+               "0qrisahad4n2g8n40i0gpq986ni8qjg53fd23vycmmmkggvb3wxa")
+              ("aarch64-linux"
+               "1hdih73ph80iwmjmz7dywz995626x64jkqfaybw7a908nxkjalpy")
+              ("powerpc64le-linux"
+               "0n92fcp5qms6dvg5hq1wl29wmh32wjfkykccjpqd8c40qrmd9ngh"))))))
+   (build-system cuda-build-system)
+   (arguments
+    (list #:install-plan ''(("bin" "bin"))))
+   (synopsis "Prune host NVIDIA binaries for the specified target")
+   (description
+    "This package provides a binary that prunes host object files and
+libraries to only contain device code for the specified targets.")
+   (home-page "https://docs.nvidia.com/cuda/\
+cuda-binary-utilities/index.html#nvprune")
+   (license (cuda-license name))))
+
+(define-public cuda-nvrtc
+  (package
+    (name "cuda-nvrtc")
+    (version "12.1.105")
+    (source
+     (origin
+       (method url-fetch)
+       (uri (cuda-module-url name version))
+       (sha256
+        (base32
+         (match (or (%current-target-system) (%current-system))
+           ("x86_64-linux"
+            "0yriv3gcb4kpvpav3ilv8zyhravmz0blb0gv1c7pfq37r9m705dv")
+           ("aarch64-linux"
+            "0amp7qg64i6rfkqnjinizh9vhpajvqdpyan4jda9vqr7ckrdfq31")
+           ("powerpc64le-linux"
+            "10dwwhk2pfz6dcqpgjp2dryg5qb08ghnbxvbk4mfhvsajj9ik4wv"))))))
+    (build-system cuda-build-system)
+    (arguments
+     (list #:install-plan ''(("include" "include")
+                             ("lib" "lib")
+                             ("pkg-config" "share/pkg-config"))))
+    (inputs (list `(,gcc "lib") glibc))
+    (outputs (list "out" "static"))
+    (synopsis "Runtime compilation library for CUDA C++")
+    (description
+     "This package accepts CUDA C++ source code in character string form and
+creates handles that can be used to obtain the CUDA PTX, for further
+instrumentation with the CUDA Toolkit.  It allows to shrink compilation
+overhead and simplify application deployment.")
+    (home-page "https://docs.nvidia.com/cuda/nvrtc/index.html")
+    (license (cuda-license name))))
+
+(define-public cuda-nvtx
+  (package
+    (name "cuda-nvtx")
+    (version "12.1.105")
+    (source
+     (origin
+       (method url-fetch)
+       (uri (cuda-module-url name version))
+       (sha256
+        (base32
+         (match (or (%current-target-system) (%current-system))
+           ("x86_64-linux"
+            "1hpibjs9hpc1qhbxihgcpsf298cjwxh7qqsk0shhrwbv4hncg8lc")
+           ("aarch64-linux"
+            "1j841pl7n2waal2nclz076yxmzsibxssy8gnkb14yyc8sj657ajp")
+           ("powerpc64le-linux"
+            "1p0ml8p8dpzwp2kkgvv0yr4f61if33srpzbj1mjpzc70a0l55a31"))))))
+    (build-system cuda-build-system)
+    (arguments
+     (list #:install-plan ''(("include" "include")
+                             ("lib" "lib")
+                             ("pkg-config" "share/pkg-config"))))
+    (inputs (list `(,gcc "lib") glibc))
+    (synopsis "NVIDIA Tools Extension Library")
+    (description
+     "This package provides a cross-platform API for annotating source code to
+provide contextual information to developer tools.")
+    (home-page "https://docs.nvidia.com/nvtx/index.html")
+    (license (cuda-license name))))
+
+(define-public cuda-opencl
+  (package
+    (name "cuda-opencl")
+    (version "12.1.105")
+    (source
+     (origin
+       (method url-fetch)
+       (uri (cuda-module-url name version))
+       (sha256
+        (base32
+         (match (or (%current-target-system) (%current-system))
+           ("x86_64-linux"
+            "1k4ab28kg5plr0nn83amr6j7cqg54vpis00am9dpiy4kgj2izgcx"))))))
+    (build-system cuda-build-system)
+    (arguments
+     (list #:install-plan ''(("include" "include")
+                             ("lib" "lib")
+                             ("pkg-config" "share/pkg-config"))))
+    (synopsis "CUDA OpenCL API")
+    (description
+     "OpenCL (Open Computing Language) is a multi-vendor open standard for
+general-purpose parallel programming of heterogeneous systems that include
+CPUs, GPUs and other processors.  This package provides the API to use OpenCL
+on NVIDIA GPUs.")
+    (home-page "https://developer.nvidia.com/cuda-toolkit")
+    (license (cuda-license name))))
+
+(define-public cuda-profiler-api
+  (package
+   (name "cuda-profiler-api")
+   (version "12.1.105")
+   (source
+    (origin
+     (method url-fetch)
+     (uri (cuda-module-url name version))
+     (sha256
+      (base32
+       (match (or (%current-target-system) (%current-system))
+              ("x86_64-linux"
+               "187dngq2p66jz3yd5l6klqgcvjl6fkcjdjjz1dmzj10fxfv6rzrz")
+              ("aarch64-linux"
+               "1zq8qrh13ibm9c2km8lj4fmddc8smgh75ajpwb0l7rfg12dajnpr")
+              ("powerpc64le-linux"
+               "0mhk9cgac2jc4dmqic5ym34cwpz15b0qk824230bhgmwarjwzhiz"))))))
+   (build-system cuda-build-system)
+   (arguments
+     (list #:install-plan ''(("include" "include"))))
+   (synopsis "Low-level CUDA profiling API")
+   (description
+    "This package provides a minimal low-level profiling API for CUDA.")
+   (home-page "https://developer.nvidia.com/cuda-toolkit")
+   (license (cuda-license name))))
+
+(define-public cuda-sanitizer-api
+  (package
+    (name "cuda-sanitizer-api")
+    (version "12.1.105")
+    (source
+     (origin
+       (method url-fetch)
+       (uri (cuda-module-url name version))
+       (sha256
+        (base32
+         (match (or (%current-target-system) (%current-system))
+           ("x86_64-linux"
+            "00m6mw9vw8xgjbm8xzbpgirw8xcrdb13bgwkp4hxayy313d13afz")
+           ("aarch64-linux"
+            "01iv9qawabr2llq7nwcrpc1fb03yp9a311p08bafhbakk272nwwq")
+           ("powerpc64le-linux"
+            "1hp1kd7q5dj8adyv4haaz119qcmmc5gqs3g8zqik5rnmck6qk3p3"))))))
+    (build-system cuda-build-system)
+    (arguments
+     (list #:install-plan
+           ''(("compute-sanitizer" "compute-sanitizer")
+              ("bin" "bin"))))
+    (synopsis "Functional correctness checking suite for CUDA")
+    (description
+     "This package provides a functional correctness checking suite included in
+the CUDA toolkit.  This suite contains multiple tools that can perform
+different type of checks.  The @code{memcheck} tool is capable of precisely
+detecting and attributing out of bounds and misaligned memory access errors in
+CUDA applications, and can also report hardware exceptions encountered by the
+GPU.  The @code{racecheck} tool can report shared memory data access hazards
+that can cause data races.  The @code{initcheck} tool can report cases where
+the GPU performs uninitialized accesses to global memory.  The
+@code{synccheck} tool can report cases where the application is attempting
+invalid usages of synchronization primitives.")
+    (home-page "https://docs.nvidia.com/cuda/compute-sanitizer/index.html")
+    (license (cuda-license name))))
+
+(define-public libcublas
+  (package
+    (name "libcublas")
+    (version "12.1.3.1")
+    (source
+     (origin
+       (method url-fetch)
+       (uri (cuda-module-url name version))
+       (sha256
+        (base32
+         (match (or (%current-target-system) (%current-system))
+           ("x86_64-linux"
+            "1323rg663fvjl73j5ny249ndnii2qbrfc7qccz5k6ky4v1x4s14h")
+           ("aarch64-linux"
+            "1bzzxzppz3ypx6q3gg7w6sfnwnypl974ppmbxh0j2jafvwy5nf9f")
+           ("powerpc64le-linux"
+            "1wgrgkn9mvh9k1d58ka92gbq11ckl8pyhz7za8lsrhjpw6c8iw15"))))))
+    (build-system cuda-build-system)
+    (arguments
+     (list
+      #:install-plan ''(("include" "include")
+                        ("lib" "lib")
+                        ("pkg-config" "share/pkg-config")
+                        ("src" "share/src"))))
+    (inputs (list `(,gcc "lib") glibc))
+    (outputs (list "out" "static"))
+    (synopsis
+     "GPU-accelerated library for accelerating AI and HPC applications")
+    (description
+     "This package provides the NVIDIA cuBLAS library.  It includes several
+API extensions for providing drop-in industry standard BLAS APIs and GEMM APIs
+with support for fusions that are highly optimized for NVIDIA GPUs.  The
+cuBLAS library also contains extensions for batched operations, execution
+across multiple GPUs, and mixed- and low-precision execution with additional
+tuning for the best performance.")
+    (home-page "https://developer.nvidia.com/cublas")
+    (license (cuda-license name))))
+
+(define-public libcufft
+  (package
+    (name "libcufft")
+    (version "11.0.2.54")
+    (source
+     (origin
+       (method url-fetch)
+       (uri (cuda-module-url name version))
+       (sha256
+        (base32
+         (match (or (%current-target-system) (%current-system))
+           ("x86_64-linux"
+            "053vgq3lavrydna1gl7lry0lp78nby6iqh1gvclvq7vx5kac2dki")
+           ("aarch64-linux"
+            "0kmyxk9420vgm0ipr8a6fx1kcw19h8awy21l92lg4h7nzp58ig76")
+           ("powerpc64le-linux"
+            "02kklsdi43fvs2bi9s534rniqh43hqj9aq4i1m01yq6ya1cqqz1c"))))))
+    (build-system cuda-build-system)
+    (arguments
+     (list #:install-plan ''(("include" "include")
+                             ("lib" "lib")
+                             ("pkg-config" "share/pkg-config"))))
+    (inputs (list `(,gcc "lib") glibc))
+    (outputs (list "out" "static"))
+    (synopsis "CUDA Fast Fourier Transform library")
+    (description
+     "This package provides cuFFT, the NVIDIA® CUDA® Fast Fourier Transform
+(FFT) product.  It consists of two separate libraries: cuFFT and cuFFTW.  The
+cuFFT library is designed to provide high performance on NVIDIA GPUs.  The
+cuFFTW library is provided as a porting tool to enable users of FFTW to start
+using NVIDIA GPUs with a minimum amount of effort.
+
+The FFT is a divide-and-conquer algorithm for efficiently computing discrete
+Fourier transforms of complex or real-valued data sets.  It is one of the most
+important and widely used numerical algorithms in computational physics and
+general signal processing.  The cuFFT library provides a simple interface for
+computing FFTs on an NVIDIA GPU, which allows users to quickly leverage the
+floating-point power and parallelism of the GPU in a highly optimized and
+tested FFT library.   The cuFFTW library provides the FFTW3 API to facilitate
+porting of existing FFTW applications.")
+    (home-page "https://docs.nvidia.com/cuda/cufft/index.html")
+    (license (cuda-license name))))
+
+(define-public libcurand
+  (package
+    (name "libcurand")
+    (version "10.3.2.106")
+    (source
+     (origin
+       (method url-fetch)
+       (uri (cuda-module-url name version))
+       (sha256
+        (base32
+         (match (or (%current-target-system) (%current-system))
+           ("x86_64-linux"
+            "1pk4ngmqdhigg2889h3521kzxvvp3m1yxlnvf9hrwh9dmmpj2hcr")
+           ("aarch64-linux"
+            "0lw53j57g1094bzlx43dyq7iwwpljdkg17dnl8lk7n5vyrvjk4j3")
+           ("powerpc64le-linux"
+            "05r8fcam75m9zv853vl0zzp67jy0yacq09q8xx5ymxx7pcj58g7s"))))))
+    (build-system cuda-build-system)
+    (arguments
+     (list #:install-plan ''(("include" "include")
+                             ("lib" "lib")
+                             ("pkg-config" "share/pkg-config"))))
+    (inputs (list `(,gcc "lib") glibc))
+    (outputs (list "out" "static"))
+    (synopsis "CUDA random number generation library")
+    (description
+     "This package provides facilities that focus on the simple and efficient
+generation of high-quality pseudorandom and quasirandom numbers.  A
+pseudorandom sequence of numbers satisfies most of the statistical properties
+of a truly random sequence but is generated by a deterministic algorithm.  A
+quasirandom sequence of -dimensional points is generated by a deterministic
+algorithm designed to fill an -dimensional space evenly.")
+    (home-page "https://docs.nvidia.com/cuda/curand/index.html")
+    (license (cuda-license name))))
+
+(define-public libcusolver
+  (package
+    (name "libcusolver")
+    (version "11.4.5.107")
+    (source
+     (origin
+       (method url-fetch)
+       (uri (cuda-module-url name version))
+       (sha256
+        (base32
+         (match (or (%current-target-system) (%current-system))
+           ("x86_64-linux"
+            "1y34wk7xx9h0kj13rxb504yx5vchkapk1237ya7vs7z70409fsbi")
+           ("aarch64-linux"
+            "0wr8xa4hqay94gc1b9jzig24f7q3s2ykakppxv42pxp86dbjyp0q")
+           ("powerpc64le-linux"
+            "12jkky40g1xpjr1lkz925q93zbc84g559mhv94x70i4dmy6b4rj3"))))))
+    (build-system cuda-build-system)
+    (arguments
+     (list
+      #:install-plan ''(("include" "include")
+                        ("lib" "lib")
+                        ("pkg-config" "share/pkg-config"))
+      #:patchelf-inputs
+      ''("gcc" "glibc" "libcublas" "libcusparse" "libnvjitlink")))
+    (inputs (list `(,gcc "lib") glibc
+                  libcublas libcusparse libnvjitlink))
+    (outputs (list "out" "static"))
+    (synopsis
+     "GPU-accelerated library for decompositions and linear system solutions")
+    (description
+     "This package provides a high-level library based on the cuBLAS and
+cuSPARSE libraries.  It consists of two modules corresponding to two sets of
+API: the cuSolver API on a single GPU; and the cuSolverMG API on a single node
+multiGPU.  Each of these can be used independently or in concert with other
+toolkit libraries. The intent of cuSolver is to provide useful LAPACK-like
+features, such as common matrix factorization and triangular solve routines
+for dense matrices, a sparse least-squares solver and an eigenvalue solver.
+In addition, cuSolver provides a new refactorization library useful for
+solving sequences of matrices with a shared sparsity pattern.")
+    (home-page "https://docs.nvidia.com/cuda/cusolver/index.html")
+    (license (cuda-license name))))
+
+(define-public libcusparse
+  (package
+    (name "libcusparse")
+    (version "12.1.0.106")
+    (source
+     (origin
+       (method url-fetch)
+       (uri (cuda-module-url name version))
+       (sha256
+        (base32
+         (match (or (%current-target-system) (%current-system))
+           ("x86_64-linux"
+            "01rrz1wdsfmpz9wbvir7nwvlpdrqk6i1j987wdbb2lx7d96n07xf")
+           ("aarch64-linux"
+            "1vxmiw9qzg67sr4m9mpzhcy392z8vx2m09yl5h2bhb8kjxrdljik")
+           ("powerpc64le-linux"
+            "13ji6dlipzahlrri5sp00qyrfa3wgp9z5mv3075qksmnjhi7wxkv"))))))
+    (build-system cuda-build-system)
+    (arguments
+     (list
+      #:install-plan ''(("include" "include")
+                        ("lib" "lib")
+                        ("pkg-config" "share/pkg-config")
+                        ("src" "share/src"))
+      #:patchelf-inputs ''("gcc" "glibc" "libnvjitlink")))
+    (inputs (list `(,gcc "lib") glibc libnvjitlink))
+    (outputs (list "out" "static"))
+    (synopsis "CUDA sparse matrix library")
+    (description
+     "This package provides a set of GPU-accelerated basic linear algebra
+subroutines used for handling sparse matrices that perform significantly
+faster than CPU-only alternatives.  Depending on the specific operation, the
+library targets matrices with sparsity ratios in the range between 70%-99.9%.")
+    (home-page "https://docs.nvidia.com/cuda/cusparse/index.html")
+    (license (cuda-license name))))
+
+;; XXX: This library is introduced in a later version of cuda-toolkit.
+;; (define-public libnvfatbin
+;;   (package
+;;    (name "libnvfatbin")
+;;    (version "12.4.127")
+;;    (source
+;;     (origin
+;;      (method url-fetch)
+;;      (uri (cuda-module-url name version))
+;;      (sha256
+;;       (base32
+;;        (match (or (%current-target-system) (%current-system))
+;;               ("x86_64-linux"
+;;                "03mfxy8k07ks3srqmwwbhmr6961w0djsdgy0qdwaxl9favvgay0j")
+;;               ("aarch64-linux"
+;;                "0b6kamwgg424yibcb1f0pqmmd7jgxlnsxd37drj4fh7823glf4i7")
+;;               ("powerpc64le-linux"
+;;                "1jg4z8h2wrldxb1cfzbrw69sjw4h2hxja82jqkxp19aacbdcs7h7"))))))
+;;    (build-system cuda-build-system)
+;;    (outputs (list "out" "static"))
+;;    (synopsis "Combine multiple CUDA objects into one CUDA fatbin")
+;;    (description
+;;     "This package provides a set of APIs which can be used at runtime to
+;; combine multiple CUDA objects into one CUDA fat binary (fatbin).  The APIs
+;; accept inputs in multiple formats, either device cubins, PTX, or LTO-IR.  The
+;; output is a fatbin that can be loaded by @code{cuModuleLoadData} of the CUDA
+;; Driver API.  The functionality in this library is similar to the
+;; @code{fatbinary} offline tool in the CUDA toolkit, with the following
+;; advantages:
+;; @itemize
+;; @item Support for runtime fatbin creation.
+;; @item The clients get fine grain control over the input process.
+;; @item Supports direct input from memory, rather than requiring inputs be
+;; written to files.
+;; @end itemize")
+;;    (home-page "https://docs.nvidia.com/cuda/nvfatbin/index.html")
+;;    (license (cuda-license name))))
+
+(define-public libnvjitlink
+  (package
+    (name "libnvjitlink")
+    (version "12.1.105")
+    (source
+     (origin
+       (method url-fetch)
+       (uri (cuda-module-url name version))
+       (sha256
+        (base32
+         (match (or (%current-target-system) (%current-system))
+           ("x86_64-linux"
+            "1d5ngmf10l37rm7814jlghgfpa0xjyqiis8vqg0y22cmrw365vi1")
+           ("aarch64-linux"
+            "15fbd3ygk41wbsjyzsharncd94pzn0ikwhq5fq5x7lyh9g0frkfz")
+           ("powerpc64le-linux"
+            "1gq93cp68x0nivajz9bh7mvykfzcfhim5l907lg1kp2jb3rnrssg"))))))
+    (build-system cuda-build-system)
+    (arguments
+     (list #:install-plan ''(("lib" "lib")
+                             ("pkg-config" "share/pkg-config")
+                             ("include" "include"))))
+    (inputs (list `(,gcc "lib") glibc))
+    (outputs (list "out" "static"))
+    (synopsis "Link GPU devide code at runtime")
+    (description
+     "This package provides a set of APIs which can be used at runtime to link
+together GPU devide code.  It supports Link Time Optimization.")
+    (home-page "https://docs.nvidia.com/cuda/nvjitlink/index.html")
+    (license (cuda-license name))))
+
+(define-public libnvjpeg
+  (package
+    (name "libnvjpeg")
+    (version "12.2.0.2")
+    (source
+     (origin
+       (method url-fetch)
+       (uri (cuda-module-url name version))
+       (sha256
+        (base32
+         (match (or (%current-target-system) (%current-system))
+           ("x86_64-linux"
+            "0xbzbhf7s7gsilr7gx4r7g2j1sxj977wr5zf7jjqg31ch9x2d4yj")
+           ("powerpc64le-linux"
+            "1z90kf95045s6q44rm2da3g31icb3hyh3jmv9a5s5bvx6flfs4lk"))))))
+    (build-system cuda-build-system)
+    (arguments
+     (list #:install-plan ''(("include" "include")
+                             ("lib" "lib")
+                             ("pkg-config" "share/pkg-config"))))
+    (inputs (list `(,gcc "lib") glibc))
+    (outputs (list "out" "static"))
+    (synopsis "GPU-accelerated JPEG codec library")
+    (description
+     "This package provides a high-performance, GPU accelerated JPEG decoding
+functionality for image formats commonly used in deep learning and hyperscale
+multimedia applications.  The library offers single and batched JPEG decoding
+capabilities which efficiently utilize the available GPU resources for optimum
+performance; and the flexibility for users to manage the memory allocation
+needed for decoding.
+
+The nvJPEG library enables the following functions: use the JPEG image data
+stream as input; retrieve the width and height of the image from the data
+stream, and use this retrieved information to manage the GPU memory allocation
+and the decoding.  A dedicated API is provided for retrieving the image
+information from the raw JPEG image data stream.
+
+The encoding functions of the nvJPEG library perform GPU-accelerated
+compression of user’s image data to the JPEG bitstream.  User can provide input
+data in a number of formats and colorspaces, and control the encoding process
+with parameters.  Encoding functionality will allocate temporary buffers using
+user-provided memory allocator.")
+    (home-page "https://docs.nvidia.com/cuda/nvjpeg/index.html")
+    (license (cuda-license name))))
+
+(define-public libnpp
+  (package
+    (name "libnpp")
+    (version "12.1.0.40")
+    (source
+     (origin
+       (method url-fetch)
+       (uri (cuda-module-url name version))
+       (sha256
+        (base32
+         (match (or (%current-target-system) (%current-system))
+           ("x86_64-linux"
+            "1lcb8hdqv2h3i33iinfj6nljh6bhlvy4c3pgis5wy7lnqwr2xi2j")
+           ("aarch64-linux"
+            "048blkq0qibj54a70pwn49w4y525if35djkfqx7l7p7ibm47qx3h")
+           ("powerpc64le-linux"
+            "140w44a5q5pcfzkn0dl5ibkhshd3pb7jczgddpklqv2a5pkngd2y"))))))
+    (build-system cuda-build-system)
+    (arguments
+     (list #:install-plan ''(("include" "include")
+                             ("lib" "lib")
+                             ("pkg-config" "share/pkg-config"))))
+    (inputs (list `(,gcc "lib") glibc))
+    (outputs (list "out" "static"))
+    (synopsis
+     "NVIDIA 2D Image and Signal Processing Performance Primitives")
+    (description
+     "This package provides a library of functions for performing CUDA
+accelerated 2D image and signal processing.
+
+The primary library focuses on image processing and is widely applicable for
+developers in these areas.  NPP will evolve over time to encompass more of the
+compute heavy tasks in a variety of problem domains.  The NPP library is
+written to maximize flexibility, while maintaining high performance.")
+    (home-page "https://docs.nvidia.com/cuda/npp/index.html")
+    (license (cuda-license name))))
+
+(define-public cuda-toolkit
+  (package
+    (name "cuda-toolkit")
+    (version "12.1.1")
+    (source #f)
+    (build-system trivial-build-system)
+    (arguments
+     '(#:modules ((guix build union))
+       #:builder
+       (begin
+         (use-modules (ice-9 match)
+                      (guix build union))
+         (match %build-inputs
+           (((names . directories) ...)
+            (union-build (assoc-ref %outputs "out")
+                         directories))))))
+    (inputs
+     (list cuda-cccl
+           ;; FIXME: cuda-compat is only used for aarch64 for this version
+           cuda-cudart
+           cuda-nvcc
+           cuda-nvml-dev
+           cuda-nvtx
+           cuda-nvrtc
+           libcublas
+           ;; libcudla seems very specialized for now
+           libcufft
+           libcurand
+           libcusolver
+           libcusparse
+           libnpp
+           ;; libnvfatbin is introduced in a later version
+           ;; libnvidia-nscq seems very specialized for now
+           libnvjitlink
+           libnvjpeg
+           libnvvm))
+           ;; TODO Add nsight suite, probably in a new metapackage.
+    (synopsis "Metapackage for CUDA")
+    (description
+     "This package provides the CUDA compiler and the CUDA run-time support
+libraries for NVIDIA GPUs, all of which are proprietary.")
+    (home-page "https://developer.nvidia.com/cuda-toolkit")
+    (license (package-license cuda-cudart))))
+
+(define-public cuda-dev
+  (package
+    (name "cuda-dev")
+    (version "12.1.1")
+    (source #f)
+    (build-system trivial-build-system)
+    (arguments
+     '(#:modules ((guix build union))
+       #:builder
+       (begin
+         (use-modules (ice-9 match)
+                      (guix build union))
+         (match %build-inputs
+           (((names . directories) ...)
+            (union-build (assoc-ref %outputs "out")
+                         directories))))))
+    (inputs
+     (list cuda-toolkit
+           cuda-cuobjdump
+           cuda-cupti
+           cuda-cuxxfilt
+           cuda-gdb
+           cuda-nvdisasm
+           cuda-nvprof
+           cuda-nvprune
+           ;; cuda-nvvp will be deprecated soon
+           cuda-profiler-api
+           ;; fabricmanager seems very specialized
+           ;; imex is poorly documented
+           cuda-sanitizer-api))
+    (synopsis "Metapackage for CUDA development")
+    (description
+     "This package provides the CUDA compiler and the CUDA run-time support
+libraries for NVIDIA GPUs, all of which are proprietary.")
+    (home-page "https://developer.nvidia.com/cuda-toolkit")
+    (license (package-license cuda-cudart))))
+
+(define-public cuda-python
+  (package
+    (name "cuda-python")
+    (version "12.1.0")
+    (source
+     (origin
+       (method git-fetch)
+       (uri (git-reference
+             (url "https://github.com/NVIDIA/cuda-python")
+             (commit (string-append "v" version))))
+       (file-name (git-file-name name version))
+       (sha256
+        (base32 "0i0wvx5kxckphsf1n02rr86hrnc2r6p8wlrvq1n1w9c3l6m24d13"))))
+    (build-system pyproject-build-system)
+    (arguments
+     (list
+      #:tests? #f  ; FIXME: most tests fail.
+      #:phases
+      #~(modify-phases %standard-phases
+          (add-after 'unpack 'fix-setup.py
+            (lambda _
+              (substitute* "setup.py"
+                (("import versioneer" all)
+                 (format #f "~a~%import pyparsing" all)))))
+          (add-before 'build 'set_cuda_paths
+            (lambda _
+              (setenv "CUDA_HOME"
+                      #$(this-package-input "cuda-dev"))
+              (setenv "PARALLEL_LEVEL"
+                      (number->string (parallel-job-count))))))))
+    (native-inputs (list python-cython
+                         python-numpy
+                         python-pytest
+                         python-pytest-benchmark
+                         python-setuptools
+                         python-wheel))
+    (inputs (list cuda-dev))
+    (propagated-inputs (list python-pyclibrary))
+    (home-page "https://github.com/NVIDIA/cuda-python")
+    (synopsis "CUDA Python low-level bindings")
+    (description "This package provides Python low-level bindings for NVIDIA
+CUDA toolkit.")
+    (license
+     (license:nonfree
+      "https://github.com/NVIDIA/cuda-python/blob/main/LICENSE"))))
+
+(define (nvidia-cudnn-samples system version)
+  (origin
+    (method url-fetch)
+    (uri
+     (format #f
+             "https://developer.download.nvidia.com/compute/cudnn/redist\
+/cudnn_samples/~a/cudnn_samples-~a-~a_cuda12-archive.tar.xz"
+             system
+             system
+             version))
+    (sha256
+     (base32 "01drxcyj8r4zsrc7i9cwczd185dcacxgwllipf9w612byzrs9afk"))))
+
+(define-public nvidia-cudnn
+  (package
+    (name "nvidia-cudnn")
+    (version "8.9.7.29")
+    (source
+     (origin
+       (method url-fetch)
+       (uri (let ((system (cuda-current-system)))
+              (format #f
+                      "https://developer.download.nvidia.com/compute/cudnn/redist\
+/cudnn/~a/cudnn-~a-~a_cuda12-archive.tar.xz"
+                      system
+                      system
+                      version)))
+       (sha256
+        (base32 "1fz345pgngn1v4f0i80s7g4k0vhhd98ggcm07jpsfhkybii36ls7"))))
+    (build-system cuda-build-system)
+    (arguments
+     (list
+      #:install-plan ''(("include" "include")
+                        ("lib" "lib"))
+      #:patchelf-inputs ''("gcc" "glibc" "out" "zlib")
+      #:modules '((nonguix build cuda-build-system)
+                  ((guix build gnu-build-system) #:prefix gnu:)
+                  (guix build union)
+                  (guix build utils)
+                  (ice-9 ftw))
+      #:imported-modules `(,@%cuda-build-system-modules
+                           (guix build gnu-build-system)
+                           (guix build union))
+      #:phases
+      #~(modify-phases %standard-phases
+          (add-after 'install 'prepare-tests
+            (lambda* (#:key outputs #:allow-other-keys)
+              (mkdir "tests")
+              (with-directory-excursion "tests"
+                ((assoc-ref gnu:%standard-phases 'unpack)
+                 #:source #$(nvidia-cudnn-samples
+                             (cuda-current-system)
+                             (package-version this-package))))
+              (chdir "tests")
+              (chdir (caddr (scandir ".")))
+              (union-build
+               "cuda+cudnn"
+               (list (assoc-ref outputs "out")
+                     '#$(this-package-native-input "cuda-toolkit")))
+              (setenv "CUDA_PATH" (canonicalize-path "cuda+cudnn"))
+              (chdir "src/cudnn_samples_v8")))
+          (add-after 'prepare-tests 'check
+            (lambda _
+              (for-each
+               (lambda (dir)
+                 (format #t "Building ~a...~%" dir)
+                 (with-directory-excursion dir
+                   (assoc-ref gnu:%standard-phases 'build)))
+               (cdr (find-files "." (lambda (file stat)
+                                      (eq? 'directory (stat:type stat)))
+                                #:directories? #t))))))))
+    (native-inputs (list cuda-toolkit))
+    (inputs (list `(,gcc "lib") glibc zlib))
+    (outputs (list "out" "static"))
+    (synopsis "NVIDIA CUDA Deep Neural Network library (cuDNN)")
+    (description
+     "This package provides a GPU-accelerated library of primitives for deep
+neural networks, with highly tuned implementations for standard routines such
+as forward and backward convolution, attention, matmul, pooling, and
+normalization.")
+    (home-page "https://developer.nvidia.com/cudnn")
+    (license
+     (license:nonfree "https://developer.download.nvidia.com/\
+compute/cudnn/redist/cudnn/LICENSE.txt"))))
+
+(define-public nvidia-cudnn-frontend
+  (package
+    (name "nvidia-cudnn-frontend")
+    (version "1.5.2")
+    (source
+     (origin
+       (method git-fetch)
+       (uri (git-reference
+             (url "https://github.com/NVIDIA/cudnn-frontend")
+             (commit (string-append "v" version))))
+       (file-name (git-file-name name version))
+       (sha256
+        (base32 "04aglaxh4mgm94qwia293gqn7gmlw5w6mk8nky4k6l1m2615swyd"))
+       (modules '((guix build utils)))
+       (snippet
+        #~(begin
+            (delete-file-recursively "include/cudnn_frontend/thirdparty")
+            (substitute* (find-files "include" "\\.(cpp|h|hpp)")
+              (("\"cudnn_frontend/thirdparty/nlohmann/json\\.hpp\"")
+               "<nlohmann/json.hpp>"))))
+       (patches
+        (parameterize
+            ((%patch-path
+              (map
+               (lambda (directory)
+                 (string-append directory "/nongnu/packages/patches"))
+               %load-path)))
+          (search-patches "nvidia-cudnn-frontend_find_built_dlpack.patch"
+                          "nvidia-cudnn-frontend_find_nlohmann_json.patch"
+                          "nvidia-cudnn-frontend_use_store_so.patch")))))
+    (build-system pyproject-build-system)
+    (arguments
+     (list
+      #:modules '((guix build pyproject-build-system)
+                  (guix build union)
+                  (guix build utils))
+      #:imported-modules `(,@%pyproject-build-system-modules
+                           (guix build union))
+      #:phases
+      #~(modify-phases %standard-phases
+          (add-before 'build 'set_cuda_paths
+            (lambda _
+              (substitute* "python/cudnn/__init__.py"
+                (("@store-cudnn\\.so-path@")
+                 (format #f "\"~a/lib/libcudnn.so\""
+                         #$(this-package-input "nvidia-cudnn"))))
+              (setenv "CUDA_PATH"
+                      #$(this-package-input "cuda-toolkit"))
+              (setenv "CUDNN_PATH"
+                      #$(this-package-input "nvidia-cudnn"))
+              (setenv "CUDNN_FRONTEND_FETCH_PYBINDS_IN_CMAKE" "0")
+              (setenv "CMAKE_BUILD_PARALLEL_LEVEL"
+                      (number->string (parallel-job-count)))))
+          (add-after 'install 'post-install
+            (lambda _
+              (union-build
+               (string-append #$output "/include")
+               (find-files
+                (string-append #$output "/lib")
+                (lambda (file stat)
+                  (string-suffix? "include" file))
+                #:directories? #t)))))))
+    (native-inputs (list cmake dlpack pybind11 python-setuptools python-wheel))
+    (inputs (list cuda-toolkit nlohmann-json nvidia-cudnn))
+    (home-page "https://github.com/NVIDIA/cudnn-frontend")
+    (synopsis "cuDNN API header-only library")
+    (description "This package provides a C++ header-only library that wraps
+the NVIDIA CUDA Deep Neural Network library (cuDNN) C backend API.  This entry
+point to the same API is less verbose (without loss of control), and adds
+functionality on top of the backend API, such as errata filters and
+autotuning.")
+    (license license-gnu:expat)))
+
+(define-public nvidia-cutlass
+  (package
+    (name "nvidia-cutlass")
+    (version "3.2.2")
+    (source
+     (origin
+       (method git-fetch)
+       (uri (git-reference
+             (url "https://github.com/NVIDIA/cutlass")
+             (commit (string-append "v" version))))
+       (file-name (git-file-name name version))
+       (sha256
+        (base32 "0qyxkp3pmndlzm3aw9xwrx57znj9p4xlvqahavgzq8c1nd7bj3wp"))
+       (patches
+        (parameterize
+            ((%patch-path
+              (map
+               (lambda (directory)
+                 (string-append directory "/nongnu/packages/patches"))
+               %load-path)))
+          (search-patches "nvidia-cutlass-3.2.2_disable_static_lib.patch")))))
+    (build-system cmake-build-system)
+    (arguments
+     (list
+      ;; XXX: Cutlass is incredibly heavy to build when not specifying target
+      ;; GPU architecture (4G), avoid tests, examples and static library.
+      ;; Setting the contrary often runs out of RAM even on beefy laptops.
+      #:configure-flags ''("-DCUTLASS_ENABLE_TESTS=OFF"
+                           "-DCUTLASS_INSTALL_TESTS=OFF"
+                           "-DCUTLASS_BUILD_STATIC_LIBRARY=OFF"
+                           "-DCUTLASS_ENABLE_EXAMPLES=OFF"
+                           "-DCUTLASS_UNITY_BUILD_ENABLED=ON")
+      #:phases
+      #~(modify-phases %standard-phases
+          ;; XXX: This phase is not necessary on earlier versions.
+          ;; Remove it when updating.
+          (add-after 'unpack 'fix-cuda-build
+            (lambda _
+              (substitute* "CMakeLists.txt"
+                (("--user")
+                 (string-append "--prefix=" #$output)))
+              (setenv "PYTHONPATH"
+                      (string-append (getcwd) "/python"))))
+          (add-before 'build 'set_cuda_paths
+            (lambda _
+              (setenv "CUDACXX"
+                      #$(file-append (this-package-input "cuda-toolkit")
+                                     "/bin/nvcc"))))
+          (add-after 'install 'cleanup
+            (lambda _
+              (delete-file-recursively
+               (string-append #$output "/test")))))))
+    (native-inputs (list python python-setuptools))
+    (inputs (list cuda-toolkit))
+    (propagated-inputs (list cuda-python
+                             python-networkx
+                             python-numpy
+                             python-pydot
+                             python-scipy
+                             python-treelib))
+    (home-page "https://developer.nvidia.com/blog/cutlass-linear-algebra-cuda")
+    (synopsis "CUDA Templates for Linear Algebra Subroutines")
+    (description
+     "This package provides a collection of CUDA C++ template abstractions for
+implementing high-performance matrix-matrix multiplication (GEMM) and related
+computations at all levels and scales within CUDA.  It incorporates strategies
+for hierarchical decomposition and data movement similar to those used to
+implement cuBLAS and cuDNN.  CUTLASS decomposes these moving parts into
+reusable, modular software components abstracted by C++ template
+classes.  Primitives for different levels of a conceptual parallelization
+hierarchy can be specialized and tuned via custom tiling sizes, data types,
+and other algorithmic policy.  The resulting flexibility simplifies their use
+as building blocks within custom kernels and applications.")
+    (license
+     (license:nonfree
+      "https://github.com/NVIDIA/cutlass/blob/main/LICENSE.txt"))))
+
+
 
 ;;;
 ;;; Other packages
@@ -1003,6 +2322,80 @@ laptops.")
 nvidia-smi.")
     (license license-gnu:bsd-3)))
 
+(define nvidia-nccl-tests
+  (let* ((name "nvidia-nccl-tests")
+         (revision "0")
+         ;; Commit at the date of the version of nvidia-nccl
+         (commit "e98ef24bc03bef33054c3bc690ce622576c803b6")
+         (version (git-version "2.18.1" revision commit)))
+    (origin
+      (method git-fetch)
+      (uri (git-reference
+            (url "https://github.com/nvidia/nccl-tests")
+            (commit commit)))
+      (file-name (git-file-name name version))
+      (sha256
+       (base32 "07z26jivpc7iwx8dirs520g6db3b3r0rckqq1g47242f312f5h1s")))))
+
+(define-public nvidia-nccl
+  (package
+    (name "nvidia-nccl")
+    (version "2.18.1")
+    (source (origin
+              (method git-fetch)
+              (uri (git-reference
+                    (url "https://github.com/NVIDIA/nccl")
+                    (commit (string-append "v" version "-1"))))
+              (file-name (git-file-name name version))
+              (sha256
+               (base32 "10w5gkfac5jdi2dlavvlb7v6fq1cz08bs943kjvqy0sa2kjcwbk6"))))
+    (build-system gnu-build-system)
+    (arguments
+     (list #:modules '((guix build gnu-build-system)
+                       (guix build utils)
+                       (nonguix build utils))
+           #:imported-modules `(,@%default-gnu-imported-modules
+                                (guix build utils)
+                                (nonguix build utils))
+           #:test-target "all"
+           #:phases
+           #~(modify-phases %standard-phases
+               (replace 'configure
+                 (lambda _
+                   (setenv "CUDA_HOME"
+                           #$(this-package-input "cuda-toolkit"))
+                   (setenv "PREFIX" #$output)
+                   (substitute* "src/Makefile"
+                     (("\\$\\(PREFIX\\)/lib/pkgconfig")
+                      "$(PREFIX)/share/pkg-config"))))
+               (add-after 'install 'install-static install-static-output)
+               (add-after 'build 'prepare-tests
+                 (lambda* (#:key outputs #:allow-other-keys)
+                   (mkdir "tests")
+                   (with-directory-excursion "tests"
+                     ((assoc-ref %standard-phases 'unpack)
+                      #:source #$nvidia-nccl-tests))
+                   (setenv "NCCL_HOME" (canonicalize-path "build"))
+                   (chdir "tests/source")))
+               (add-after 'check 'step-out-of-tests
+                 (lambda _
+                   (chdir "../.."))))))
+    (native-inputs (list which))
+    (inputs (list cuda-toolkit))
+    (outputs (list "out" "static"))
+    (home-page "https://developer.nvidia.com/nccl")
+    (synopsis "NVIDIA Collective Communications Library (NCCL)")
+    (description "The NVIDIA Collective Communication Library (NCCL)
+implements multi-GPU and multi-node communication primitives optimized for
+NVIDIA GPUs and Networking.  NCCL provides routines such as all-gather,
+all-reduce, broadcast, reduce, reduce-scatter as well as point-to-point send
+and receive that are optimized to achieve high bandwidth and low latency over
+PCIe and NVLink high-speed interconnects within a node and over NVIDIA
+Mellanox Network across nodes.")
+    (license
+     (license:nonfree
+      "https://github.com/NVIDIA/nccl/blob/master/LICENSE.txt"))))
+
 (define-public nvidia-nvml
   (package
     (name "nvidia-nvml")
diff --git a/nongnu/packages/patches/nvidia-cudnn-frontend_find_built_dlpack.patch b/nongnu/packages/patches/nvidia-cudnn-frontend_find_built_dlpack.patch
new file mode 100644
index 0000000..2aaf0ec
--- /dev/null
+++ b/nongnu/packages/patches/nvidia-cudnn-frontend_find_built_dlpack.patch
@@ -0,0 +1,43 @@
+From 1b73d8d74b3ec7949e21d926d28385543c202dc7 Mon Sep 17 00:00:00 2001
+From: Nicolas Graves <ngraves@ngraves.fr>
+Date: Thu, 25 Jul 2024 14:33:24 +0200
+Subject: [PATCH] Find dlpack package instead of building it.
+
+---
+ python/CMakeLists.txt | 13 +++----------
+ 1 file changed, 3 insertions(+), 10 deletions(-)
+
+diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
+index cdfbf55..4168411 100644
+--- a/python/CMakeLists.txt
++++ b/python/CMakeLists.txt
+@@ -2,15 +2,8 @@ cmake_minimum_required(VERSION 3.18)
+ 
+ Include(FetchContent)
+ 
+-# Fetch and build dlpack
+-set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
+-set(BUILD_MOCK OFF)
+-FetchContent_Declare(
+-  dlpack
+-  GIT_REPOSITORY https://github.com/dmlc/dlpack
+-  GIT_TAG        v0.8
+-)
+-FetchContent_MakeAvailable(dlpack)
++# Find dlpack
++find_package(dlpack CONFIG REQUIRED)
+ 
+ # Find python
+ find_package(Python 3.8 COMPONENTS Interpreter Development.Module REQUIRED)
+@@ -60,7 +53,7 @@ target_compile_definitions(_compiled_module PRIVATE NV_CUDNN_FRONTEND_USE_DYNAMI
+ target_link_libraries(
+     _compiled_module
+     
+-    PRIVATE dlpack
++    PRIVATE dlpack::dlpack
+ )
+ 
+ set_target_properties(
+-- 
+2.45.2
+
diff --git a/nongnu/packages/patches/nvidia-cudnn-frontend_find_nlohmann_json.patch b/nongnu/packages/patches/nvidia-cudnn-frontend_find_nlohmann_json.patch
new file mode 100644
index 0000000..cd8e664
--- /dev/null
+++ b/nongnu/packages/patches/nvidia-cudnn-frontend_find_nlohmann_json.patch
@@ -0,0 +1,36 @@
+From 3f7a23cc5a84af36442c4035db78e616d884b540 Mon Sep 17 00:00:00 2001
+From: Nicolas Graves <ngraves@ngraves.fr>
+Date: Thu, 25 Jul 2024 16:43:12 +0200
+Subject: [PATCH] Find unbundled nlohmann-json package.
+
+---
+ CMakeLists.txt | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index adf22fc..8211fcd 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -16,6 +16,11 @@ endif()
+ 
+ add_library(cudnn_frontend INTERFACE)
+ 
++# Find the nlohmann_json package
++if(NOT CUDNN_FRONTEND_SKIP_NLOHMANN_JSON)
++    find_package(nlohmann_json CONFIG REQUIRED)
++endif()
++
+ target_compile_definitions(
+     cudnn_frontend INTERFACE
+     $<$<BOOL:${CUDNN_FRONTEND_SKIP_JSON_LIB}>:CUDNN_FRONTEND_SKIP_JSON_LIB>
+@@ -25,6 +30,7 @@ target_include_directories(
+     cudnn_frontend INTERFACE
+     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+     $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
++    $<$<NOT:$<BOOL:${CUDNN_FRONTEND_SKIP_NLOHMANN_JSON}>>:$<TARGET_PROPERTY:nlohmann_json::nlohmann_json,INTERFACE_INCLUDE_DIRECTORIES>>
+ )
+ 
+ # Find the cuda compiler
+-- 
+2.45.2
+
diff --git a/nongnu/packages/patches/nvidia-cudnn-frontend_use_store_so.patch b/nongnu/packages/patches/nvidia-cudnn-frontend_use_store_so.patch
new file mode 100644
index 0000000..acceced
--- /dev/null
+++ b/nongnu/packages/patches/nvidia-cudnn-frontend_use_store_so.patch
@@ -0,0 +1,39 @@
+From 0c16ed53cae242b02069a1f6fed463dc819526e0 Mon Sep 17 00:00:00 2001
+From: Nicolas Graves <ngraves@ngraves.fr>
+Date: Thu, 25 Jul 2024 14:58:42 +0200
+Subject: [PATCH] Use absolute store cudnn.so path.
+
+---
+ python/cudnn/__init__.py | 16 +---------------
+ 1 file changed, 1 insertion(+), 15 deletions(-)
+
+diff --git a/python/cudnn/__init__.py b/python/cudnn/__init__.py
+index 35eb883..39dc047 100644
+--- a/python/cudnn/__init__.py
++++ b/python/cudnn/__init__.py
+@@ -137,21 +137,7 @@ pygraph.execute_plan_at_index = _execute_plan_at_index
+ 
+ 
+ def _dlopen_cudnn():
+-    # First look at python site packages
+-    lib_path = glob.glob(
+-        os.path.join(
+-            sysconfig.get_path("purelib"), "nvidia/cudnn/lib/libcudnn.so.*[0-9]"
+-        )
+-    )
+-
+-    if lib_path:
+-        assert (
+-            len(lib_path) == 1
+-        ), f"Found {len(lib_path)} libcudnn.so.x in nvidia-cudnn-cuXX."
+-        lib = ctypes.CDLL(lib_path[0])
+-    else:  # Fallback
+-        lib = ctypes.CDLL("libcudnn.so")
+-
++    lib = ctypes.CDLL(@store-cudnn.so-path@)
+     handle = ctypes.cast(lib._handle, ctypes.c_void_p).value
+     _compiled_module._set_dlhandle_cudnn(handle)
+ 
+-- 
+2.45.2
+
diff --git a/nongnu/packages/patches/nvidia-cutlass-3.2.2_disable_static_lib.patch b/nongnu/packages/patches/nvidia-cutlass-3.2.2_disable_static_lib.patch
new file mode 100644
index 0000000..8c693d6
--- /dev/null
+++ b/nongnu/packages/patches/nvidia-cutlass-3.2.2_disable_static_lib.patch
@@ -0,0 +1,73 @@
+From 7ee9ec4c2636cca833761d3466df27edc4e3f952 Mon Sep 17 00:00:00 2001
+From: Nicolas Graves <ngraves@ngraves.fr>
+Date: Tue, 30 Jul 2024 14:13:09 +0200
+Subject: [PATCH] Add CUTLASS_BUILD_STATIC_LIBRARY option
+
+---
+ tools/library/CMakeLists.txt | 22 ++++++++++++++++++----
+ 1 file changed, 18 insertions(+), 4 deletions(-)
+
+diff --git a/tools/library/CMakeLists.txt b/tools/library/CMakeLists.txt
+index a11ebcf6..79f7ccd1 100644
+--- a/tools/library/CMakeLists.txt
++++ b/tools/library/CMakeLists.txt
+@@ -34,6 +34,7 @@ include(GNUInstallDirs)
+ 
+ set(CUTLASS_BUILD_MONO_LIBRARY OFF CACHE BOOL 
+   "Determines whether the cutlass library is generated as a single file or multiple files.")
++option(CUTLASS_BUILD_STATIC_LIBRARY "Build static libary for CUTLASS" ON)
+ 
+ ################################################################################
+ 
+@@ -126,7 +127,9 @@ function(cutlass_add_cutlass_library)
+     # simply link the generated object files to the default library. 
+ 
+     target_link_libraries(${DEFAULT_NAME} PRIVATE $<BUILD_INTERFACE:${__NAME}_objs>)
+-    target_link_libraries(${DEFAULT_NAME}_static PRIVATE $<BUILD_INTERFACE:${__NAME}_objs>)
++    if (CUTLASS_BUILD_STATIC_LIBRARY)
++        target_link_libraries(${DEFAULT_NAME}_static PRIVATE $<BUILD_INTERFACE:${__NAME}_objs>)
++    endif()
+ 
+   else()
+ 
+@@ -152,7 +155,7 @@ function(cutlass_add_cutlass_library)
+       )
+     
+     set_target_properties(${__NAME} PROPERTIES DEBUG_POSTFIX "${CUTLASS_LIBRARY_DEBUG_POSTFIX}")
+-    
++    if (CUTLASS_BUILD_STATIC_LIBRARY)
+     cutlass_add_library(
+       ${__NAME}_static
+       STATIC
+@@ -189,6 +192,15 @@ function(cutlass_add_cutlass_library)
+       LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+       ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+       )
++    else()
++    install(
++      TARGETS ${__NAME}
++      EXPORT NvidiaCutlass
++      RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
++      LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
++      ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
++      )
++    endif()
+     
+     if (__SUFFIX)
+     
+@@ -197,8 +209,10 @@ function(cutlass_add_cutlass_library)
+       # commands to pull in all kernels by default.
+     
+       target_link_libraries(${DEFAULT_NAME} INTERFACE ${__NAME})
+-      target_link_libraries(${DEFAULT_NAME}_static INTERFACE ${__NAME}_static)
+-    
++      if (CUTLASS_BUILD_STATIC_LIBRARY)
++         target_link_libraries(${DEFAULT_NAME}_static INTERFACE ${__NAME}_static)
++      endif()
++
+     endif()
+ 
+   endif()
+-- 
+2.45.2
+
diff --git a/nongnu/packages/patches/nvidia-cutlass-3.4.0_disable_static_lib.patch b/nongnu/packages/patches/nvidia-cutlass-3.4.0_disable_static_lib.patch
new file mode 100644
index 0000000..8fab0c1
--- /dev/null
+++ b/nongnu/packages/patches/nvidia-cutlass-3.4.0_disable_static_lib.patch
@@ -0,0 +1,82 @@
+From ce4a14ae4041d6cfb69987fef5a65c50754c89b6 Mon Sep 17 00:00:00 2001
+From: Nicolas Graves <ngraves@ngraves.fr>
+Date: Sun, 28 Jul 2024 16:57:16 +0200
+Subject: [PATCH] Add option CUTLASS_BUILD_STATIC_LIBRARY
+
+---
+ tools/library/CMakeLists.txt | 26 +++++++++++++++++++++-----
+ 1 file changed, 20 insertions(+), 4 deletions(-)
+
+diff --git a/tools/library/CMakeLists.txt b/tools/library/CMakeLists.txt
+index 60a6cca5..f096c84d 100644
+--- a/tools/library/CMakeLists.txt
++++ b/tools/library/CMakeLists.txt
+@@ -34,6 +34,7 @@ include(GNUInstallDirs)
+ 
+ set(CUTLASS_BUILD_MONO_LIBRARY OFF CACHE BOOL 
+   "Determines whether the cutlass library is generated as a single file or multiple files.")
++option(CUTLASS_BUILD_STATIC_LIBRARY "Build static libary for CUTLASS" ON)
+ 
+ ################################################################################
+ 
+@@ -126,7 +127,9 @@ function(cutlass_add_cutlass_library)
+     # simply link the generated object files to the default library. 
+ 
+     target_link_libraries(${DEFAULT_NAME} PRIVATE $<BUILD_INTERFACE:${__NAME}_objs>)
+-    target_link_libraries(${DEFAULT_NAME}_static PRIVATE $<BUILD_INTERFACE:${__NAME}_objs>)
++    if (CUTLASS_BUILD_STATIC_LIBRARY)
++        target_link_libraries(${DEFAULT_NAME}_static PRIVATE $<BUILD_INTERFACE:${__NAME}_objs>)
++    endif()
+ 
+   else()
+ 
+@@ -154,7 +157,7 @@ function(cutlass_add_cutlass_library)
+       )
+     
+     set_target_properties(${__NAME} PROPERTIES DEBUG_POSTFIX "${CUTLASS_LIBRARY_DEBUG_POSTFIX}")
+-    
++    if (CUTLASS_BUILD_STATIC_LIBRARY)
+     cutlass_add_library(
+       ${__NAME}_static
+       STATIC
+@@ -193,6 +196,15 @@ function(cutlass_add_cutlass_library)
+       LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+       ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+       )
++    else()
++    install(
++      TARGETS ${__NAME}
++      EXPORT NvidiaCutlass
++      RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
++      LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
++      ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
++      )
++    endif()
+     
+     if (__SUFFIX)
+     
+@@ -201,7 +213,9 @@ function(cutlass_add_cutlass_library)
+       # commands to pull in all kernels by default.
+     
+       target_link_libraries(${DEFAULT_NAME} PUBLIC ${__NAME})
+-      target_link_libraries(${DEFAULT_NAME}_static PUBLIC ${__NAME}_static)
++      if (CUTLASS_BUILD_STATIC_LIBRARY)
++         target_link_libraries(${DEFAULT_NAME}_static PUBLIC ${__NAME}_static)
++      endif()
+     
+     endif()
+ 
+@@ -250,7 +264,9 @@ cutlass_add_cutlass_library(
+ 
+ # For backward compatibility with the old name
+ add_library(cutlass_lib ALIAS cutlass_library)
+-add_library(cutlass_lib_static ALIAS cutlass_library_static)
++if (CUTLASS_BUILD_STATIC_LIBRARY)
++   add_library(cutlass_lib_static ALIAS cutlass_library_static)
++endif()
+ 
+ ################################################################################
+ 
+-- 
+2.45.2
+
diff --git a/nonguix/build-system/cuda.scm b/nonguix/build-system/cuda.scm
new file mode 100644
index 0000000..de6dac5
--- /dev/null
+++ b/nonguix/build-system/cuda.scm
@@ -0,0 +1,181 @@
+;;; SPDX-License-Identifier: GPL-3.0-or-later
+;;; Copyright © 2024 Nicolas Graves <ngraves@ngraves.fr>
+
+(define-module (nonguix build-system cuda)
+  #:use-module (gnu packages gcc)
+  #:use-module (guix store)
+  #:use-module (guix utils)
+  #:use-module (guix gexp)
+  #:use-module (guix monads)
+  #:use-module (guix derivations)
+  #:use-module (guix search-paths)
+  #:use-module (guix build-system)
+  #:use-module (guix build-system gnu)
+  #:use-module (guix packages)
+  #:use-module (ice-9 match)
+  #:use-module (srfi srfi-1)
+  #:use-module (nonguix build-system binary)
+  #:use-module (nonguix utils)
+  #:use-module ((nonguix licenses) #:prefix license:)
+  #:export (cuda-license
+            cuda-current-system
+            cuda-module-url
+            guix-system->cuda-system
+
+            %cuda-build-system-modules
+            lower
+            cuda-build
+            cuda-build-system))
+
+;; Commentary:
+;;
+;; Standard build procedure for Cuda binary packages.  This is
+;; implemented as an extension of `binary-build-system'.
+;;
+;; Code:
+
+(define %cuda-build-system-modules
+  ;; Build-side modules imported by default.
+  `((nonguix build cuda-build-system)
+    (nonguix build utils)
+    ,@%binary-build-system-modules))
+
+(define (build-patchelf-plan wrapper-plan inputs)
+  #~(let ((patchelf-inputs
+           (list #$@(map car inputs))))
+      (map (lambda (file)
+             (cons file (cons* "out" patchelf-inputs)))
+           #$wrapper-plan)))
+
+(define (cuda-license name)
+  (license:nonfree
+   (format #f "\
+https://developer.download.nvidia.com/compute/cuda/redist/~a/LICENSE.txt"
+           (string-join (string-split name #\-) "_"))))
+
+(define (guix-system->cuda-system system)
+  (match system
+    ("x86_64-linux" "linux-x86_64")
+    ("aarch64-linux" "linux-aarch64")
+    ("powerpc64le-linux" "linux-ppc64le")
+    (_ #f)))
+
+(define (cuda-current-system)
+  (guix-system->cuda-system
+   (or (%current-target-system) (%current-system))))
+
+(define (cuda-module-url name version)
+  (let ((system (cuda-current-system))
+        (snake-name (string-join (string-split name #\-) "_")))
+    (format #f
+            "https://developer.download.nvidia.com/compute/cuda/redist\
+/~a/~a/~a-~a-~a-archive.tar.xz"
+            snake-name
+            system
+            snake-name
+            system
+            version)))
+
+(define* (lower name
+                #:key source inputs native-inputs outputs system target
+                (patchelf (default-patchelf))
+                (glibc (default-glibc))
+                #:allow-other-keys
+                #:rest arguments)
+  "Return a bag for NAME."
+  (define private-keywords
+    '(#:target #:patchelf #:inputs #:native-inputs))
+  (define host-inputs
+    `(,@(if source
+            `(("source" ,source))
+            '())
+
+      ("gcc:lib" ,gcc "lib")
+      ("glibc" ,glibc)
+
+      ,@inputs
+      ;; Keep the standard inputs of 'gnu-build-system'.
+      ,@(standard-packages)))
+
+  (and (not target)                     ;XXX: no cross-compilation
+       (bag
+         (name name)
+         (system system)
+         (host-inputs host-inputs)
+         (build-inputs `(("patchelf" ,patchelf)
+                         ,@native-inputs
+                         ;; If current system is i686, the *32 packages will be the
+                         ;; same as the non-32, but that's OK.
+                         ("libc32" ,(to32 glibc))))
+         (outputs outputs)
+         (build cuda-build)
+         (arguments (append
+                     (strip-keyword-arguments private-keywords arguments)
+                     (list #:wrap-inputs (alist-delete "source" host-inputs)))))))
+
+(define* (cuda-build name inputs
+                     #:key
+                     guile source wrap-inputs
+                     (outputs '("out"))
+                     (patchelf-inputs ''("gcc" "glibc"))
+                     (patchelf-plan ''())
+                     (install-plan ''(("." "./")))
+                     (search-paths '())
+                     (out-of-source? #t)
+                     (validate-runpath? #t)
+                     (patch-shebangs? #t)
+                     (strip-binaries? #t)
+                     (strip-flags ''("--strip-debug"))
+                     (strip-directories ''("lib" "lib64" "libexec"
+                                           "bin" "sbin"))
+                     (phases '(@ (nonguix build cuda-build-system)
+                                 %standard-phases))
+                     (system (%current-system))
+                     (imported-modules %cuda-build-system-modules)
+                     (modules '((nonguix build cuda-build-system)
+                                (guix build utils)
+                                (nonguix build utils)))
+                     (substitutable? #t)
+                     allowed-references
+                     disallowed-references)
+  "Build SOURCE using binary-build-system."
+  (define builder
+    (with-imported-modules imported-modules
+      #~(begin
+	  (use-modules #$@modules)
+
+	  #$(with-build-variables inputs outputs
+	      #~(cuda-build #:source #+source
+                            #:system #$system
+                            #:outputs %outputs
+                            #:inputs %build-inputs
+                            #:patchelf-inputs #$patchelf-inputs
+                            #:patchelf-plan #$patchelf-plan
+                            #:install-plan #$install-plan
+                            #:search-paths '#$(map search-path-specification->sexp
+                                                   search-paths)
+                            #:phases #$phases
+                            #:out-of-source? #$out-of-source?
+                            #:validate-runpath? #$validate-runpath?
+                            #:patch-shebangs? #$patch-shebangs?
+                            #:strip-binaries? #$strip-binaries?
+                            #:strip-flags #$strip-flags
+                            #:strip-directories #$strip-directories)))))
+
+  (mlet %store-monad ((guile (package->derivation (or guile (default-guile))
+                                                  system #:graft? #f)))
+    (gexp->derivation name builder
+                      #:system system
+                      #:target #f
+                      #:substitutable? substitutable?
+                      #:allowed-references allowed-references
+                      #:disallowed-references disallowed-references
+                      #:guile-for-build guile)))
+
+(define cuda-build-system
+  (build-system
+    (name 'cuda)
+    (description "The Cuda build system")
+    (lower lower)))
+
+;;; cuda.scm ends here
diff --git a/nonguix/build/binary-build-system.scm b/nonguix/build/binary-build-system.scm
index ccfc3eb..24f146f 100644
--- a/nonguix/build/binary-build-system.scm
+++ b/nonguix/build/binary-build-system.scm
@@ -3,6 +3,7 @@
 ;;; Copyright © 2022 Attila Lendvai <attila@lendvai.name>
 ;;; Copyright © 2023 Giacomo Leidi <goodoldpaul@autistici.org>
 ;;; Copyright © 2024 Ashish SHUKLA <ashish.is@lostca.se>
+;;; Copyright © 2024 Nicolas Graves <ngraves@ngraves.fr>
 
 (define-module (nonguix build binary-build-system)
   #:use-module ((guix build gnu-build-system) #:prefix gnu:)
@@ -11,6 +12,7 @@
   #:use-module (ice-9 match)
   #:use-module (srfi srfi-1)
   #:export (%standard-phases
+            autopatchelf
             binary-build))
 
 ;; Commentary:
@@ -140,6 +142,27 @@ The inputs are optional when the file is an executable."
        patchelf-plan)))
   #t)
 
+(define* (autopatchelf #:key inputs outputs patchelf-plan patchelf-inputs
+                       #:allow-other-keys)
+  "Automatically build patchelf-plan if not defined, then run patchelf phase.
+
+The plan is the product of all elf-files with all inputs and \"out\"."
+  (if (equal? patchelf-plan '())
+      (let* ((elf-files (find-files
+                         "." (lambda (name stat)
+                               (and (elf-file? name)
+                                    (not (eq? 'symlink (stat:type stat)))))))
+             (plan (map (lambda (file)
+                          (list file (cons* "out" patchelf-inputs)))
+                        elf-files)))
+        (format #t "Applying patchelf-plan: ~a~%" plan)
+        (patchelf #:inputs inputs
+                  #:outputs outputs
+                  #:patchelf-plan plan))
+      (patchelf #:inputs inputs
+                #:outputs outputs
+                #:patchelf-plan patchelf-plan)))
+
 (define (deb-file? binary-file)
   (string-suffix? ".deb" binary-file))
 
diff --git a/nonguix/build/cuda-build-system.scm b/nonguix/build/cuda-build-system.scm
new file mode 100644
index 0000000..8b874f7
--- /dev/null
+++ b/nonguix/build/cuda-build-system.scm
@@ -0,0 +1,73 @@
+;;; SPDX-License-Identifier: GPL-3.0-or-later
+;;; Copyright © 2024 Nicolas Graves <ngraves@ngraves.fr>
+
+(define-module (nonguix build cuda-build-system)
+  #:use-module ((guix build gnu-build-system) #:prefix gnu:)
+  #:use-module ((nonguix build binary-build-system) #:prefix binary:)
+  #:use-module (guix build utils)
+  #:use-module (nonguix build utils)
+  #:use-module (ice-9 ftw)
+  #:use-module (ice-9 match)
+  #:export (%standard-phases
+            cuda-build))
+
+;; Commentary:
+;;
+;; Builder-side code of the Cuda binary build procedure.
+;;
+;; Code:
+
+;;; XXX: Copied from upstream guix in tests/store-deduplication.scm
+(define (cartesian-product . lst)
+  "Return the Cartesian product of all the given lists."
+  (match lst
+    ((head)
+     (map list head))
+    ((head . rest)
+     (let ((others (apply cartesian-product rest)))
+       (apply append
+              (map
+               (lambda (init)
+                 (map (lambda (lst)
+                        (cons init lst))
+                      others))
+               head))))
+    (()
+     '())))
+
+(define* (install-pkg-config-files #:key outputs #:allow-other-keys)
+  (if (directory-exists? "pkg-config")
+      (with-directory-excursion "pkg-config"
+        (for-each
+         (match-lambda
+           ((output file)
+            (substitute* file
+              (("^cudaroot=.*")
+               (string-append "cudaroot=" output "\n"))
+              (("^libdir=.*")
+               (string-append "libdir=" output "/lib\n"))
+              (("^includedir=.*")
+               (string-append "includedir=" output "/include\n")))
+            (install-file file
+                          (string-append output "/share/pkg-config"))
+            (with-directory-excursion
+                (string-append output "/share/pkg-config")
+              (symlink (basename file)
+                       (string-append
+                        (string-take file (string-index file #\-)) ".pc")))))
+         (cartesian-product (map cdr outputs) (find-files "." "\\.pc"))))
+      (format #t "pkg-config directory doesn't exist, nothing to be done.~%")))
+
+(define %standard-phases
+  (modify-phases binary:%standard-phases
+    (replace 'patchelf binary:autopatchelf)
+    (add-after 'install 'install-static install-static-output)
+    (add-after 'install-static 'install-pkg-config-files
+      install-pkg-config-files)))
+
+(define* (cuda-build #:key inputs (phases %standard-phases)
+                       #:allow-other-keys #:rest args)
+  "Build the given package, applying all of PHASES in order."
+  (apply gnu:gnu-build #:inputs inputs #:phases phases args))
+
+;;; cuda-build-system.scm ends here
diff --git a/nonguix/build/utils.scm b/nonguix/build/utils.scm
index 4de2ac2..3cf2ad4 100644
--- a/nonguix/build/utils.scm
+++ b/nonguix/build/utils.scm
@@ -12,7 +12,8 @@
   #:export (64-bit?
             make-wrapper
             concatenate-files
-            build-paths-from-inputs))
+            build-paths-from-inputs
+            install-static-output))
 
 (define (64-bit? file)
   "Return true if ELF file is in 64-bit format, false otherwise.
@@ -97,3 +98,22 @@ contents:
   (call-with-output-file result
     (lambda (port)
       (for-each (cut dump <> port) files))))
+
+(define* (install-static-output #:key outputs #:allow-other-keys)
+  (let ((out (assoc-ref outputs "out"))
+        (static (assoc-ref outputs "static")))
+    (if static
+        (begin
+          (for-each
+           (lambda (file)
+             (if (eq? 'symlink (stat:type (lstat file)))
+                 (with-directory-excursion (string-append static "/lib")
+                   (symlink (basename (readlink file))
+                            (basename file)))
+                 (install-file file (string-append static "/lib")))
+             (delete-file file))
+           (find-files (string-append out "/lib") "\\.a$"))
+          (for-each
+           (cute install-file <> (string-append static "/include"))
+           (find-files (string-append out "/include"))))
+        (format #t "no static output, nothing to be done~%"))))