;;; SPDX-License-Identifier: GPL-3.0-or-later
;;; Copyright © 2024 Nicolas Graves
;;;
;;; This module provides CUDA-enabled variants of machine-learning
;;; packages (Gloo, TensorPipe, PyTorch).  Each variant reuses the source
;;; and, where possible, the inputs/arguments of its free counterpart
;;; from (gnu packages machine-learning), adding the NVIDIA toolchain
;;; pieces from (nongnu packages nvidia).

(define-module (nongnu packages machine-learning)
  #:use-module ((guix licenses) #:prefix license:)
  #:use-module (guix gexp)
  #:use-module (guix packages)
  #:use-module (guix utils)
  #:use-module (guix build-system cmake)
  #:use-module (guix build-system copy)
  #:use-module (guix build-system gnu)
  #:use-module (guix build-system python)
  #:use-module (guix git-download)
  #:use-module (gnu packages)
  #:use-module (gnu packages check)
  #:use-module (gnu packages cpp)
  #:use-module (gnu packages libevent)
  #:use-module (gnu packages machine-learning)
  #:use-module (gnu packages pkg-config)
  #:use-module (gnu packages python-xyz)
  #:use-module (gnu packages serialization)
  #:use-module (nongnu packages nvidia)
  #:use-module (ice-9 match))

;; CUDA-enabled variant of Gloo: same source as upstream, but extends the
;; inputs of the free `gloo' package with cuda-toolkit and nvidia-nccl and
;; enables CUDA at configure time.
(define-public gloo-cuda
  (let ((version "0.0.0") ; no proper version tag
        (commit "e6d509b527712a143996f2f59a10480efa804f8b")
        (revision "2"))
    (package
      (name "gloo-cuda")
      (version (git-version version revision commit))
      (source
       (origin
         (method git-fetch)
         (uri (git-reference
               (url "https://github.com/facebookincubator/gloo")
               (commit commit)))
         (file-name (git-file-name name version))
         (sha256
          (base32
           "11ywsn1lrd1cpzm1iwvin2c5l962zib5bd852vl54bp12a0w6klj"))))
      (build-system cmake-build-system)
      (native-inputs (list googletest))
      (inputs
       ;; Inherit gloo's inputs and add the NVIDIA bits.
       (modify-inputs (package-inputs gloo)
         (append cuda-toolkit nvidia-nccl)))
      (arguments
       ;; Inherit gloo's build arguments, only prepending the CUDA switch
       ;; to its #:configure-flags.
       (substitute-keyword-arguments (package-arguments gloo)
         ((#:configure-flags flags ''())
          #~(cons "-DUSE_CUDA=ON" #$flags))))
      (synopsis "Collective communications library")
      (description "Gloo is a collective communications library. It comes with a number of collective algorithms useful for machine learning applications. These include a barrier, broadcast, and allreduce.
Note: This package provides NVIDIA GPU support.")
      (home-page "https://github.com/facebookincubator/gloo")
      (license license:bsd-3))))

;; PyTorch release packaged by python-pytorch-cuda below.
(define %python-pytorch-version "2.4.0")

;; Source for python-pytorch-cuda.  Mirrors the cleanup done for the free
;; python-pytorch package (unbundling third-party code, deleting generated
;; files so they are regenerated at build time), except that the CUDA
;; kernel sources are kept — see the commented-out deletions at the end of
;; the snippet.
(define %python-pytorch-src
  (origin
    (method git-fetch)
    (uri (git-reference
          (url "https://github.com/pytorch/pytorch")
          (commit (string-append "v" %python-pytorch-version))))
    (file-name (git-file-name "python-pytorch" %python-pytorch-version))
    (sha256
     (base32
      "18hdhzr12brj0b7ppyiscax0dbra30207qx0cckw78midfkcn7cn"))
    (patches (search-patches "python-pytorch-system-libraries.patch"
                             "python-pytorch-runpath.patch"
                             "python-pytorch-without-kineto.patch"
                             ;; Some autogeneration scripts depend on the
                             ;; compiled PyTorch library.  Therefore, we
                             ;; create dummy versions which are regenerated
                             ;; later.
                             "python-pytorch-fix-codegen.patch"))
    (modules '((guix build utils)))
    (snippet
     '(begin
        ;; Bundled or unused code
        (for-each
         (lambda (dir)
           (when (file-exists? dir)
             (delete-file-recursively dir)))
         '("android"
           ;; "aten/src/ATen/native/cuda/cutlass_extensions"
           "aten/src/ATen/native/quantized/cpu/qnnpack"
           "caffe2/mobile/contrib/libopencl-stub"
           "caffe2/mobile/contrib/libvulkan-stub"
           "third_party"))
        ;; Autogenerated files
        (for-each
         delete-file
         '("aten/src/ATen/nnapi/nnapi_wrapper.cpp"
           "aten/src/ATen/nnapi/nnapi_wrapper.h"
           ;; These files contain just lists of floating point values and
           ;; might be as well hand-written.
           ;; "test/cpp/api/init_baseline.h"
           ;; "test/cpp/api/optim_baseline.h"
           "test/mobile/test_upgrader_bytecode_table_example.cpp"
           "torch/csrc/jit/mobile/upgrader_mobile.cpp"
           "torch/csrc/jit/runtime/decomposition_registry_util.cpp"
           "torch/csrc/jit/runtime/serialized_shape_function_registry.cpp"
           "torch/csrc/jit/tensorexpr/external_functions_codegen.cpp"
           "torch/csrc/jit/serialization/mobile_bytecode_generated.h"))
        (delete-file-recursively ".github")
        ;; These files are needed for CUDA.
        ;; (for-each
        ;;  (lambda (dir)
        ;;    (for-each
        ;;     delete-file
        ;;     (find-files dir "\\.cu$")))
        ;;  '("aten/src/ATen/native/transformers/cuda/flash_attn/kernels"
        ;;    "aten/src/ATen/native/transformers/cuda/mem_eff_attention/kernels"))
        ))))

;; CUDA-enabled PyTorch.  Inherits the build arguments of the free
;; python-pytorch package and adds two phases: one patching the CMake
;; files for the unbundled NVIDIA libraries, and one exporting the
;; environment variables that switch the build over to CUDA.
(define-public python-pytorch-cuda
  (package
    (name "python-pytorch-cuda")
    (version %python-pytorch-version)
    (source %python-pytorch-src)
    (build-system python-build-system)
    (arguments
     (substitute-keyword-arguments (package-arguments python-pytorch)
       ((#:phases phases)
        #~(modify-phases #$phases
            (add-after 'cmake-patches 'cuda-cmake-patches
              (lambda _
                ;; XXX: Currently nvidia-cudnn-frontend doesn't install CMake
                ;; configuration files, we must add unbundled nlohmann-json.
                ;; Additionally, it won't work without CUDNN_INCLUDE_DIR.
                (substitute* "cmake/Dependencies.cmake"
                  (("set\\(CUDNN_FRONTEND_INCLUDE_DIR.*$")
                   (format #f "set(CUDNN_FRONTEND_INCLUDE_DIR ~a/include) target_include_directories(torch::cudnn INTERFACE ${CUDNN_INCLUDE_DIR} ${~a/include} )~%"
                           #$(this-package-input "nvidia-cudnn-frontend")
                           #$(this-package-input "nlohmann-json"))))
                ;; XXX: Link the right include dir for cutlass.
                (substitute* "aten/src/ATen/CMakeLists.txt"
                  (("\
\\$\\{CMAKE_CURRENT_SOURCE_DIR\\}/\\.\\./\\.\\./\\.\\./third_party/cutlass")
                   #$(this-package-input "nvidia-cutlass")))
                ;; XXX: Not linking gtest+gtest_main breaks compilation
                (substitute* '("c10/cuda/test/CMakeLists.txt"
                               "caffe2/CMakeLists.txt")
                  (("target_link_libraries\\((.* gtest_main)\\)" all content)
                   (format #f "target_link_libraries(~a gtest)" content)))))
            (add-after 'use-system-libraries 'use-cuda-libraries
              (lambda _
                ;; Environment variables consumed by PyTorch's build system
                ;; to locate and enable the CUDA/cuDNN stack.
                (setenv "USE_CUDA" "1")
                (setenv "CUDA_HOME" #$(this-package-input "cuda-dev"))
                (setenv "CUDA_TOOLKIT_ROOT_DIR"
                        #$(this-package-input "cuda-dev"))
                (setenv "CUDA_USE_STATIC_CUDA_RUNTIME" "0")
                (setenv "CUDA_PROPAGATE_HOST_FLAGS" "0")
                (setenv "CUSPARSELT_LIBRARY"
                        #$(file-append (this-package-input "cuda-dev")
                                       "/lib"))
                (setenv "CUSPARSELT_INCLUDE_DIR"
                        #$(file-append (this-package-input "cuda-dev")
                                       "/include"))
                (setenv "USE_CUDNN" "1")
                (setenv "CUDNN_LIB_DIR"
                        #$(file-append (this-package-input "nvidia-cudnn")
                                       "/lib"))
                (setenv "CUDNN_INCLUDE_DIR"
                        #$(file-append (this-package-input "nvidia-cudnn")
                                       "/include"))
                ;; XXX: 3.5, 5.0 and 9.0a break tests compilation
                ;; See https://github.com/pytorch/pytorch/issues/113948
                (setenv "TORCH_CUDA_ARCH_LIST" "8.0 8.6 8.9 9.0")
                ;; XXX: Current cutlass package doesn't have necessary
                ;; headers to enable this option.
                (setenv "USE_ROCM" "0")))))))
    (native-inputs (package-native-inputs python-pytorch))
    (inputs
     ;; Swap the CPU-only tensorpipe/gloo for the CUDA variants defined in
     ;; this module, and add the NVIDIA libraries referenced by the phases
     ;; above (labels must match the `this-package-input' calls).
     (modify-inputs (package-inputs python-pytorch)
       (replace "tensorpipe" tensorpipe-cuda)
       (replace "gloo" gloo-cuda)
       (append nvidia-cudnn
               nvidia-cudnn-frontend
               cuda-dev
               nlohmann-json
               nvidia-cutlass
               nvidia-nccl)))
    (propagated-inputs (package-propagated-inputs python-pytorch))
    (home-page "https://pytorch.org/")
    (synopsis "Python library for tensor computation and deep neural networks")
    (description "PyTorch is a Python package that provides two high-level features: @itemize @item tensor computation (like NumPy) with strong GPU acceleration; @item deep neural networks (DNNs) built on a tape-based autograd system. @end itemize You can reuse Python packages such as NumPy, SciPy, and Cython to extend PyTorch when needed. Note: This package provides NVIDIA GPU support.")
    (license license:bsd-3)))

;; CUDA-enabled TensorPipe.  Reuses the free tensorpipe package's source
;; and version verbatim, enabling CUDA support via -DTP_USE_CUDA=1.
(define-public tensorpipe-cuda
  (package
    (name "tensorpipe-cuda")
    (version (package-version tensorpipe))
    (source (package-source tensorpipe))
    (build-system cmake-build-system)
    (arguments
     (list #:configure-flags
           ''("-DBUILD_SHARED_LIBS=ON" "-DTP_USE_CUDA=1")
           ;; There are no tests
           #:tests? #f))
    (inputs (list cuda-nvml-dev cuda-toolkit libuv))
    (native-inputs (list googletest pkg-config pybind11 libnop))
    (home-page "https://github.com/pytorch/tensorpipe")
    (synopsis "Tensor-aware point-to-point communication primitive for machine learning")
    (description "TensorPipe provides a tensor-aware channel to transfer rich objects from one process to another while using the fastest transport for the tensors contained therein. Note: This version includes NVIDIA CUDA API and headers.")
    (license license:bsd-3)))