Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Feature] Created and documented a CUDA example #457

Merged
merged 9 commits into from
Nov 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@
import static org.bytedeco.cuda.global.nvrtc.*;
import static us.ihmc.perception.cuda.CUDATools.*;

/**
* This class is used to compile the CUDA code and retrieve kernels specified by the users.
*/
public class CUDAProgram implements AutoCloseable
{
private static final String[] DEFAULT_OPTIONS = {"-arch=" + getComputeVersion(), // Target fairly recent GPU architecture
Expand All @@ -27,6 +30,7 @@ public class CUDAProgram implements AutoCloseable

/**
* Construct a {@link CUDAProgram} with default compilation options
*
* @param programPath {@link Path} to the .cu file.
* @param headerPaths {@link Path}s to the header files included (with {@code #include}) in the .cu file.
*/
Expand All @@ -37,8 +41,9 @@ public CUDAProgram(Path programPath, Path... headerPaths)

/**
* Construct a {@link CUDAProgram} specifying the path to the .cu file, paths to the header files, and compilation options.
* @param programPath {@link Path} to the .cu file.
* @param headerPaths {@link Path}s to the header files included (with {@code #include}) in the .cu file.
*
* @param programPath {@link Path} to the .cu file.
* @param headerPaths {@link Path}s to the header files included (with {@code #include}) in the .cu file.
* @param compilationOptions List of compilation options
* (You can see the available options <a href="https://docs.nvidia.com/cuda/nvrtc/index.html#supported-compile-options">here</a>)
*/
Expand Down Expand Up @@ -72,16 +77,25 @@ public CUDAProgram(Path programPath, Path[] headerPaths, String... compilationOp
}
}

/**
 * Construct a {@link CUDAProgram} with default compilation options.
 * Delegates to the four-argument constructor with no header names or header contents.
 *
 * @param programName The user-friendly name of the program.
 *                    There is no relation with this name to the kernel file; however, it's recommended to use the file name to avoid confusion
 * @param programCode The source code (i.e., the contents of the .cu file)
 */
public CUDAProgram(String programName, String programCode)
{
this(programName, programCode, null, null);
}

/**
* Construct a {@link CUDAProgram} with default compilation options.
* @param programName The name of the program (usually the file name with a .cu extension).
* @param programCode The program code (i.e. the contents of the .cu file).
* @param headerNames List of header names included (with {@code #include}) in the code.
*
* @param programName The user-friendly name of the program.
* There is no relation with this name to the kernel file; however, it's recommended to use the file name to avoid confusion
* @param programCode The program code (i.e., the contents of the .cu file).
* @param headerNames List of header names included (with {@code #include}) in the code.
* @param headerContents Contents of the headers included in the code.
*/
public CUDAProgram(String programName, String programCode, String[] headerNames, String[] headerContents)
Expand All @@ -91,10 +105,12 @@ public CUDAProgram(String programName, String programCode, String[] headerNames,

/**
* Construct a {@link CUDAProgram} specifying the name, code, header names, header contents, and compilation options.
* @param programName The name of the program (usually the file name with a .cu extension).
* @param programCode The program code (i.e. the contents of the .cu file).
* @param headerNames List of header names included (with {@code #include}) in the code.
* @param headerContents Contents of the headers included in the code.
*
* @param programName The user-friendly name of the program.
* There is no relation with this name to the kernel file; however, it's recommended to use the file name to avoid confusion
* @param programCode The program code (i.e., the contents of the .cu file).
* @param headerNames List of header names included (with {@code #include}) in the code.
* @param headerContents Contents of the headers included in the code.
* @param compilationOptions List of compilation options
* (You can see the available options <a href="https://docs.nvidia.com/cuda/nvrtc/index.html#supported-compile-options">here</a>)
*/
Expand Down Expand Up @@ -149,13 +165,14 @@ private void initialize(String programName, String programCode, String[] headerN

/**
* Compiles a CUDA program to an {@code _nvrtcProgram}.
* @param programName [IN] The name of the program (usually the file name with a .cu extension).
* @param programCode [IN] The program code (i.e. the contents of the .cu file).
* @param headerNames [IN] List of header names included (with {@code #include}) in the code.
* @param headerContents [IN] Contents of the headers included in the code.
*
* @param programName [IN] The name of the program (usually the file name with a .cu extension).
* @param programCode [IN] The program code (i.e. the contents of the .cu file).
* @param headerNames [IN] List of header names included (with {@code #include}) in the code.
* @param headerContents [IN] Contents of the headers included in the code.
* @param compilationOptions [IN] List of compilation options
* (You can see the available options <a href="https://docs.nvidia.com/cuda/nvrtc/index.html#supported-compile-options">here</a>)
* @param compiledProgram [OUT] The compiled program.
* @param compiledProgram [OUT] The compiled program.
*/
private static void compileProgram(String programName,
String programCode,
Expand Down Expand Up @@ -192,9 +209,10 @@ private static void compileProgram(String programName,

/**
* Retrieves and logs the compilation log of the compiled {@code program}.
* @param program The compiled program.
*
* @param program The compiled program.
* @param programName Name of the compiled program.
* @param logLevel Level at which to output the log.
* @param logLevel Level at which to output the log.
*/
private static void printProgramLog(_nvrtcProgram program, String programName, Level logLevel)
{
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
package us.ihmc.perception.cuda;

import org.bytedeco.cuda.cudart.CUstream_st;
import org.bytedeco.cuda.cudart.dim3;
import org.bytedeco.cuda.global.cudart;
import org.bytedeco.javacpp.FloatPointer;
import us.ihmc.log.LogTools;

import java.util.Arrays;

import static org.bytedeco.cuda.global.cudart.*;

/**
* This is a simple example of a kernel that adds two arrays together. The ways in which kernels can be run vary such that you can create all the JavaCPP Pointers
* inside a try-with-resources. The user can create more threads on the GPU. This example attempted to keep things really simple and avoided most of that overhead.
*/
public class ExampleCUDAKernel
{
private static final String KERNEL_TO_ADD_THE_VALUES_OF_TWO_ARRAYS = """
extern "C"

__global__
void add_arrays(int n, float *x, float *y)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = index; i < n; i += stride)
y[i] = x[i] + y[i];
}
""";

// Even if you don't create a stream, CUDA will create a default one for you, so we make one ourselves to handle things better
ds58 marked this conversation as resolved.
Show resolved Hide resolved
public ExampleCUDAKernel()
{
// Note this name does NOT have to match the name of the program, however for readability its ideal to have them match.
String userFriendlyNameOfProgram = "userFriendlyNameOfProgram";
// Note this name DOES have to match the name of the CUDA kernel you want to run
String kernelName = "add_arrays";

// We create a stream to synchronize the method calls that happen on the GPU.
CUstream_st stream = new CUstream_st();
ds58 marked this conversation as resolved.
Show resolved Hide resolved
// Allocates the memory for the stream and makes sure the GPU knows about the stream
cudart.cudaStreamCreate(stream);

// The CUDAProgram is going to hold the kernel code
CUDAProgram program = new CUDAProgram(userFriendlyNameOfProgram, KERNEL_TO_ADD_THE_VALUES_OF_TWO_ARRAYS);
CUDAKernel kernel = program.loadKernel(kernelName);

// Primitive types can be passed directly into the kernel class.
// However, because we use it in different places, we have made this a variable
int arraySize = 5;

// Allocating memory for an array and populating it with values in the constructor.
// The values stored in these variables will be passed to the kernel
FloatPointer cpuArrayX = new FloatPointer(1.0f, 2.0f, 3.0f, 4.0f, 5.0f);
FloatPointer cpuArrayY = new FloatPointer(5.0f, 4.0f, 3.0f, 2.0f, 1.0f);

// These will be pointers to the gpu memory, where we will upload the data too.
FloatPointer gpuArrayX = new FloatPointer();
FloatPointer gpuArrayY = new FloatPointer();

// Allocate memory on the gpu, to allocate the right size we need to get the sizeof the datatype being passed to the gpu
cudaMallocAsync(gpuArrayX, (long) gpuArrayX.sizeof() * arraySize, stream);
cudaMallocAsync(gpuArrayY, (long) gpuArrayY.sizeof() * arraySize, stream);

// This variable is specific to CUDA, docs can be found online
// Feel free to try this link as well: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html
int cudaDefaultValue = cudaMemcpyDefault;

// Copy the cpu data into the gpu
// (cpuArrayX.sizeof() * arraySize) in this case saying (byteSizeOfFloat * numberOfFloats)
cudaMemcpyAsync(gpuArrayX, cpuArrayX, (long) cpuArrayX.sizeof() * arraySize, cudaDefaultValue, stream);
cudaMemcpyAsync(gpuArrayY, cpuArrayY, (long) cpuArrayY.sizeof() * arraySize, cudaDefaultValue, stream);

// Now we are ready to run the kernel, we need to pass in the correct parameters
// The method call of the kernel looks like this: (void add_arrays(int n, float *x, float *y)) so it needs an (int, float pointer, float pointer)
// This runs on the GPU, so we need to pass in the data that is stored on the GPU (except for primitive types)
kernel.withInt(arraySize).withPointer(gpuArrayX).withPointer(gpuArrayY);

// In this example, our array's have 5 values, so if we wanted to run on 5 threads for the blockSize we would do: (new dim3(5,1,1))
kernel.run(stream, new dim3(), new dim3(), 0);

// At this point the kernel may have run or is running on the GPU, when it finishes we need to copy the result back to the CPU
// The kernel packs the result in the y array (y[i] = x[i] + y[i];) so we want to get those values on the CPU
// We are asking the GPU do copy the data to the CPU when its ready, there isn't a guarantee that this happens now
cudaMemcpyAsync(cpuArrayY, gpuArrayY, (long) gpuArrayY.sizeof() * arraySize, cudaDefaultValue, stream);

// Synchronize the stream
// This call waits until all asynchronous functions being executed on this stream finish.
// We have to call this to ensure that the above memcpy finished, and we have data back in Java land
cudaStreamSynchronize(stream);

// Free the memory on the GPU now that we are done with it. The data in on the CPU so we don't need it anymore
cudaFreeAsync(gpuArrayX, stream);
cudaFreeAsync(gpuArrayY, stream);

// Copy array Y to Java land, so we can get the data
float[] javaArrayY = new float[arraySize];
cpuArrayY.get(javaArrayY);
LogTools.info("Results: {}", javaArrayY);

// Since we didn't create these pointers in a try-with-resources, we have to close everything correctly
program.close();
kernel.close();

cpuArrayX.close();
cpuArrayY.close();
gpuArrayX.close();
gpuArrayY.close();

// At the end we have to destroy the stream to release the memory
cudart.cudaStreamDestroy(stream);
stream.close();
}

public static void main(String[] args)
{
new ExampleCUDAKernel();
}
}
Loading