Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Feature] Created and documented a CUDA example #457

Merged
merged 9 commits into from
Nov 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@
import static org.bytedeco.cuda.global.nvrtc.*;
import static us.ihmc.perception.cuda.CUDATools.*;

/**
* This class is used to compile the CUDA code and retrieve kernels specified by the users.
*/
public class CUDAProgram implements AutoCloseable
{
private static final String[] DEFAULT_OPTIONS = {"-arch=" + getComputeVersion(), // Target fairly recent GPU architecture
Expand All @@ -27,6 +30,7 @@ public class CUDAProgram implements AutoCloseable

/**
* Construct a {@link CUDAProgram} with default compilation options
*
* @param programPath {@link Path} to the .cu file.
* @param headerPaths {@link Path}s to the header files included (with {@code #include}) in the .cu file.
*/
Expand All @@ -37,8 +41,9 @@ public CUDAProgram(Path programPath, Path... headerPaths)

/**
* Construct a {@link CUDAProgram} specifying the path to the .cu file, paths to the header files, and compilation options.
* @param programPath {@link Path} to the .cu file.
* @param headerPaths {@link Path}s to the header files included (with {@code #include}) in the .cu file.
*
* @param programPath {@link Path} to the .cu file.
* @param headerPaths {@link Path}s to the header files included (with {@code #include}) in the .cu file.
* @param compilationOptions List of compilation options
* (You can see the available options <a href="https://docs.nvidia.com/cuda/nvrtc/index.html#supported-compile-options">here</a>)
*/
Expand Down Expand Up @@ -72,16 +77,25 @@ public CUDAProgram(Path programPath, Path[] headerPaths, String... compilationOp
}
}

/**
 * Construct a {@link CUDAProgram} with default compilation options.
 * Delegates to the four-argument constructor with no header names or header contents.
 *
 * @param programName The user-friendly name of the program.
 *                    There is no relation with this name to the kernel file; however, it's recommended to use the file name to avoid confusion
 * @param programCode The source code (i.e., the contents of the .cu file)
 */
public CUDAProgram(String programName, String programCode)
{
this(programName, programCode, null, null);
}

/**
* Construct a {@link CUDAProgram} with default compilation options.
* @param programName The name of the program (usually the file name with a .cu extension).
* @param programCode The program code (i.e. the contents of the .cu file).
* @param headerNames List of header names included (with {@code #include}) in the code.
*
* @param programName The user-friendly name of the program.
* There is no relation with this name to the kernel file; however, it's recommended to use the file name to avoid confusion
* @param programCode The program code (i.e., the contents of the .cu file).
* @param headerNames List of header names included (with {@code #include}) in the code.
* @param headerContents Contents of the headers included in the code.
*/
public CUDAProgram(String programName, String programCode, String[] headerNames, String[] headerContents)
Expand All @@ -91,10 +105,12 @@ public CUDAProgram(String programName, String programCode, String[] headerNames,

/**
* Construct a {@link CUDAProgram} specifying the name, code, header names, header contents, and compilation options.
* @param programName The name of the program (usually the file name with a .cu extension).
* @param programCode The program code (i.e. the contents of the .cu file).
* @param headerNames List of header names included (with {@code #include}) in the code.
* @param headerContents Contents of the headers included in the code.
*
* @param programName The user-friendly name of the program.
* There is no relation with this name to the kernel file; however, it's recommended to use the file name to avoid confusion
* @param programCode The program code (i.e., the contents of the .cu file).
* @param headerNames List of header names included (with {@code #include}) in the code.
* @param headerContents Contents of the headers included in the code.
* @param compilationOptions List of compilation options
* (You can see the available options <a href="https://docs.nvidia.com/cuda/nvrtc/index.html#supported-compile-options">here</a>)
*/
Expand Down Expand Up @@ -149,13 +165,14 @@ private void initialize(String programName, String programCode, String[] headerN

/**
* Compiles a CUDA program to an {@code _nvrtcProgram}.
* @param programName [IN] The name of the program (usually the file name with a .cu extension).
* @param programCode [IN] The program code (i.e. the contents of the .cu file).
* @param headerNames [IN] List of header names included (with {@code #include}) in the code.
* @param headerContents [IN] Contents of the headers included in the code.
*
* @param programName [IN] The name of the program (usually the file name with a .cu extension).
* @param programCode [IN] The program code (i.e. the contents of the .cu file).
* @param headerNames [IN] List of header names included (with {@code #include}) in the code.
* @param headerContents [IN] Contents of the headers included in the code.
* @param compilationOptions [IN] List of compilation options
* (You can see the available options <a href="https://docs.nvidia.com/cuda/nvrtc/index.html#supported-compile-options">here</a>)
* @param compiledProgram [OUT] The compiled program.
* @param compiledProgram [OUT] The compiled program.
*/
private static void compileProgram(String programName,
String programCode,
Expand Down Expand Up @@ -192,9 +209,10 @@ private static void compileProgram(String programName,

/**
* Retrieves and logs the compilation log of the compiled {@code program}.
* @param program The compiled program.
*
* @param program The compiled program.
* @param programName Name of the compiled program.
* @param logLevel Level at which to output the log.
* @param logLevel Level at which to output the log.
*/
private static void printProgramLog(_nvrtcProgram program, String programName, Level logLevel)
{
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
package us.ihmc.perception.cuda;

import org.bytedeco.cuda.cudart.CUstream_st;
import org.bytedeco.cuda.cudart.dim3;
import org.bytedeco.cuda.global.cudart;
import org.bytedeco.javacpp.FloatPointer;
import us.ihmc.log.LogTools;

import java.util.Arrays;

import static org.bytedeco.cuda.global.cudart.*;

/**
* This is a simple example of a kernel that adds two arrays together. The ways in which kernels can be run vary such that you can create all the JavaCPP Pointers
* inside a try-with-resources. The user can create more threads on the GPU. This example attempted to keep things really simple and avoided most of that overhead.
*/
public class ExampleCUDAKernel
{
private static final String KERNEL_TO_ADD_THE_VALUES_OF_TWO_ARRAYS = """
extern "C"

__global__
void add_arrays(int n, float *x, float *y)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = index; i < n; i += stride)
y[i] = x[i] + y[i];
}
""";

// Even if you don't create a stream, CUDA will create a default one for you, so we make one ourselves to handle things better
ds58 marked this conversation as resolved.
Show resolved Hide resolved
public ExampleCUDAKernel()
{
// Note this name does NOT have to match the name of the program, however for readability its ideal to have them match.
String userFriendlyNameOfProgram = "userFriendlyNameOfProgram";
// Note this name DOES have to match the name of the CUDA kernel you want to run
String kernelName = "add_arrays";

// We create a stream to synchronize the method calls that happen on the GPU.
CUstream_st stream = new CUstream_st();
ds58 marked this conversation as resolved.
Show resolved Hide resolved
// Allocates the memory for the stream and makes sure the GPU knows about the stream
cudart.cudaStreamCreate(stream);

// The CUDAProgram is going to hold the kernel code
CUDAProgram program = new CUDAProgram(userFriendlyNameOfProgram, KERNEL_TO_ADD_THE_VALUES_OF_TWO_ARRAYS);
CUDAKernel kernel = program.loadKernel(kernelName);

// Primitive types can be passed directly into the kernel class.
// However, because we use it in different places, we have made this a variable
int arraySize = 5;

// Allocating memory for an array and populating it with values in the constructor.
// The values stored in these variables will be passed to the kernel
FloatPointer cpuArrayX = new FloatPointer(1.0f, 2.0f, 3.0f, 4.0f, 5.0f);
FloatPointer cpuArrayY = new FloatPointer(5.0f, 4.0f, 3.0f, 2.0f, 1.0f);

// These will be pointers to the gpu memory, where we will upload the data too.
FloatPointer gpuArrayX = new FloatPointer();
FloatPointer gpuArrayY = new FloatPointer();

// Allocate memory on the gpu, to allocate the right size we need to get the sizeof the datatype being passed to the gpu
cudaMallocAsync(gpuArrayX, (long) gpuArrayX.sizeof() * arraySize, stream);
cudaMallocAsync(gpuArrayY, (long) gpuArrayY.sizeof() * arraySize, stream);

// This variable is specific to CUDA, docs can be found online
// Feel free to try this link as well: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html
int cudaDefaultValue = cudaMemcpyDefault;

// Copy the cpu data into the gpu
// (cpuArrayX.sizeof() * arraySize) in this case saying (byteSizeOfFloat * numberOfFloats)
cudaMemcpyAsync(gpuArrayX, cpuArrayX, (long) cpuArrayX.sizeof() * arraySize, cudaDefaultValue, stream);
cudaMemcpyAsync(gpuArrayY, cpuArrayY, (long) cpuArrayY.sizeof() * arraySize, cudaDefaultValue, stream);

// Now we are ready to run the kernel, we need to pass in the correct parameters
// The method call of the kernel looks like this: (void add_arrays(int n, float *x, float *y)) so it needs an (int, float pointer, float pointer)
// This runs on the GPU, so we need to pass in the data that is stored on the GPU (except for primitive types)
kernel.withInt(arraySize).withPointer(gpuArrayX).withPointer(gpuArrayY);

// In this example, our array's have 5 values, so if we wanted to run on 5 threads for the blockSize we would do: (new dim3(5,1,1))
kernel.run(stream, new dim3(), new dim3(), 0);

// At this point the kernel may have run or is running on the GPU, when it finishes we need to copy the result back to the CPU
// The kernel packs the result in the y array (y[i] = x[i] + y[i];) so we want to get those values on the CPU
// We are asking the GPU do copy the data to the CPU when its ready, there isn't a guarantee that this happens now
cudaMemcpyAsync(cpuArrayY, gpuArrayY, (long) gpuArrayY.sizeof() * arraySize, cudaDefaultValue, stream);

// Synchronize the stream
// This call waits until all asynchronous functions being executed on this stream finish.
// We have to call this to ensure that the above memcpy finished, and we have data back in Java land
cudaStreamSynchronize(stream);

// Free the memory on the GPU now that we are done with it. The data in on the CPU so we don't need it anymore
cudaFreeAsync(gpuArrayX, stream);
cudaFreeAsync(gpuArrayY, stream);

// Copy array Y to Java land, so we can get the data
float[] javaArrayY = new float[arraySize];
cpuArrayY.get(javaArrayY);
LogTools.info("Results: {}", javaArrayY);

// Since we didn't create these pointers in a try-with-resources, we have to close everything correctly
program.close();
kernel.close();

cpuArrayX.close();
cpuArrayY.close();
gpuArrayX.close();
gpuArrayY.close();

// At the end we have to destroy the stream to release the memory
cudart.cudaStreamDestroy(stream);
stream.close();
}

public static void main(String[] args)
{
new ExampleCUDAKernel();
}
}
Loading