Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add MPS support with fused kernels #76

Merged
merged 20 commits into from
Apr 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# MacOS
.DS_Store

# build
build/
.idea/
.vscode/
.vscode/
42 changes: 41 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@ cmake_minimum_required(VERSION 3.21)
project(opensplat)

set(OPENSPLAT_BUILD_SIMPLE_TRAINER OFF CACHE BOOL "Build simple trainer applications")
set(GPU_RUNTIME "CUDA" CACHE STRING "HIP or CUDA")
set(GPU_RUNTIME "CUDA" CACHE STRING "HIP or CUDA or MPS")
set(OPENCV_DIR "OPENCV_DIR-NOTFOUND" CACHE PATH "Path to the OPENCV installation directory")
set(OPENSPLAT_MAX_CUDA_COMPATIBILITY OFF CACHE BOOL "Build for maximum CUDA device compatibility")
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})

if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel." FORCE)
Expand Down Expand Up @@ -81,6 +82,16 @@ elseif(GPU_RUNTIME STREQUAL "HIP")
set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation")
endif()
list(APPEND CMAKE_PREFIX_PATH "${ROCM_ROOT}")
elseif(GPU_RUNTIME STREQUAL "MPS")
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
find_library(METAL_FRAMEWORK Metal REQUIRED)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
message(STATUS "Metal framework found")

set(XC_FLAGS -O3)
set(USE_MPS ON CACHE BOOL "Use MPS for GPU acceleration")
else()
set(GPU_RUNTIME "CPU")
endif()

set(CMAKE_CXX_STANDARD 17)
Expand Down Expand Up @@ -119,6 +130,31 @@ if((GPU_RUNTIME STREQUAL "CUDA") OR (GPU_RUNTIME STREQUAL "HIP"))
${TORCH_INCLUDE_DIRS}
)
set_target_properties(gsplat PROPERTIES LINKER_LANGUAGE CXX)
elseif(GPU_RUNTIME STREQUAL "MPS")
add_library(gsplat vendor/gsplat-metal/gsplat_metal.mm)
list(APPEND GSPLAT_LIBS gsplat)
target_link_libraries(gsplat PRIVATE
${FOUNDATION_LIBRARY}
${METAL_FRAMEWORK}
${METALKIT_FRAMEWORK}
)
target_include_directories(gsplat PRIVATE ${TORCH_INCLUDE_DIRS})
# Compile the Metal shader directly from the source tree into default.metallib,
# which is placed in the runtime output directory.
# The shader is NOT copied into the build directory first: the previous rule
# compiled a configure-time copy and then deleted it, so the rule removed its
# own input and could only be re-run after CMake had re-created that copy.
# Only the intermediate .air object is removed after linking the library.
add_custom_command(
    OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
    COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c ${CMAKE_CURRENT_SOURCE_DIR}/vendor/gsplat-metal/gsplat_metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/gsplat_metal.air
    COMMAND xcrun -sdk macosx metallib ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/gsplat_metal.air -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
    COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/gsplat_metal.air
    # Re-run whenever the shader source changes.
    DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/vendor/gsplat-metal/gsplat_metal.metal
    COMMENT "Compiling Metal kernels"
)

# Target that is part of the default build (ALL) and depends on
# default.metallib, so the custom command producing that file runs on a
# normal build.
add_custom_target(
gsplat_metal ALL
DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
)
endif()

add_library(gsplat_cpu vendor/gsplat-cpu/gsplat_cpu.cpp)
Expand All @@ -135,6 +171,8 @@ if(GPU_RUNTIME STREQUAL "HIP")
target_compile_definitions(opensplat PRIVATE USE_HIP __HIP_PLATFORM_AMD__)
elseif(GPU_RUNTIME STREQUAL "CUDA")
target_compile_definitions(opensplat PRIVATE USE_CUDA)
elseif(GPU_RUNTIME STREQUAL "MPS")
target_compile_definitions(opensplat PRIVATE USE_MPS)
endif()

if(OPENSPLAT_BUILD_SIMPLE_TRAINER)
Expand All @@ -149,6 +187,8 @@ if(OPENSPLAT_BUILD_SIMPLE_TRAINER)
target_compile_definitions(simple_trainer PRIVATE USE_HIP __HIP_PLATFORM_AMD__)
elseif(GPU_RUNTIME STREQUAL "CUDA")
target_compile_definitions(simple_trainer PRIVATE USE_CUDA)
elseif(GPU_RUNTIME STREQUAL "MPS")
target_compile_definitions(simple_trainer PRIVATE USE_MPS)
endif()
endif()

Expand Down
10 changes: 8 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,16 +121,23 @@ brew install opencv
brew install pytorch
```

You will also need to install Xcode and the Xcode command line tools to compile with Metal support (otherwise, OpenSplat will build with CPU support only):
1. Install Xcode from the Apple App Store.
2. Install the command line tools with `xcode-select --install`. If they are already installed, this command will not change anything.
3. If `xcode-select --print-path` prints `/Library/Developer/CommandLineTools`, then run `sudo xcode-select --switch /Applications/Xcode.app/Contents/Developer`.

Then run:

```
git clone https://github.com/pierotofy/OpenSplat OpenSplat
cd OpenSplat
mkdir build && cd build
cmake -DCMAKE_PREFIX_PATH=/path/to/libtorch/ .. && make -j$(nproc)
cmake -DCMAKE_PREFIX_PATH=/path/to/libtorch/ -DGPU_RUNTIME=MPS .. && make -j$(sysctl -n hw.logicalcpu)
./opensplat
```

If building CPU-only, remove `-DGPU_RUNTIME=MPS`.

:warning: You will probably get a *libc10.dylib can’t be opened because Apple cannot check it for malicious software* error on first run. Open **System Settings** and go to **Privacy & Security** and find the **Allow** button. You might need to repeat this several times until all torch libraries are loaded.

## Docker Build
Expand Down Expand Up @@ -234,7 +241,6 @@ We recently released OpenSplat, so there's lots of work to do.

* Support for running on AMD cards (more testing needed)
* Improve speed / reduce memory usage
* Add Metal support on macOS
* Distributed computation using multiple machines
* Real-time training viewer output
* Compressed scene outputs
Expand Down
4 changes: 4 additions & 0 deletions gsplat.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@
#include "vendor/gsplat/bindings.h"
#endif

#if defined(USE_MPS)
#include "vendor/gsplat-metal/bindings.h"
#endif

#include "vendor/gsplat-cpu/bindings.h"

#endif
6 changes: 3 additions & 3 deletions model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ torch::Tensor Model::forward(Camera& cam, int step){
cov2d = p[3];
camDepths = p[4];
}else{
#if defined(USE_HIP) || defined(USE_CUDA)
#if defined(USE_HIP) || defined(USE_CUDA) || defined(USE_MPS)

TileBounds tileBounds = std::make_tuple((width + BLOCK_X - 1) / BLOCK_X,
(height + BLOCK_Y - 1) / BLOCK_Y,
Expand Down Expand Up @@ -152,7 +152,7 @@ torch::Tensor Model::forward(Camera& cam, int step){
if (device == torch::kCPU){
rgbs = SphericalHarmonicsCPU::apply(degreesToUse, viewDirs, colors);
}else{
#if defined(USE_HIP) || defined(USE_CUDA)
#if defined(USE_HIP) || defined(USE_CUDA) || defined(USE_MPS)
rgbs = SphericalHarmonics::apply(degreesToUse, viewDirs, colors);
#endif
}
Expand All @@ -172,7 +172,7 @@ torch::Tensor Model::forward(Camera& cam, int step){
width,
backgroundColor);
}else{
#if defined(USE_HIP) || defined(USE_CUDA)
#if defined(USE_HIP) || defined(USE_CUDA) || defined(USE_MPS)
rgb = RasterizeGaussians::apply(
xys,
depths,
Expand Down
5 changes: 4 additions & 1 deletion opensplat.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,10 +81,13 @@ int main(int argc, char *argv[]){
torch::Device device = torch::kCPU;
int displayStep = 1;

if (torch::cuda::is_available() && result.count("cpu") == 0) {
if (torch::hasCUDA() && result.count("cpu") == 0) {
std::cout << "Using CUDA" << std::endl;
device = torch::kCUDA;
displayStep = 10;
} else if (torch::hasMPS() && result.count("cpu") == 0) {
std::cout << "Using MPS" << std::endl;
device = torch::kMPS;
}else{
std::cout << "Using CPU" << std::endl;
}
Expand Down
2 changes: 1 addition & 1 deletion project_gaussians.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#include "project_gaussians.hpp"

#if defined(USE_HIP) || defined(USE_CUDA)
#if defined(USE_HIP) || defined(USE_CUDA) || defined(USE_MPS)

variable_list ProjectGaussians::forward(AutogradContext *ctx,
torch::Tensor means,
Expand Down
2 changes: 1 addition & 1 deletion project_gaussians.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

using namespace torch::autograd;

#if defined(USE_HIP) || defined(USE_CUDA)
#if defined(USE_HIP) || defined(USE_CUDA) || defined(USE_MPS)

class ProjectGaussians : public Function<ProjectGaussians>{
public:
Expand Down
4 changes: 2 additions & 2 deletions rasterize_gaussians.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#include "rasterize_gaussians.hpp"
#include "gsplat.hpp"

#if defined(USE_HIP) || defined(USE_CUDA)
#if defined(USE_HIP) || defined(USE_CUDA) || defined(USE_MPS)

std::tuple<torch::Tensor,
torch::Tensor,
Expand Down Expand Up @@ -171,9 +171,9 @@ torch::Tensor RasterizeGaussiansCPU::forward(AutogradContext *ctx,
torch::Tensor finalTs = std::get<1>(t);
std::vector<int32_t> *px2gid = std::get<2>(t);

ctx->saved_data["px2gid"] = reinterpret_cast<int64_t>(px2gid);
ctx->saved_data["imgWidth"] = imgWidth;
ctx->saved_data["imgHeight"] = imgHeight;
ctx->saved_data["px2gid"] = reinterpret_cast<int64_t>(px2gid);
ctx->save_for_backward({ xys, conics, colors, opacity, background, cov2d, camDepths, finalTs });

return outImg;
Expand Down
2 changes: 1 addition & 1 deletion rasterize_gaussians.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

using namespace torch::autograd;

#if defined(USE_HIP) || defined(USE_CUDA)
#if defined(USE_HIP) || defined(USE_CUDA) || defined(USE_MPS)

std::tuple<torch::Tensor,
torch::Tensor,
Expand Down
5 changes: 4 additions & 1 deletion simple_trainer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ int main(int argc, char **argv){
if (torch::cuda::is_available() && result.count("cpu") == 0){
std::cout << "Using CUDA" << std::endl;
device = torch::kCUDA;
}else if(torch::mps::is_available() && result.count("cpu") == 0){
std::cout << "Using MPS" << std::endl;
device = torch::kMPS;
}else{
std::cout << "Using CPU" << std::endl;
}
Expand Down Expand Up @@ -160,7 +163,7 @@ int main(int argc, char **argv){
width,
background);
}else{
#if defined(USE_HIP) || defined(USE_CUDA)
#if defined(USE_HIP) || defined(USE_CUDA) || defined(USE_MPS)
auto p = ProjectGaussians::apply(means, scales, 1,
quats, viewMat, viewMat,
focal, focal,
Expand Down
2 changes: 1 addition & 1 deletion spherical_harmonics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ torch::Tensor sh2rgb(const torch::Tensor &sh){
return (sh * C0) + 0.5;
}

#if defined(USE_HIP) || defined(USE_CUDA)
#if defined(USE_HIP) || defined(USE_CUDA) || defined(USE_MPS)

torch::Tensor SphericalHarmonics::forward(AutogradContext *ctx,
int degreesToUse,
Expand Down
2 changes: 1 addition & 1 deletion spherical_harmonics.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ int degFromSh(int numBases);
torch::Tensor rgb2sh(const torch::Tensor &rgb);
torch::Tensor sh2rgb(const torch::Tensor &sh);

#if defined(USE_HIP) || defined(USE_CUDA)
#if defined(USE_HIP) || defined(USE_CUDA) || defined(USE_MPS)

class SphericalHarmonics : public Function<SphericalHarmonics>{
public:
Expand Down
Loading
Loading