pierotofy · pierotofy · Apr 15, 2024 · Mar 24, 2024 · Apr 9, 2024 · Apr 9, 2024
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,7 @@
+# MacOS
+.DS_Store
+
+# build
 build/
 .idea/
-.vscode/
+.vscode/
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -2,9 +2,10 @@ cmake_minimum_required(VERSION 3.21)
 project(opensplat)
 
 set(OPENSPLAT_BUILD_SIMPLE_TRAINER OFF CACHE BOOL "Build simple trainer applications")
-set(GPU_RUNTIME "CUDA" CACHE STRING "HIP or CUDA")
+set(GPU_RUNTIME "CUDA" CACHE STRING "HIP or CUDA or MPS")
 set(OPENCV_DIR "OPENCV_DIR-NOTFOUND" CACHE PATH "Path to the OPENCV installation directory")
 set(OPENSPLAT_MAX_CUDA_COMPATIBILITY OFF CACHE BOOL "Build for maximum CUDA device compatibility")
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
 
 if(NOT CMAKE_BUILD_TYPE)
     set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel." FORCE)
@@ -81,6 +82,16 @@ elseif(GPU_RUNTIME STREQUAL "HIP")
         set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation")
     endif()
     list(APPEND CMAKE_PREFIX_PATH "${ROCM_ROOT}")
+elseif(GPU_RUNTIME STREQUAL "MPS")  # Apple Metal backend selected
+    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)  # REQUIRED: configuration stops with an error if not found
+    find_library(METAL_FRAMEWORK    Metal      REQUIRED)
+    find_library(METALKIT_FRAMEWORK MetalKit   REQUIRED)
+    message(STATUS "Metal framework found")  # only reached when all three lookups above succeeded
+
+    set(XC_FLAGS -O3)  # flags handed to `xcrun ... metal` when the shader is compiled
+    set(USE_MPS ON CACHE BOOL "Use MPS for GPU acceleration")  # NOTE(review): cached, so it stays ON if GPU_RUNTIME is changed later — confirm nothing reads it
+else()
+    set(GPU_RUNTIME "CPU")  # no earlier branch matched: this normal variable shadows the GPU_RUNTIME cache entry in this scope
 endif()
 
 set(CMAKE_CXX_STANDARD 17)
@@ -119,6 +130,31 @@ if((GPU_RUNTIME STREQUAL "CUDA") OR (GPU_RUNTIME STREQUAL "HIP"))
         ${TORCH_INCLUDE_DIRS}
     )
     set_target_properties(gsplat PROPERTIES LINKER_LANGUAGE CXX)
+elseif(GPU_RUNTIME STREQUAL "MPS")
+    add_library(gsplat vendor/gsplat-metal/gsplat_metal.mm)
+    list(APPEND GSPLAT_LIBS gsplat)
+    target_link_libraries(gsplat PRIVATE
+        ${FOUNDATION_LIBRARY}
+        ${METAL_FRAMEWORK}
+        ${METALKIT_FRAMEWORK}
+    )
+    target_include_directories(gsplat PRIVATE ${TORCH_INCLUDE_DIRS})
+    # Compile the shader directly from the source tree so the rule can re-run (e.g. after a clean) without relying on a configure-time copy.
+    set(GSPLAT_METAL_SRC "${CMAKE_CURRENT_SOURCE_DIR}/vendor/gsplat-metal/gsplat_metal.metal")
+    add_custom_command(
+        OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
+        COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c "${GSPLAT_METAL_SRC}" -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/gsplat_metal.air
+        COMMAND xcrun -sdk macosx metallib ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/gsplat_metal.air -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
+        COMMAND ${CMAKE_COMMAND} -E rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/gsplat_metal.air
+        DEPENDS "${GSPLAT_METAL_SRC}"
+        COMMENT "Compiling Metal kernels"
+        VERBATIM
+    )
+
+    add_custom_target(
+        gsplat_metal ALL
+        DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
+    )
 endif()
 
 add_library(gsplat_cpu vendor/gsplat-cpu/gsplat_cpu.cpp)
@@ -135,6 +171,8 @@ if(GPU_RUNTIME STREQUAL "HIP")
     target_compile_definitions(opensplat PRIVATE USE_HIP __HIP_PLATFORM_AMD__)
 elseif(GPU_RUNTIME STREQUAL "CUDA")
     target_compile_definitions(opensplat PRIVATE USE_CUDA)
+elseif(GPU_RUNTIME STREQUAL "MPS")
+    target_compile_definitions(opensplat PRIVATE USE_MPS)
 endif()
 
 if(OPENSPLAT_BUILD_SIMPLE_TRAINER)
@@ -149,6 +187,8 @@ if(OPENSPLAT_BUILD_SIMPLE_TRAINER)
         target_compile_definitions(simple_trainer PRIVATE USE_HIP __HIP_PLATFORM_AMD__)
     elseif(GPU_RUNTIME STREQUAL "CUDA")
         target_compile_definitions(simple_trainer PRIVATE USE_CUDA)
+    elseif(GPU_RUNTIME STREQUAL "MPS")
+        target_compile_definitions(simple_trainer PRIVATE USE_MPS)
     endif()
 endif()
 

diff --git a/README.md b/README.md
@@ -121,16 +121,23 @@ brew install opencv
 brew install pytorch
 ```
 
+You will also need to install Xcode and the Xcode command line tools to compile with Metal support (otherwise, OpenSplat will build with CPU support only):
+1. Install Xcode from the Apple App Store.
+2. Install the command line tools with `xcode-select --install`. This might do nothing on your machine.
+3. If `xcode-select --print-path` prints `/Library/Developer/CommandLineTools`, then run `sudo xcode-select --switch /Applications/Xcode.app/Contents/Developer`.
+
 Then run:
 
 ```
 git clone https://github.com/pierotofy/OpenSplat OpenSplat
 cd OpenSplat
 mkdir build && cd build
-cmake -DCMAKE_PREFIX_PATH=/path/to/libtorch/ .. && make -j$(nproc)
+cmake -DCMAKE_PREFIX_PATH=/path/to/libtorch/ -DGPU_RUNTIME=MPS .. && make -j$(sysctl -n hw.logicalcpu)
 ./opensplat
 ```
 
+If building CPU-only, remove `-DGPU_RUNTIME=MPS`.
+
 :warning: You will probably get a *libc10.dylib can’t be opened because Apple cannot check it for malicious software* error on first run. Open **System Settings** and go to **Privacy & Security** and find the **Allow** button. You might need to repeat this several times until all torch libraries are loaded.
 
 ## Docker Build
@@ -234,7 +241,6 @@ We recently released OpenSplat, so there's lots of work to do.
 
  * Support for running on AMD cards (more testing needed)
  * Improve speed / reduce memory usage
- * Add Metal support on macOS
  * Distributed computation using multiple machines
  * Real-time training viewer output
  * Compressed scene outputs

diff --git a/gsplat.hpp b/gsplat.hpp
@@ -7,6 +7,10 @@
 #include "vendor/gsplat/bindings.h"
 #endif
 
+#if defined(USE_MPS)
+#include "vendor/gsplat-metal/bindings.h"
+#endif
+
 #include "vendor/gsplat-cpu/bindings.h"
 
 #endif
diff --git a/model.cpp b/model.cpp
@@ -108,7 +108,7 @@ torch::Tensor Model::forward(Camera& cam, int step){
         cov2d = p[3];
         camDepths = p[4];
     }else{
-        #if defined(USE_HIP) || defined(USE_CUDA)
+        #if defined(USE_HIP) || defined(USE_CUDA) || defined(USE_MPS)
 
         TileBounds tileBounds = std::make_tuple((width + BLOCK_X - 1) / BLOCK_X,
                         (height + BLOCK_Y - 1) / BLOCK_Y,
@@ -152,7 +152,7 @@ torch::Tensor Model::forward(Camera& cam, int step){
     if (device == torch::kCPU){
         rgbs = SphericalHarmonicsCPU::apply(degreesToUse, viewDirs, colors);
     }else{
-        #if defined(USE_HIP) || defined(USE_CUDA)
+        #if defined(USE_HIP) || defined(USE_CUDA) || defined(USE_MPS)
         rgbs = SphericalHarmonics::apply(degreesToUse, viewDirs, colors);
         #endif
     }
@@ -172,7 +172,7 @@ torch::Tensor Model::forward(Camera& cam, int step){
                 width,
                 backgroundColor);
     }else{  
-        #if defined(USE_HIP) || defined(USE_CUDA)
+        #if defined(USE_HIP) || defined(USE_CUDA) || defined(USE_MPS)
         rgb = RasterizeGaussians::apply(
                 xys,
                 depths,

diff --git a/opensplat.cpp b/opensplat.cpp
@@ -81,10 +81,13 @@ int main(int argc, char *argv[]){
     torch::Device device = torch::kCPU;
     int displayStep = 1;
 
-    if (torch::cuda::is_available() && result.count("cpu") == 0) {
+    if (torch::hasCUDA() && result.count("cpu") == 0) {
         std::cout << "Using CUDA" << std::endl;
         device = torch::kCUDA;
         displayStep = 10;
+    } else if (torch::hasMPS() && result.count("cpu") == 0) {
+        std::cout << "Using MPS" << std::endl;
+        device = torch::kMPS;
     }else{
         std::cout << "Using CPU" << std::endl;
     }

diff --git a/project_gaussians.cpp b/project_gaussians.cpp
@@ -1,6 +1,6 @@
 #include "project_gaussians.hpp"
 
-#if defined(USE_HIP) || defined(USE_CUDA)
+#if defined(USE_HIP) || defined(USE_CUDA) || defined(USE_MPS)
 
 variable_list ProjectGaussians::forward(AutogradContext *ctx, 
                 torch::Tensor means,

diff --git a/project_gaussians.hpp b/project_gaussians.hpp
@@ -7,7 +7,7 @@
 
 using namespace torch::autograd;
 
-#if defined(USE_HIP) || defined(USE_CUDA)
+#if defined(USE_HIP) || defined(USE_CUDA) || defined(USE_MPS)
 
 class ProjectGaussians : public Function<ProjectGaussians>{
 public:

diff --git a/rasterize_gaussians.cpp b/rasterize_gaussians.cpp
@@ -1,7 +1,7 @@
 #include "rasterize_gaussians.hpp"
 #include "gsplat.hpp"
 
-#if defined(USE_HIP) || defined(USE_CUDA)
+#if defined(USE_HIP) || defined(USE_CUDA) || defined(USE_MPS)
 
 std::tuple<torch::Tensor,
         torch::Tensor,
@@ -171,9 +171,9 @@ torch::Tensor RasterizeGaussiansCPU::forward(AutogradContext *ctx,
     torch::Tensor finalTs = std::get<1>(t);
     std::vector<int32_t> *px2gid = std::get<2>(t);
 
+    ctx->saved_data["px2gid"] = reinterpret_cast<int64_t>(px2gid);
     ctx->saved_data["imgWidth"] = imgWidth;
     ctx->saved_data["imgHeight"] = imgHeight;
-    ctx->saved_data["px2gid"] = reinterpret_cast<int64_t>(px2gid);
     ctx->save_for_backward({ xys, conics, colors, opacity, background, cov2d, camDepths, finalTs });
 
     return outImg;

diff --git a/rasterize_gaussians.hpp b/rasterize_gaussians.hpp
@@ -6,7 +6,7 @@
 
 using namespace torch::autograd;
 
-#if defined(USE_HIP) || defined(USE_CUDA)
+#if defined(USE_HIP) || defined(USE_CUDA) || defined(USE_MPS)
 
 std::tuple<torch::Tensor,
         torch::Tensor,

diff --git a/simple_trainer.cpp b/simple_trainer.cpp
@@ -60,6 +60,9 @@ int main(int argc, char **argv){
     if (torch::cuda::is_available() && result.count("cpu") == 0){
         std::cout << "Using CUDA" << std::endl;
         device = torch::kCUDA;
+    }else if(torch::mps::is_available() && result.count("cpu") == 0){
+        std::cout << "Using MPS" << std::endl;
+        device = torch::kMPS;
     }else{
         std::cout << "Using CPU" << std::endl;
     }
@@ -160,7 +163,7 @@ int main(int argc, char **argv){
                 width,
                 background);
         }else{
-            #if defined(USE_HIP) || defined(USE_CUDA)
+            #if defined(USE_HIP) || defined(USE_CUDA) || defined(USE_MPS)
                 auto p = ProjectGaussians::apply(means, scales, 1, 
                                         quats, viewMat, viewMat,
                                         focal, focal,

diff --git a/spherical_harmonics.cpp b/spherical_harmonics.cpp
@@ -27,7 +27,7 @@ torch::Tensor sh2rgb(const torch::Tensor &sh){
     return (sh * C0) + 0.5;
 }
 
-#if defined(USE_HIP) || defined(USE_CUDA)
+#if defined(USE_HIP) || defined(USE_CUDA) || defined(USE_MPS)
 
 torch::Tensor SphericalHarmonics::forward(AutogradContext *ctx, 
             int degreesToUse, 

diff --git a/spherical_harmonics.hpp b/spherical_harmonics.hpp
@@ -10,7 +10,7 @@ int degFromSh(int numBases);
 torch::Tensor rgb2sh(const torch::Tensor &rgb);
 torch::Tensor sh2rgb(const torch::Tensor &sh);
 
-#if defined(USE_HIP) || defined(USE_CUDA)
+#if defined(USE_HIP) || defined(USE_CUDA) || defined(USE_MPS)
 
 class SphericalHarmonics : public Function<SphericalHarmonics>{
 public: