[Feature] Updated with voxel observation mode. Added observation mode config support #411

Open · wants to merge 15 commits into main
1 change: 1 addition & 0 deletions .gitignore
@@ -6,6 +6,7 @@ _ext
build/
dist/
*.egg-info/
demos/

# ipython/jupyter notebooks
# *.ipynb
102 changes: 101 additions & 1 deletion docs/source/user_guide/concepts/observation.md
@@ -93,6 +93,106 @@ For a quick demo to visualize pointclouds, you can run
python -m mani_skill.examples.demo_vis_pcd -e "PushCube-v1"
```

### voxel

This observation mode has the same data format as the [sensor_data mode](#sensor_data), except that all camera sensor data are removed and a new key, `voxel_grid`, is added.

#### Keyword arguments

To use this observation mode, a dictionary of observation config parameters must be passed via the `obs_mode_config` keyword argument at environment initialization (`gym.make()`). It contains the following keys (a complete example dictionary follows the list):

- `coord_bounds`: `[float, float, float, float, float, float]` of the form **[x_min, y_min, z_min, x_max, y_max, z_max]**, defining the metric volume to be voxelized.
- `voxel_size`: `int` defining the number of voxels along each side of the grid, assuming a cubic grid (i.e. the grid resolution).
- `device`: `torch.device` on which the voxelization takes place, e.g. **torch.device("cuda" if torch.cuda.is_available() else "cpu")**.
- `segmentation`: `bool` defining whether to estimate voxel segmentations from the point cloud segmentations. If `True`, `num_channels=11` (including one channel for the voxel segmentation), otherwise `num_channels=10`.
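
Putting these together, a complete config dictionary might look like the sketch below; the values mirror the defaults used by the tabletop tasks in this PR, but they are only a starting point:

```python
import torch

voxel_obs_config = {
    # [x_min, y_min, z_min, x_max, y_max, z_max] metric bounds of the voxelized volume
    "coord_bounds": [-1.0, -1.0, -1.0, 2.0, 2.0, 2.0],
    # resolution of the cubic grid: 200 x 200 x 200 voxels
    "voxel_size": 200,
    # device on which voxelization runs
    "device": torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    # adds an 11th channel holding per-voxel segmentation IDs
    "segmentation": True,
}
```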

Some tabletop tasks with simple scenes, such as PushCube-v1, come with default voxel observation configs. For these tasks you can initialize the environment directly via:

```python
gym.make("PushCube-v1", obs_mode="voxel")
```

You can override the default configs by passing an `obs_mode_config` keyword argument following the dictionary format above:

```python
env = gym.make("PushCube-v1", obs_mode="voxel", obs_mode_config={"coord_bounds":..., "voxel_size":..., "device":..., "segmentation":...})
```

For tasks with non-trivial scenes, such as TwoRobotStackCube-v1, you will need one or more cameras with larger width and height to capture the complete scene. In this case, you must pass customized `obs_mode_config` keyword arguments when using the voxel observation mode instead of relying on the default voxel observation configs.
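
As a sketch of such an initialization (the camera resolution and config values here are illustrative assumptions, not tuned settings):

```python
import gymnasium as gym
import torch
import mani_skill.envs  # registers the ManiSkill environments

env = gym.make(
    "TwoRobotStackCube-v1",
    obs_mode="voxel",
    # larger sensor images so the cameras cover the full scene
    sensor_configs={"width": 512, "height": 512},
    obs_mode_config={
        "coord_bounds": [-1, -1, -1, 2, 2, 2],
        "voxel_size": 200,
        "device": torch.device("cuda" if torch.cuda.is_available() else "cpu"),
        "segmentation": True,
    },
)
```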


#### Voxel grid structure

Then, as you step through the environment you created and fetch observations, you will see the extra `voxel_grid` key containing the generated voxel grid:


- `voxel_grid`: `torch.Tensor` of shape **[N, voxel_size, voxel_size, voxel_size, num_channels]**. The voxel grid is generated by fusing the point cloud and RGB data from all cameras. `N` is the batch size, `voxel_size` is the grid resolution specified in the voxelization configs, and `num_channels` is the number of feature channels per voxel.

- The voxel features have at most 11 channels (a slicing sketch follows this list):
>- `voxel_grid[..., :3]`: average xyz coordinates of the points falling into the voxel
>- `voxel_grid[..., 3:6]`: average RGB of the points falling into the voxel
>- `voxel_grid[..., 6:9]`: voxel xyz indices
>- `voxel_grid[..., 9]`: if `segmentation=True`, the voxel segmentation ID, computed as the majority class among the points falling into the voxel; if `segmentation=False`, the occupancy value indicating whether any point falls into the voxel
>- `voxel_grid[..., 10]`: only present if `segmentation=True`, in which case it is the occupancy value and the grid has 11 channels in total; if `segmentation=False`, this channel does not exist and the grid has only 10 channels
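
As a quick sanity check, the sketch below (assuming a task with default voxel configs and `segmentation=True`) slices the grid into these channel groups:

```python
import gymnasium as gym
import mani_skill.envs  # registers the ManiSkill environments

env = gym.make("PushCube-v1", obs_mode="voxel")
obs, _ = env.reset(seed=0)

grid = obs["voxel_grid"]   # [N, voxel_size, voxel_size, voxel_size, num_channels]
xyz = grid[..., :3]        # average point coordinates per voxel
rgb = grid[..., 3:6]       # average point colors per voxel
idx = grid[..., 6:9]       # voxel xyz indices
seg = grid[..., 9]         # majority segmentation ID (since segmentation=True)
occupancy = grid[..., 10]  # whether any point fell into this voxel
```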


The voxel grid can be visualized as below. This image shows the voxelized scene of PushCube-v1 with slightly tuned default hyperparameters. The voxel grid is reconstructed from the front camera only, following the default camera settings of the PushCube-v1 task, and hence it contains only the front-facing voxels rather than voxels throughout the scene.

```{image} images/voxel_pushcube.png
---
alt: Voxelized PushCube-v1 scene at the initial state
---
```

The RGBD image data used to reconstruct the voxel scene above is shown in the figure below. Here we use only the single base camera of the PushCube-v1 task.

```{image} images/voxel_cam_view_one.png
---
alt: Corresponding RGBD observations
---
```

#### Visualization demos

For a quick demo to visualize voxel grids, you can run

```bash
python -m mani_skill.examples.demo_vis_voxel -e "PushCube-v1" --voxel-size 200 --zoom-factor 2 --coord-bounds -1 -1 -1 2 2 2 --segmentation --device cuda
```

to override the default observation mode config kwargs, or simply

```bash
python -m mani_skill.examples.demo_vis_voxel -e "PushCube-v1"
```

which uses the demo script's default kwarg values to override the task's default obs_mode_config settings.

If you don't want any override to occur and only want to use the task's default obs_mode_config settings (defined in the **_default_voxel_config** property of each task), you can run

```bash
python -m mani_skill.examples.demo_vis_voxel -e "PushCube-v1" --use-default True
```

Note that this only works for some tabletop tasks like PickCube-v1 and PushCube-v1; for your own tasks you still need to tune these configs and pass in your customized configs where needed.
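
If you are adding voxel support to your own task, you can expose such defaults by defining a `_default_voxel_config` property, mirroring the tabletop tasks in this PR (`MyTaskEnv` is a hypothetical placeholder):

```python
import torch

from mani_skill.envs.sapien_env import BaseEnv

class MyTaskEnv(BaseEnv):
    @property
    def _default_voxel_config(self):
        # tune the bounds and resolution for your own scene
        return {"coord_bounds": [-1, -1, -1, 2, 2, 2],
                "voxel_size": 200,
                "device": torch.device("cuda" if torch.cuda.is_available() else "cpu"),
                "segmentation": True}
```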

Furthermore, if you use more sensors (currently only RGB and depth cameras) to capture the scene, collecting point cloud and RGB data from more poses, you can obtain a more accurate voxel reconstruction of the scene. The figure below shows a more completely reconstructed PushCube-v1 voxel scene using additional RGBD cameras.

```{image} images/voxel_pushcube_complete.png
---
alt: Densely voxelized PushCube-v1 scene at the initial state
---
```

It is reconstructed using five cameras placed above, to the left of, to the right of, in front of, and behind the tabletop scene, as shown in the visualized RGBD observations below. A sketch of such a camera setup follows the figure.

```{image} images/voxel_cam_view_all.png
---
alt: Corresponding RGBD observations
---
```
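
A sketch of such a five-camera setup for a task's sensor configs follows; the exact poses are illustrative assumptions, not the settings used to render the figures above:

```python
import numpy as np

from mani_skill.sensors.camera import CameraConfig
from mani_skill.utils import sapien_utils

# inside your task class definition:
@property
def _default_sensor_configs(self):
    # five RGBD cameras: above, left of, right of, in front of, and behind the table
    eyes = {
        "up_camera": [0.01, 0.0, 1.0],  # slight offset avoids a degenerate look-at
        "left_camera": [0.0, -0.8, 0.4],
        "right_camera": [0.0, 0.8, 0.4],
        "front_camera": [0.8, 0.0, 0.4],
        "back_camera": [-0.8, 0.0, 0.4],
    }
    return [
        CameraConfig(name, sapien_utils.look_at(eye=eye, target=[0.0, 0.0, 0.1]),
                     128, 128, np.pi / 2, 0.01, 100)
        for name, eye in eyes.items()
    ]
```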



## Segmentation Data

Objects are automatically assigned a segmentation ID upon loading (the `per_scene_id` attribute of `sapien.Entity` objects). To get information about which IDs refer to which Actors / Links, you can run the code below
@@ -114,4 +214,4 @@ Note that ID 0 refers to the distant background. For a quick demo of this, you can run
```bash
python -m mani_skill.examples.demo_vis_segmentation -e "PushCube-v1" # plot all segmentations
python -m mani_skill.examples.demo_vis_segmentation -e "PushCube-v1" --id cube # mask everything but the object with name "cube"
```
```
13 changes: 13 additions & 0 deletions docs/source/user_guide/demos/index.md
@@ -194,6 +194,19 @@ python -m mani_skill.examples.demo_vis_rgbd -e "StackCube-v1"
```{figure} images/rgbd_vis.png
```

## Visualize Voxel Data

You can run the following to visualize voxelized observation data. Under the default sensor settings, which use a single camera at the front of the scene, it produces the voxelized scene shown below.

```bash
python -m mani_skill.examples.demo_vis_voxel -e "PushCube-v1"
```


```{figure} images/voxel_pushcube.png
```


## Visualize Reset Distributions

How difficult a task is for ML algorithms like reinforcement learning and imitation learning can depend heavily on the task's reset distribution. To see what the reset distribution of any task (the result of repeated env.reset calls) looks like, you can run the following to save a video to the `videos` folder
26 changes: 25 additions & 1 deletion mani_skill/envs/sapien_env.py
@@ -23,6 +23,7 @@
from mani_skill.envs.utils.observations import (
parse_visual_obs_mode_to_struct,
sensor_data_to_pointcloud,
sensor_data_to_voxel,
)
from mani_skill.sensors.base_sensor import BaseSensor, BaseSensorConfig
from mani_skill.sensors.camera import (
@@ -78,6 +79,10 @@ class BaseEnv(gym.Env):

human_render_camera_configs (dict): configurations of human rendering cameras to override any environment defaults. Similar usage as @sensor_configs.

obs_mode_config (dict): configuration hyperparameters for converting sensor data to observations under the current observation mode (e.g. voxel size and scene bounds for the voxel observation mode). See notes for more details.

viewer_camera_configs (dict): configurations of the viewer camera in the GUI to override any environment defaults. Similar usage as @sensor_configs.

robot_uids (Union[str, BaseAgent, List[Union[str, BaseAgent]]]): List of robots to instantiate and control in the environment.
@@ -111,7 +116,8 @@ class BaseEnv(gym.Env):
SUPPORTED_ROBOTS: List[Union[str, Tuple[str]]] = None
"""Override this to enforce which robots or tuples of robots together are supported in the task. During env creation,
setting robot_uids auto loads all desired robots into the scene, but not all tasks are designed to support some robot setups"""
SUPPORTED_OBS_MODES = ("state", "state_dict", "none", "sensor_data", "rgb", "depth", "segmentation", "rgbd", "rgb+depth", "rgb+depth+segmentation", "rgb+segmentation", "depth+segmentation", "pointcloud")

SUPPORTED_OBS_MODES = ("state", "state_dict", "none", "sensor_data", "rgb", "depth", "segmentation", "rgbd", "rgb+depth", "rgb+depth+segmentation", "rgb+segmentation", "depth+segmentation", "pointcloud", "voxel")
SUPPORTED_REWARD_MODES = ("normalized_dense", "dense", "sparse", "none")
SUPPORTED_RENDER_MODES = ("human", "rgb_array", "sensors", "all")
"""The supported render modes. Human opens up a GUI viewer. rgb_array returns an rgb array showing the current environment state.
@@ -135,6 +141,8 @@ class BaseEnv(gym.Env):
"""all sensor configurations parsed from self._sensor_configs and agent._sensor_configs"""
_agent_sensor_configs: Dict[str, BaseSensorConfig]
"""all agent sensor configs parsed from agent._sensor_configs"""
_obs_mode_config: Dict
"""configurations for converting sensor data to observations under the current observation mode (e.g. voxel size and scene bounds for voxel observations)"""
_human_render_cameras: Dict[str, Camera]
"""cameras used for rendering the current environment retrievable via `env.render_rgb_array()`. These are not used to generate observations"""
_default_human_render_camera_configs: Dict[str, CameraConfig]
@@ -173,6 +181,7 @@ def __init__(
enable_shadow: bool = False,
sensor_configs: Optional[dict] = dict(),
human_render_camera_configs: Optional[dict] = dict(),
obs_mode_config: Optional[dict] = None,
viewer_camera_configs: Optional[dict] = dict(),
robot_uids: Union[str, BaseAgent, List[Union[str, BaseAgent]]] = None,
sim_config: Union[SimConfig, dict] = dict(),
@@ -195,6 +204,12 @@
self._custom_viewer_camera_configs = viewer_camera_configs
self._parallel_in_single_scene = parallel_in_single_scene
self.robot_uids = robot_uids

if obs_mode_config is not None:
    self._obs_mode_config = obs_mode_config
else:
    # fall back to the task's default voxel config, if the task defines one
    self._obs_mode_config = getattr(self, "_default_voxel_config", None)

if self.SUPPORTED_ROBOTS is not None:
if robot_uids not in self.SUPPORTED_ROBOTS:
logger.warn(f"{robot_uids} is not in the task's list of supported robots. Code may not run as intended")
@@ -486,6 +501,15 @@ def get_obs(self, info: Optional[Dict] = None):
obs = self._get_obs_with_sensor_data(info, apply_texture_transforms=False)
elif self._obs_mode in ["rgb", "depth", "segmentation", "rgbd", "rgb+depth", "rgb+depth+segmentation", "depth+segmentation", "rgb+segmentation"]:
obs = self._get_obs_with_sensor_data(info)
elif self._obs_mode == "voxel":
    # validate _obs_mode_config before passing it to the conversion function
    assert self._obs_mode_config is not None, "The voxel observation mode requires a config passed via the obs_mode_config keyword argument in gym.make(). See the ManiSkill docs for details."
    assert "voxel_size" in self._obs_mode_config, "Missing voxel_size (voxel grid resolution) in observation configs"
    assert "coord_bounds" in self._obs_mode_config, "Missing coord_bounds (coordinate bounds) in observation configs"
    assert "device" in self._obs_mode_config, "Missing device (device for voxelization) in observation configs"
    assert "segmentation" in self._obs_mode_config, "Missing segmentation (a boolean indicating whether to include voxel segmentations) in observation configs"
    obs = self._get_obs_with_sensor_data(info)
    obs = sensor_data_to_voxel(obs, self._sensors, self._obs_mode_config)
else:
raise NotImplementedError(self._obs_mode)
return obs
2 changes: 1 addition & 1 deletion mani_skill/envs/tasks/control/cartpole.py
@@ -105,7 +105,7 @@ def _default_sensor_configs(self):
def _default_human_render_camera_configs(self):
pose = sapien_utils.look_at(eye=[0, -4, 1], target=[0, 0, 1])
return CameraConfig("render_camera", pose, 512, 512, 1, 0.01, 100)

def _load_scene(self, options: dict):
loader = self.scene.create_mjcf_loader()
articulation_builders, actor_builders, sensor_configs = loader.parse(MJCF_FILE)
2 changes: 1 addition & 1 deletion mani_skill/envs/tasks/fmb/fmb.py
@@ -62,7 +62,7 @@ def _default_sensor_configs(self):
def _default_human_render_camera_configs(self):
pose = sapien_utils.look_at([1.0, 0.8, 0.8], [0.0, 0.0, 0.35])
return CameraConfig("render_camera", pose, 1024, 1024, 1, 0.01, 100)

def _load_scene(self, options: dict):
self.table_scene = TableSceneBuilder(
env=self, robot_init_qpos_noise=self.robot_init_qpos_noise
2 changes: 1 addition & 1 deletion mani_skill/envs/tasks/humanoid/humanoid_stand.py
@@ -34,7 +34,7 @@ def _default_sensor_configs(self):
def _default_human_render_camera_configs(self):
pose = sapien_utils.look_at([1.0, 1.0, 2.5], [0.0, 0.0, 0.75])
return CameraConfig("render_camera", pose, 512, 512, 1, 0.01, 100)

def _load_scene(self, options: dict):
build_ground(self.scene)

@@ -87,7 +87,7 @@ def _default_human_render_camera_configs(self):
return CameraConfig(
"render_camera", pose=pose, width=512, height=512, fov=1, near=0.01, far=100
)

def _load_scene(self, options: dict):
self.ground = build_ground(self.scene)
# temporarily turn off the logging as there will be big red warnings
7 changes: 7 additions & 0 deletions mani_skill/envs/tasks/tabletop/assembling_kits.py
@@ -80,6 +80,13 @@ def _default_human_render_camera_configs(self):
pose = sapien_utils.look_at([0.3, 0.3, 0.8], [0.0, 0.0, 0.1])
return CameraConfig("render_camera", pose, 512, 512, 1, 0.01, 100)

@property
def _default_voxel_config(self):
return {"coord_bounds": [-1, -1, -1, 2, 2, 2],
"voxel_size": 200,
"device": torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
"segmentation": True}

def _load_scene(self, options: dict):
with torch.device(self.device):
self.table_scene = TableSceneBuilder(self)
7 changes: 7 additions & 0 deletions mani_skill/envs/tasks/tabletop/lift_peg_upright.py
@@ -39,6 +39,13 @@ def _default_human_render_camera_configs(self):
pose = look_at([0.6, 0.7, 0.6], [0.0, 0.0, 0.35])
return CameraConfig("render_camera", pose, 512, 512, 1, 0.01, 100)

@property
def _default_voxel_config(self):
return {"coord_bounds": [-1, -1, -1, 2, 2, 2],
"voxel_size": 200,
"device": torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
"segmentation": True}

def _load_scene(self, options: dict):
self.table_scene = TableSceneBuilder(
env=self, robot_init_qpos_noise=self.robot_init_qpos_noise
7 changes: 7 additions & 0 deletions mani_skill/envs/tasks/tabletop/peg_insertion_side.py
@@ -89,6 +89,13 @@ def _default_human_render_camera_configs(self):
pose = sapien_utils.look_at([0.5, -0.5, 0.8], [0.05, -0.1, 0.4])
return CameraConfig("render_camera", pose, 512, 512, 1, 0.01, 100)

@property
def _default_voxel_config(self):
return {"coord_bounds": [-1, -1, -1, 2, 2, 2],
"voxel_size": 200,
"device": torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
"segmentation": True}

def _load_scene(self, options: dict):
with torch.device(self.device):
self.table_scene = TableSceneBuilder(self)
7 changes: 7 additions & 0 deletions mani_skill/envs/tasks/tabletop/pick_clutter_ycb.py
@@ -73,6 +73,13 @@ def _default_sim_config(self):
)
)

@property
def _default_voxel_config(self):
return {"coord_bounds": [-1, -1, -1, 2, 2, 2],
"voxel_size": 200,
"device": torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
"segmentation": True}

@property
def _default_sensor_configs(self):
pose = sapien_utils.look_at(eye=[0.3, 0, 0.6], target=[-0.1, 0, 0.1])
7 changes: 7 additions & 0 deletions mani_skill/envs/tasks/tabletop/pick_cube.py
@@ -36,6 +36,13 @@ def _default_human_render_camera_configs(self):
pose = sapien_utils.look_at([0.6, 0.7, 0.6], [0.0, 0.0, 0.35])
return CameraConfig("render_camera", pose, 512, 512, 1, 0.01, 100)

@property
def _default_voxel_config(self):
return {"coord_bounds": [-1, -1, -1, 2, 2, 2],
"voxel_size": 200,
"device": torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
"segmentation": True}

def _load_scene(self, options: dict):
self.table_scene = TableSceneBuilder(
self, robot_init_qpos_noise=self.robot_init_qpos_noise
2 changes: 1 addition & 1 deletion mani_skill/envs/tasks/tabletop/pick_single_ycb.py
@@ -69,7 +69,7 @@ def _default_sensor_configs(self):
def _default_human_render_camera_configs(self):
pose = sapien_utils.look_at([0.6, 0.7, 0.6], [0.0, 0.0, 0.35])
return CameraConfig("render_camera", pose, 512, 512, 1, 0.01, 100)

def _load_scene(self, options: dict):
global WARNED_ONCE
self.table_scene = TableSceneBuilder(
24 changes: 11 additions & 13 deletions mani_skill/envs/tasks/tabletop/place_sphere.py
@@ -70,26 +70,24 @@ def _default_sim_config(self):

@property
def _default_sensor_configs(self):
pose = sapien_utils.look_at(eye=[0.3, 0, 0.2], target=[-0.1, 0, 0])
return [
CameraConfig(
"base_camera",
pose=pose,
width=128,
height=128,
fov=np.pi / 2,
near=0.01,
far=100,
)
]
pose = sapien_utils.look_at(eye=[0.3, 0, 0.6], target=[-0.1, 0, 0.1])
return [CameraConfig("base_camera", pose, 128, 128, np.pi / 2, 0.01, 100)]


@property
def _default_human_render_camera_configs(self):
pose = sapien_utils.look_at([0.6, -0.2, 0.2], [0.0, 0.0, 0.2])
return CameraConfig(
"render_camera", pose=pose, width=512, height=512, fov=1, near=0.01, far=100
)


@property
def _default_voxel_config(self):
return {"coord_bounds": [-1, -1, -1, 2, 2, 2],
"voxel_size": 200,
"device": torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
"segmentation": True}

def _build_bin(self, radius):
builder = self.scene.create_actor_builder()

9 changes: 8 additions & 1 deletion mani_skill/envs/tasks/tabletop/plug_charger.py
@@ -44,7 +44,14 @@ def _default_sensor_configs(self):
return [
CameraConfig("base_camera", pose=pose, width=128, height=128, fov=np.pi / 2)
]


@property
def _default_voxel_config(self):
return {"coord_bounds": [-1, -1, -1, 2, 2, 2],
"voxel_size": 200,
"device": torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
"segmentation": True}

@property
def _default_human_render_camera_configs(self):
pose = sapien_utils.look_at([0.3, 0.4, 0.1], [0, 0, 0])