Extend IG with reward modes, make terms uniform across IG and BPD #57

Merged: 8 commits, Jun 18, 2024
Changes from 5 commits
26 changes: 15 additions & 11 deletions momaland/envs/beach/beach.py
@@ -5,6 +5,7 @@

import functools
import random
import warnings
from typing_extensions import override

import numpy as np
@@ -69,11 +70,11 @@ class MOBeachDomain(MOParallelEnv, EzPickle):
The action space is a Discrete space [0, 1, 2], corresponding to moving left, moving right, staying in place.

## Reward Space
The reward space is a 2D vector containing rewards for two different schemes ('local' or 'global') for:
The reward space is a 2D vector containing rewards for two different modes ('individual' or 'team') for:
- the occupation level
- the mixture level
If the scheme is 'local', the reward is given for the currently occupied section.
If the scheme is 'global', the reward is summed over all sections.
If the mode is 'individual', the reward is given for the currently occupied section.
If the mode is 'team', the reward is summed over all sections.

## Starting State
The initial position is a uniform random distribution of agents over the sections. This can be changed via the
@@ -90,7 +91,7 @@ class MOBeachDomain(MOParallelEnv, EzPickle):
## Arguments
- 'num_timesteps (int)': number of timesteps in the domain. Default: 1
- 'num_agents (int)': number of agents in the domain. Default: 100
- 'reward_scheme (str)': the reward scheme to use ('local', or 'global'). Default: local
- 'reward_mode (str)': the reward mode to use ('individual', or 'team'). Default: individual
- 'sections (int)': number of beach sections in the domain. Default: 6
- 'capacity (int)': capacity of each beach section. Default: 7
- 'type_distribution (tuple)': the distribution of agent types in the domain. Default: 2 types equally distributed (0.3, 0.7).
@@ -104,7 +105,7 @@ def __init__(
self,
num_timesteps=1,
num_agents=100,
reward_scheme="local",
reward_mode="individual",
sections=6,
capacity=7,
type_distribution=(0.3, 0.7),
@@ -117,26 +118,29 @@ def __init__(
sections: number of beach sections in the domain
capacity: capacity of each beach section
num_agents: number of agents in the domain
reward_mode: the reward mode to use ('individual', or 'team'). Default: individual
type_distribution: the distribution of agent types in the domain. Default: 2 types equally distributed.
position_distribution: the initial distribution of agents in the domain. Default: uniform over all sections.
num_timesteps: number of timesteps in the domain
render_mode: render mode
reward_scheme: the reward scheme to use ('local', or 'global'). Default: local
"""
EzPickle.__init__(
self,
num_timesteps,
num_agents,
reward_scheme,
reward_mode,
sections,
capacity,
type_distribution,
position_distribution,
render_mode,
)
self.reward_scheme = reward_scheme
if reward_mode not in ["individual", "team"]:
self.reward_mode = "individual"
Collaborator: Why don't we throw an exception in that case?

Collaborator (Author): Changed in the meantime to throw a warning and fall back on the individual setting.

warnings.warn("Invalid reward_mode. Must be either 'individual' or 'team'. Defaulting to 'individual'.")
else:
self.reward_mode = reward_mode
self.sections = sections
# TODO Extend to distinct capacities per section?
self.resource_capacities = [capacity for _ in range(sections)]
self.num_timesteps = num_timesteps
self.episode_num = 0
@@ -296,13 +300,13 @@ def step(self, actions):
reward_per_section = np.zeros((self.sections, NUM_OBJECTIVES), dtype=np.float32)

if env_termination:
if self.reward_scheme == "local":
if self.reward_mode == "individual":
for i in range(self.sections):
lr_capacity = _local_capacity_reward(self.resource_capacities[i], section_consumptions[i])
lr_mixture = _local_mixture_reward(section_agent_types[i])
reward_per_section[i] = np.array([lr_capacity, lr_mixture])

elif self.reward_scheme == "global":
elif self.reward_mode == "team":
g_capacity = _global_capacity_reward(self.resource_capacities, section_consumptions)
g_mixture = _global_mixture_reward(section_agent_types)
reward_per_section = np.array([[g_capacity, g_mixture]] * self.sections)
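
For reference, a brief usage sketch (not part of this PR) of the renamed argument. It assumes the constructor signature shown in this diff and the usual PettingZoo-style parallel API (reset/step/action_space), which is an assumption here, not something stated in the PR.

# Sketch only: exercising MOBeachDomain's reward_mode argument with the
# constructor defaults shown above (num_timesteps=1, num_agents=100, ...).
from momaland.envs.beach.beach import MOBeachDomain

for mode in ("individual", "team"):
    env = MOBeachDomain(reward_mode=mode)
    observations, infos = env.reset(seed=42)
    actions = {agent: env.action_space(agent).sample() for agent in env.agents}
    _, rewards, terminations, truncations, _ = env.step(actions)
    # 'individual': each agent is rewarded for its currently occupied section;
    # 'team': every agent receives the rewards summed over all sections.
    print(mode, rewards)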
19 changes: 16 additions & 3 deletions momaland/envs/item_gathering/item_gathering.py
@@ -6,18 +6,19 @@

Notes:
- In contrast to the original environment, the observation space is a 2D array of integers, i.e.,
the map of the environment, where each integer represents either agents (1 for the agent receiving the observation,
2 for the other agents) or items (3, 4, etc., depending on the number of items).
the map of the environment, with 0 for empty cells, negative integers for agents, positive integers for items.
- The number of agents and items is configurable, by providing an initial map.
- If no initial map is provided, the environment uses a default map

Central observation:
- If the central_observation flag is set to True, then the environment implements:
- If the central_observation flag is set to True, then the environment additionally implements:
- a central observation space: self.central_observation_space
- a central observation function: self.state()
The central_observation flag and the associated methods described above are used by the CentralisedAgent wrapper
"""

import random
import warnings
from copy import deepcopy
from os import path
from typing_extensions import override
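
As a small illustration (values made up, not taken from this PR) of the observation encoding described in the module notes above:

# Illustrative map following the encoding in the notes above:
# 0 = empty cell, negative integers = agents, positive integers = items.
import numpy as np

example_map = np.array([
    [ 0, -1,  0,  0],   # -1: an agent (agents appear as negative integers)
    [ 0,  0,  0,  3],   #  3: an item (items appear as positive integers)
    [ 0,  4,  0,  0],   #  4: an item of a different type
    [-2,  0,  0,  0],   # -2: another agent
])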
@@ -102,6 +103,7 @@ class MOItemGathering(MOParallelEnv, EzPickle):
- 'num_timesteps': number of timesteps to run the environment for. Default: 10
- 'initial_map': map of the environment. Default: 8x8 grid, 2 agents, 3 objectives (Källström and Heintz, 2019)
- 'randomise': whether to randomise the map, at each episode. Default: False
- 'reward_mode': reward mode for the environment ('individual' or 'team'). Default: 'individual'
- 'render_mode': render mode for the environment. Default: None
"""

@@ -118,6 +120,7 @@ def __init__(
num_timesteps=10,
initial_map=DEFAULT_MAP,
randomise=False,
reward_mode="individual",
render_mode=None,
):
"""Initializes the item gathering domain.
@@ -126,19 +129,26 @@
num_timesteps: number of timesteps to run the environment for
initial_map: map of the environment
randomise: whether to randomise the map, at each episode
reward_mode: reward mode for the environment, 'individual' or 'team'. Default: 'individual'
render_mode: render mode for the environment
"""
EzPickle.__init__(
self,
num_timesteps,
initial_map,
randomise,
reward_mode,
render_mode,
)
self.num_timesteps = num_timesteps
self.current_timestep = 0
self.render_mode = render_mode
self.randomise = randomise
if reward_mode not in ["individual", "team"]:
self.reward_mode = "individual"
warnings.warn("reward_mode must be either 'individual' or 'team', defaulting to 'individual'.")
else:
self.reward_mode = reward_mode

# check if the initial map has any entries equal to 1
assert len(np.argwhere(initial_map == 1).flatten()) > 0, "The initial map does not contain any agents (1s)."
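
A hypothetical sanity check (not part of this PR) for the fallback behaviour added above: an unrecognised reward_mode should emit a warning and default to 'individual'.

# Hypothetical check of the reward_mode fallback shown in this hunk.
import warnings

from momaland.envs.item_gathering.item_gathering import MOItemGathering

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    env = MOItemGathering(reward_mode="not-a-mode")  # deliberately invalid

assert env.reward_mode == "individual"               # fell back to the default
assert any("individual" in str(w.message) for w in caught)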
@@ -391,6 +401,9 @@ def step(self, actions):
if value_in_cell > 0:
rewards[self.agents[i]][self.item_dict[value_in_cell]] += 1
self.env_map[self.agent_positions[i][0], self.agent_positions[i][1]] = 0
# if reward mode is 'team', sum the rewards over all agents
if self.reward_mode == "team":
rewards = {agent: np.sum(list(rewards.values()), axis=0) for agent in self.agents}

map_obs = self.state()
observations = {agent: (-(i + 1), map_obs) for i, agent in enumerate(self.agents)}
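
An illustration with made-up reward vectors of the 'team' aggregation added in this hunk: each agent ends up with the element-wise sum of all agents' rewards.

# Made-up per-agent reward vectors (one entry per objective/item type).
import numpy as np

rewards = {
    "agent_0": np.array([1.0, 0.0, 0.0]),
    "agent_1": np.array([0.0, 2.0, 0.0]),
}
agents = list(rewards)
# Same dict comprehension as in the diff above.
team_rewards = {agent: np.sum(list(rewards.values()), axis=0) for agent in agents}
# Every agent now receives array([1., 2., 0.]).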
5 changes: 3 additions & 2 deletions momaland/learning/morl/random_centralised_agent_example.py
@@ -44,6 +44,7 @@ def train_random(moma_env):
num_timesteps=50,
initial_map=test_map,
randomise=True,
reward_mode="test",
rradules marked this conversation as resolved.
render_mode=None,
)

@@ -57,8 +58,8 @@ def train_random(moma_env):
reward_scheme="local",
rradules marked this conversation as resolved.
)

# train_random(ig_env)
train_random(ig_env)
# train_random(mobpd_env)

# train_sa_random(ig_env)
train_sa_random(mobpd_env)
# train_sa_random(mobpd_env)
58 changes: 0 additions & 58 deletions momaland/learning/morl/train_bpd_GPILS.py

This file was deleted.

47 changes: 0 additions & 47 deletions momaland/learning/morl/train_bpd_PCN.py

This file was deleted.

2 changes: 1 addition & 1 deletion momaland/learning/morl/train_ig_GPILS.py
@@ -43,7 +43,7 @@
gradient_updates=10,
target_net_update_freq=200,
tau=1,
log=True,
log=False, # set this to True to turn on wandb logging
rradules marked this conversation as resolved.
project_name=project_name,
seed=seed,
)
2 changes: 1 addition & 1 deletion momaland/learning/morl/train_ig_PCN.py
@@ -40,7 +40,7 @@
batch_size=256,
project_name=project_name,
experiment_name="PCN",
log=True,
log=False, # set this to True to turn on wandb logging
rradules marked this conversation as resolved.
)
timesteps_per_iter = 10000
agent.train(