From 13fbabb32e4dab633b63c44cc544248d347115de Mon Sep 17 00:00:00 2001
From: Nadav Elyahu
Date: Thu, 8 Feb 2024 10:40:29 +0200
Subject: [PATCH 1/3] estimate_zero2_model_states_mem_needs: fix memory
 estimation

The estimate assumed 4 bytes per model parameter and 4 bytes per gradient.
Fixed both to 2 bytes, under the assumption of FP16/BF16 training.
---
 deepspeed/runtime/zero/stage_1_and_2.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py
index 56607b349ae7..5b3beac4adac 100755
--- a/deepspeed/runtime/zero/stage_1_and_2.py
+++ b/deepspeed/runtime/zero/stage_1_and_2.py
@@ -2425,7 +2425,9 @@ def estimate_zero2_model_states_mem_needs(total_params,
         gpu_mem = 2 * total_params
         cpu_mem = total_params * max(4 * total_gpus, 16) * additional_buffer_factor
     else:
-        gpu_mem = 4 * total_params + int(16 * total_params / total_gpus)
+        # GPU's total_params multipliers: 2 = params_16bit,
+        # 14 = 2_grads_16bit + 4_params_32bit + 8_optimizer_states_32bit(momentum and variance)
+        gpu_mem = 2 * total_params + int(14 * total_params / total_gpus)
         cpu_mem = total_params * 4 * num_gpus_per_node * additional_buffer_factor
 
     return int(cpu_mem), int(gpu_mem)

From 3b7a05338db748736becf9bbf16c692404fee85a Mon Sep 17 00:00:00 2001
From: Nadav Elyahu <88962733+nelyahu@users.noreply.github.com>
Date: Mon, 22 Apr 2024 10:08:04 +0300
Subject: [PATCH 2/3] Update deepspeed/runtime/zero/stage_1_and_2.py

Apply the code review suggestion for a closer estimate of memory consumption.

Co-authored-by: Olatunji Ruwase
---
 deepspeed/runtime/zero/stage_1_and_2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py
index 587be9889b52..fbe780e99ada 100755
--- a/deepspeed/runtime/zero/stage_1_and_2.py
+++ b/deepspeed/runtime/zero/stage_1_and_2.py
@@ -2429,7 +2429,7 @@ def estimate_zero2_model_states_mem_needs(total_params,
     else:
         # GPU's total_params multipliers: 2 = params_16bit,
         # 14 = 2_grads_16bit + 4_params_32bit + 8_optimizer_states_32bit(momentum and variance)
-        gpu_mem = 2 * total_params + int(14 * total_params / total_gpus)
+        gpu_mem = 2 * total_params + int(18 * total_params / total_gpus)
         cpu_mem = total_params * 4 * num_gpus_per_node * additional_buffer_factor
 
     return int(cpu_mem), int(gpu_mem)

From 9b1b7e0bb695c6fb14027e4b257bf2964d52e96b Mon Sep 17 00:00:00 2001
From: Nadav Elyahu <88962733+nelyahu@users.noreply.github.com>
Date: Mon, 22 Apr 2024 10:09:14 +0300
Subject: [PATCH 3/3] Update deepspeed/runtime/zero/stage_1_and_2.py

Update the comment to match the new multiplier.

Co-authored-by: Olatunji Ruwase
---
 deepspeed/runtime/zero/stage_1_and_2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py
index fbe780e99ada..123f5724223b 100755
--- a/deepspeed/runtime/zero/stage_1_and_2.py
+++ b/deepspeed/runtime/zero/stage_1_and_2.py
@@ -2428,7 +2428,7 @@ def estimate_zero2_model_states_mem_needs(total_params,
         cpu_mem = total_params * max(4 * total_gpus, 16) * additional_buffer_factor
     else:
         # GPU's total_params multipliers: 2 = params_16bit,
-        # 14 = 2_grads_16bit + 4_params_32bit + 8_optimizer_states_32bit(momentum and variance)
+        # 18 = 2_grads_16bit + 4_grads_32bit + 4_params_32bit + 8_optimizer_states_32bit(momentum and variance)
         gpu_mem = 2 * total_params + int(18 * total_params / total_gpus)
         cpu_mem = total_params * 4 * num_gpus_per_node * additional_buffer_factor
 
     return int(cpu_mem), int(gpu_mem)
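
For reference, the per-GPU arithmetic this series converges on can be checked in isolation. The sketch below mirrors the final formula from the patched code; the function name sketch_zero2_mem_needs and the 1.3B-parameter / 8-GPU example are illustrative placeholders, not part of the DeepSpeed API (the real estimator is estimate_zero2_model_states_mem_needs in deepspeed/runtime/zero/stage_1_and_2.py).

# Minimal standalone sketch of the ZeRO-2 memory estimate after PATCH 3/3.
# Names and the example model size are assumptions for illustration only.

def sketch_zero2_mem_needs(total_params,
                           num_gpus_per_node=1,
                           num_nodes=1,
                           cpu_offload=True,
                           additional_buffer_factor=1.5):
    total_gpus = num_nodes * num_gpus_per_node
    if cpu_offload:
        # Only the 16-bit parameters stay on the GPU (2 bytes each);
        # optimizer state is held in CPU memory.
        gpu_mem = 2 * total_params
        cpu_mem = total_params * max(4 * total_gpus, 16) * additional_buffer_factor
    else:
        # Per GPU: 2 bytes of replicated 16-bit params, plus a 1/total_gpus
        # shard of 18 bytes = 2 (16-bit grads) + 4 (32-bit grads)
        # + 4 (32-bit params) + 8 (32-bit optimizer momentum and variance).
        gpu_mem = 2 * total_params + int(18 * total_params / total_gpus)
        cpu_mem = total_params * 4 * num_gpus_per_node * additional_buffer_factor
    return int(cpu_mem), int(gpu_mem)


if __name__ == "__main__":
    # Example: a 1.3B-parameter model on one node with 8 GPUs, no CPU offload.
    cpu_mem, gpu_mem = sketch_zero2_mem_needs(1_300_000_000,
                                              num_gpus_per_node=8,
                                              num_nodes=1,
                                              cpu_offload=False)
    print(f"per-GPU: {gpu_mem / 2**30:.1f} GiB, per-node CPU: {cpu_mem / 2**30:.1f} GiB")

With CPU offload disabled, each GPU holds the full set of 16-bit parameters (the 2 * total_params term) plus its 1/total_gpus shard of the 18 bytes per parameter of gradient and optimizer state listed in the patched comment, which is where 2 * total_params + 18 * total_params / total_gpus comes from.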