Clean all param coordinators #6661

Closed · wants to merge 7 commits
8 changes: 4 additions & 4 deletions deepspeed/runtime/zero/parameter_offload.py
@@ -225,8 +225,8 @@ def setup_zero_stage3_hooks(self):
         @instrument_w_nvtx
         def _end_of_forward_hook(module, *args):

-            if not torch._C.is_grad_enabled():
-                self.get_param_coordinator(training=False).reset_step()
+            self.get_param_coordinator(training=False).reset_step()
+            self.get_param_coordinator(training=True).reset_step()

         #likely one of them should be enough but just to be safe
         self._register_hooks_recursively(self.module)
@@ -463,7 +463,7 @@ def post_sub_module_forward_function(self, sub_module):

     @torch.no_grad()
     def pre_sub_module_backward_function(self, sub_module):
-        assert sub_module.training, "backward pass is invalid for module in evaluation mode"
+        # assert sub_module.training, "backward pass is invalid for module in evaluation mode"
         param_coordinator = self.get_param_coordinator(training=True)
         param_coordinator.trace_prologue(sub_module)
         if param_coordinator.is_record_trace():
@@ -472,7 +472,7 @@ def pre_sub_module_backward_function(self, sub_module):

     @torch.no_grad()
     def post_sub_module_backward_function(self, sub_module):
-        assert sub_module.training, "backward pass is invalid for module in evaluation mode"
+        # assert sub_module.training, "backward pass is invalid for module in evaluation mode"
         see_memory_usage(
             f"After sub module backward function {sub_module.__class__.__name__} {sub_module.id} before release",
             force=False)
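For readers skimming the diff: the behavioral change in _end_of_forward_hook is that the coordinator reset no longer depends on torch._C.is_grad_enabled(). Both the inference (training=False) and training (training=True) coordinators are now reset at the end of every forward pass. Below is a minimal, self-contained sketch of that control-flow change; Coordinator and the module-level get_param_coordinator are illustrative stand-ins, not DeepSpeed's actual PartitionedParameterCoordinator.

# Toy sketch of the hook change; names below are hypothetical stand-ins.
class Coordinator:
    def __init__(self, name):
        self.name = name

    def reset_step(self):
        # Clears per-step state so the next forward pass starts fresh.
        print(f"reset_step() on {self.name} coordinator")


_coordinators = {False: Coordinator("inference"), True: Coordinator("training")}


def get_param_coordinator(training):
    return _coordinators[training]


def end_of_forward_hook_old(grad_enabled):
    # Pre-PR behavior: reset only the inference coordinator, and only
    # when autograd is disabled (e.g. under torch.no_grad()).
    if not grad_enabled:
        get_param_coordinator(training=False).reset_step()


def end_of_forward_hook_new():
    # Post-PR behavior: reset both coordinators unconditionally.
    get_param_coordinator(training=False).reset_step()
    get_param_coordinator(training=True).reset_step()


end_of_forward_hook_old(grad_enabled=True)   # prints nothing
end_of_forward_hook_new()                    # resets both coordinators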
16 changes: 16 additions & 0 deletions deepspeed/utils/tensor_fragment.py
@@ -266,6 +266,22 @@ def safe_set_local_grad(param, value):
     param._z3_optimizer.set_local_grad_for_param(value, param)


+def safe_set_local_grad(param, value):
+    """Update the gradient of a partitioned parameter.
+    Args:
+        param (``torch.nn.Parameter``): A model parameter
+        value (``torch.Tensor``): New value
+    """
+    if param.grad is not None:
+        return param.grad.copy_(value)
+
+    # ZeRO stage 3 param
+    if hasattr(param, 'ds_id'):
+        return param._z3_optimizer.set_local_grad_for_param(value, param)
+
+    return None
+
+
 def safe_get_local_fp32_param(param):
     """Get the local partition of a ZeRO-3 partitioned parameter in fp32 precision.
     Args:
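The helper added above dispatches in three steps: if the parameter already has a .grad it copies the new value into it, otherwise it falls back to the ZeRO-3 path when the parameter carries a ds_id attribute, and otherwise it is a no-op returning None. The following standalone sketch exercises only the first and last branches with a plain (non-partitioned) torch parameter, assuming nothing beyond torch itself; the function body is copied from the diff so it runs without DeepSpeed installed.

import torch

# Copy of the added safe_set_local_grad, reproduced here only to demonstrate
# its dispatch logic outside DeepSpeed.
def safe_set_local_grad(param, value):
    if param.grad is not None:
        return param.grad.copy_(value)

    # ZeRO stage 3 param (not exercised in this standalone example)
    if hasattr(param, 'ds_id'):
        return param._z3_optimizer.set_local_grad_for_param(value, param)

    return None


p = torch.nn.Parameter(torch.zeros(4))
print(safe_set_local_grad(p, torch.ones(4)))  # None: no .grad yet and no ds_id

p.grad = torch.zeros(4)
safe_set_local_grad(p, torch.ones(4))
print(p.grad)  # tensor([1., 1., 1., 1.]) -- copied in place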