Skip to content

Commit

Permalink
chore: Update error returned by cloudprovider.Create() (#7385)
Browse files: browse the repository at this point in the history
  • Loading branch information
jigisha620 authored Nov 15, 2024
1 parent 992689c commit ea2254d
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 14 deletions.
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ require (
k8s.io/klog/v2 v2.130.1
k8s.io/utils v0.0.0-20240711033017-18e509b52bc8
sigs.k8s.io/controller-runtime v0.19.1
-sigs.k8s.io/karpenter v1.0.1-0.20241112233246-3e0c51ac84f2
+sigs.k8s.io/karpenter v1.0.1-0.20241115002651-7786f76f87fe
sigs.k8s.io/yaml v1.4.0
)

Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -323,8 +323,8 @@ sigs.k8s.io/controller-runtime v0.19.1 h1:Son+Q40+Be3QWb+niBXAg2vFiYWolDjjRfO8hn
sigs.k8s.io/controller-runtime v0.19.1/go.mod h1:iRmWllt8IlaLjvTTDLhRBXIEtkCK6hwVBJJsYS9Ajf4=
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo=
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0=
-sigs.k8s.io/karpenter v1.0.1-0.20241112233246-3e0c51ac84f2 h1:wnXbS7okpGz7RHFrnjJS4r7BfGwCCcOaGO8naB1+thw=
-sigs.k8s.io/karpenter v1.0.1-0.20241112233246-3e0c51ac84f2/go.mod h1:RDaWii2JY4Qvnc99/UBjPzYfk/yfGQV4ihpk34BX2EQ=
+sigs.k8s.io/karpenter v1.0.1-0.20241115002651-7786f76f87fe h1:OEIvm8hg0wQXtAC5pxuWnlbSgdcDGO+Mes8H7W7Cv4s=
+sigs.k8s.io/karpenter v1.0.1-0.20241115002651-7786f76f87fe/go.mod h1:RDaWii2JY4Qvnc99/UBjPzYfk/yfGQV4ihpk34BX2EQ=
sigs.k8s.io/structured-merge-diff/v4 v4.4.1 h1:150L+0vs/8DA78h1u02ooW1/fFq/Lwr+sGiqlzvrtq4=
sigs.k8s.io/structured-merge-diff/v4 v4.4.1/go.mod h1:N8hJocpFajUSSeSJ9bOZ77VzejKZaXsTtZo4/u7Io08=
sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E=
Expand Down
11 changes: 8 additions & 3 deletions pkg/cloudprovider/cloudprovider.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,18 +94,23 @@ func (c *CloudProvider) Create(ctx context.Context, nodeClaim *karpv1.NodeClaim)
return nil, cloudprovider.NewNodeClassNotReadyError(stderrors.New(nodeClassReady.Message))
}
if nodeClassReady.IsUnknown() {
-return nil, fmt.Errorf("resolving NodeClass readiness, NodeClass is in Ready=Unknown, %s", nodeClassReady.Message)
+return nil, cloudprovider.NewCreateError(fmt.Errorf("resolving NodeClass readiness, NodeClass is in Ready=Unknown, %s", nodeClassReady.Message), "NodeClass is in Ready=Unknown")
}
instanceTypes, err := c.resolveInstanceTypes(ctx, nodeClaim, nodeClass)
if err != nil {
-return nil, fmt.Errorf("resolving instance types, %w", err)
+return nil, cloudprovider.NewCreateError(fmt.Errorf("resolving instance types, %w", err), "Error resolving instance types")
}
if len(instanceTypes) == 0 {
return nil, cloudprovider.NewInsufficientCapacityError(fmt.Errorf("all requested instance types were unavailable during launch"))
}
instance, err := c.instanceProvider.Create(ctx, nodeClass, nodeClaim, getTags(ctx, nodeClass, nodeClaim), instanceTypes)
if err != nil {
-return nil, fmt.Errorf("creating instance, %w", err)
+conditionMessage := "Error creating instance"
+var createError *cloudprovider.CreateError
+if stderrors.As(err, &createError) {
+conditionMessage = createError.ConditionMessage
+}
+return nil, cloudprovider.NewCreateError(fmt.Errorf("creating instance, %w", err), conditionMessage)
}
instanceType, _ := lo.Find(instanceTypes, func(i *cloudprovider.InstanceType) bool {
return i.Name == string(instance.Type)
Expand Down
15 changes: 7 additions & 8 deletions pkg/providers/instance/instance.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,7 @@ func (p *DefaultProvider) Create(ctx context.Context, nodeClass *v1.EC2NodeClass
}
instanceTypes, err := cloudprovider.InstanceTypes(instanceTypes).Truncate(schedulingRequirements, maxInstanceTypes)
if err != nil {
-log.FromContext(ctx).Error(err, "truncating instance types")
-return nil, fmt.Errorf("truncating instance types, %w", err)
+return nil, cloudprovider.NewCreateError(fmt.Errorf("truncating instance types, %w", err), "Error truncating instance types based on the passed-in requirements")
}
fleetInstance, err := p.launchInstance(ctx, nodeClass, nodeClaim, instanceTypes, tags)
if awserrors.IsLaunchTemplateNotFound(err) {
Expand All @@ -115,7 +114,6 @@ func (p *DefaultProvider) Create(ctx context.Context, nodeClass *v1.EC2NodeClass
fleetInstance, err = p.launchInstance(ctx, nodeClass, nodeClaim, instanceTypes, tags)
}
if err != nil {
-log.FromContext(ctx).Error(err, "launching instance")
return nil, err
}
efaEnabled := lo.Contains(lo.Keys(nodeClaim.Spec.Resources.Requests), v1.ResourceEFA)
Expand Down Expand Up @@ -213,13 +211,13 @@ func (p *DefaultProvider) launchInstance(ctx context.Context, nodeClass *v1.EC2N
capacityType := p.getCapacityType(nodeClaim, instanceTypes)
zonalSubnets, err := p.subnetProvider.ZonalSubnetsForLaunch(ctx, nodeClass, instanceTypes, capacityType)
if err != nil {
-return ec2types.CreateFleetInstance{}, fmt.Errorf("getting subnets, %w", err)
+return ec2types.CreateFleetInstance{}, cloudprovider.NewCreateError(fmt.Errorf("getting subnets, %w", err), "Error getting subnets")
}

// Get Launch Template Configs, which may differ due to GPU or Architecture requirements
launchTemplateConfigs, err := p.getLaunchTemplateConfigs(ctx, nodeClass, nodeClaim, instanceTypes, zonalSubnets, capacityType, tags)
if err != nil {
-return ec2types.CreateFleetInstance{}, fmt.Errorf("getting launch template configs, %w", err)
+return ec2types.CreateFleetInstance{}, cloudprovider.NewCreateError(fmt.Errorf("getting launch template configs, %w", err), "Error getting launch template configs")
}
if err := p.checkODFallback(nodeClaim, instanceTypes, launchTemplateConfigs); err != nil {
log.FromContext(ctx).Error(err, "failed while checking on-demand fallback")
Expand Down Expand Up @@ -248,6 +246,7 @@ func (p *DefaultProvider) launchInstance(ctx context.Context, nodeClass *v1.EC2N
createFleetOutput, err := p.ec2Batcher.CreateFleet(ctx, createFleetInput)
p.subnetProvider.UpdateInflightIPs(createFleetInput, createFleetOutput, instanceTypes, lo.Values(zonalSubnets), capacityType)
if err != nil {
+conditionMessage := "Error creating fleet"
if awserrors.IsLaunchTemplateNotFound(err) {
for _, lt := range launchTemplateConfigs {
p.launchTemplateProvider.InvalidateCache(ctx, aws.ToString(lt.LaunchTemplateSpecification.LaunchTemplateName), aws.ToString(lt.LaunchTemplateSpecification.LaunchTemplateId))
Expand All @@ -256,9 +255,9 @@ func (p *DefaultProvider) launchInstance(ctx context.Context, nodeClass *v1.EC2N
}
var reqErr *awshttp.ResponseError
if errors.As(err, &reqErr) {
-return ec2types.CreateFleetInstance{}, fmt.Errorf("creating fleet %w (%v)", err, reqErr.ServiceRequestID())
+return ec2types.CreateFleetInstance{}, cloudprovider.NewCreateError(fmt.Errorf("creating fleet %w (%v)", err, reqErr.ServiceRequestID()), conditionMessage)
}
-return ec2types.CreateFleetInstance{}, fmt.Errorf("creating fleet %w", err)
+return ec2types.CreateFleetInstance{}, cloudprovider.NewCreateError(fmt.Errorf("creating fleet %w", err), conditionMessage)
}
p.updateUnavailableOfferingsCache(ctx, createFleetOutput.Errors, capacityType)
if len(createFleetOutput.Instances) == 0 || len(createFleetOutput.Instances[0].InstanceIds) == 0 {
Expand Down Expand Up @@ -503,5 +502,5 @@ func combineFleetErrors(fleetErrs []ec2types.CreateFleetError) (errs error) {
if iceErrorCount == len(fleetErrs) {
return cloudprovider.NewInsufficientCapacityError(fmt.Errorf("with fleet error(s), %w", errs))
}
-return fmt.Errorf("with fleet error(s), %w", errs)
+return cloudprovider.NewCreateError(errs, "Error creating fleet")
}

0 comments on commit ea2254d

Please sign in to comment.