Skip to content

Commit

Permalink
feat: add config variable POD_TERMINATION_GRACE_PERIOD (#29)
Browse files Browse the repository at this point in the history
Fixes #28
  • Loading branch information
LaCodon authored Sep 18, 2022
1 parent fc91d4a commit 8f0207a
Show file tree
Hide file tree
Showing 6 changed files with 56 additions and 43 deletions.
27 changes: 14 additions & 13 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,19 +40,20 @@ Therefore, this application will not run into any issues if it is restarted, res

## Usage

| Environment variable | Description | Required | Default |
|:-------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:---------|:------------|
| CLUSTER_NAME | Name of the eks-cluster, used in place of `AUTODISCOVERY_TAGS` and `AUTO_SCALING_GROUP_NAMES`. Checks for `k8s.io/cluster-autoscaler/<CLUSTER_NAME>: owned` and `k8s.io/cluster-autoscaler/enabled: true` tags on ASG | yes | `""` |
| AUTODISCOVERY_TAGS | Comma separated key value string with tags to autodiscover ASGs, used in place of `CLUSTER_NAME` and `AUTO_SCALING_GROUP_NAMES`. | yes | `""` |
| AUTO_SCALING_GROUP_NAMES | Comma-separated list of ASGs, CLUSTER_NAME takes priority. | yes | `""` |
| IGNORE_DAEMON_SETS | Whether to ignore DaemonSets when draining the nodes | no | `true` |
| DELETE_EMPTY_DIR_DATA | Whether to delete empty dir data when draining the nodes | no | `true` |
| AWS_REGION | Self-explanatory | no | `us-west-2` |
| ENVIRONMENT | If set to `dev`, will try to create the Kubernetes client using your local kubeconfig. Any other values will use the in-cluster configuration | no | `""` |
| EXECUTION_INTERVAL | Duration to sleep between each execution in seconds | no | `20` |
| EXECUTION_TIMEOUT | Maximum execution duration before timing out in seconds | no | `900` |
| METRICS_PORT | Port to bind metrics server to | no | `8080` |
| METRICS | Expose metrics in Prometheus format at `:${METRICS_PORT}/metrics` | no | `""` |
| Environment variable | Description | Required | Default |
|:-----------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:---------|:------------|
| CLUSTER_NAME | Name of the eks-cluster, used in place of `AUTODISCOVERY_TAGS` and `AUTO_SCALING_GROUP_NAMES`. Checks for `k8s.io/cluster-autoscaler/<CLUSTER_NAME>: owned` and `k8s.io/cluster-autoscaler/enabled: true` tags on ASG | yes | `""` |
| AUTODISCOVERY_TAGS | Comma separated key value string with tags to autodiscover ASGs, used in place of `CLUSTER_NAME` and `AUTO_SCALING_GROUP_NAMES`. | yes | `""` |
| AUTO_SCALING_GROUP_NAMES | Comma-separated list of ASGs, CLUSTER_NAME takes priority. | yes | `""` |
| IGNORE_DAEMON_SETS | Whether to ignore DaemonSets when draining the nodes | no | `true` |
| DELETE_EMPTY_DIR_DATA | Whether to delete empty dir data when draining the nodes | no | `true` |
| AWS_REGION | Self-explanatory | no | `us-west-2` |
| ENVIRONMENT | If set to `dev`, will try to create the Kubernetes client using your local kubeconfig. Any other values will use the in-cluster configuration | no | `""` |
| EXECUTION_INTERVAL | Duration to sleep between each execution in seconds | no | `20` |
| EXECUTION_TIMEOUT | Maximum execution duration before timing out in seconds | no | `900` |
| POD_TERMINATION_GRACE_PERIOD | How long to wait for a pod to terminate in seconds; 0 means "delete immediately"; set to a negative value to use the pod's terminationGracePeriodSeconds. | no | `-1` |
| METRICS_PORT | Port to bind metrics server to | no | `8080` |
| METRICS | Expose metrics in Prometheus format at `:${METRICS_PORT}/metrics` | no | `""` |


## Metrics
Expand Down
60 changes: 36 additions & 24 deletions config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,33 +12,35 @@ import (
var cfg *config

// Names of the environment variables that configure the application.
const (
EnvEnvironment = "ENVIRONMENT"
EnvDebug = "DEBUG"
EnvIgnoreDaemonSets = "IGNORE_DAEMON_SETS"
EnvDeleteLocalData = "DELETE_LOCAL_DATA" // Deprecated: in favor of DeleteEmptyDirData (DELETE_EMPTY_DIR_DATA)
EnvDeleteEmptyDirData = "DELETE_EMPTY_DIR_DATA"
EnvClusterName = "CLUSTER_NAME"
EnvAutodiscoveryTags = "AUTODISCOVERY_TAGS"
EnvAutoScalingGroupNames = "AUTO_SCALING_GROUP_NAMES"
EnvAwsRegion = "AWS_REGION"
EnvExecutionInterval = "EXECUTION_INTERVAL"
EnvExecutionTimeout = "EXECUTION_TIMEOUT"
EnvMetrics = "METRICS"
EnvMetricsPort = "METRICS_PORT"
EnvEnvironment = "ENVIRONMENT"
EnvDebug = "DEBUG"
EnvIgnoreDaemonSets = "IGNORE_DAEMON_SETS"
EnvDeleteLocalData = "DELETE_LOCAL_DATA" // Deprecated: in favor of DeleteEmptyDirData (DELETE_EMPTY_DIR_DATA)
EnvDeleteEmptyDirData = "DELETE_EMPTY_DIR_DATA"
EnvClusterName = "CLUSTER_NAME"
EnvAutodiscoveryTags = "AUTODISCOVERY_TAGS"
EnvAutoScalingGroupNames = "AUTO_SCALING_GROUP_NAMES"
EnvAwsRegion = "AWS_REGION"
EnvExecutionInterval = "EXECUTION_INTERVAL"
EnvExecutionTimeout = "EXECUTION_TIMEOUT"
EnvPodTerminationGracePeriod = "POD_TERMINATION_GRACE_PERIOD" // Pod termination grace period in seconds used when draining; a negative value means the pod's own terminationGracePeriodSeconds
EnvMetrics = "METRICS"
EnvMetricsPort = "METRICS_PORT"
)

// config holds the application's settings; Initialize fills it in from
// environment variables and applies the defaults noted on each field.
type config struct {
Environment string // Optional
Debug bool // Defaults to false
AutoScalingGroupNames []string // Required if AutodiscoveryTags not provided
AutodiscoveryTags string // Required if AutoScalingGroupNames not provided
AwsRegion string // Defaults to us-west-2
IgnoreDaemonSets bool // Defaults to true
DeleteEmptyDirData bool // Defaults to true
ExecutionInterval time.Duration // Defaults to 20s
ExecutionTimeout time.Duration // Defaults to 900s
Metrics bool // Defaults to false
MetricsPort int // Defaults to 8080
Environment string // Optional
Debug bool // Defaults to false
AutoScalingGroupNames []string // Required if AutodiscoveryTags not provided
AutodiscoveryTags string // Required if AutoScalingGroupNames not provided
AwsRegion string // Defaults to us-west-2
IgnoreDaemonSets bool // Defaults to true
DeleteEmptyDirData bool // Defaults to true
ExecutionInterval time.Duration // Defaults to 20s
ExecutionTimeout time.Duration // Defaults to 900s
PodTerminationGracePeriod int // In seconds; defaults to -1 (use the pod's terminationGracePeriodSeconds), 0 means delete immediately
Metrics bool // Defaults to false
MetricsPort int // Defaults to 8080
}

// Initialize is used to initialize the application's configuration
Expand Down Expand Up @@ -111,6 +113,16 @@ func Initialize() error {
log.Printf("Environment variable '%s' not specified, defaulting to 900 seconds", EnvExecutionTimeout)
cfg.ExecutionTimeout = time.Second * 900
}
if terminationGracePeriod := os.Getenv(EnvPodTerminationGracePeriod); len(terminationGracePeriod) > 0 {
if gracePeriod, err := strconv.Atoi(terminationGracePeriod); err != nil {
return fmt.Errorf("environment variable '%s' must be an integer", EnvPodTerminationGracePeriod)
} else {
cfg.PodTerminationGracePeriod = gracePeriod
}
} else {
log.Printf("Environment variable '%s' not specified, defaulting to -1 (pod's terminationGracePeriodSeconds)", EnvPodTerminationGracePeriod)
cfg.PodTerminationGracePeriod = -1
}
return nil
}

Expand Down
6 changes: 3 additions & 3 deletions k8s/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ type ClientAPI interface {
GetNodeByAutoScalingInstance(instance *autoscaling.Instance) (*v1.Node, error)
FilterNodeByAutoScalingInstance(nodes []v1.Node, instance *autoscaling.Instance) (*v1.Node, error)
UpdateNode(node *v1.Node) error
Drain(nodeName string, ignoreDaemonSets, deleteEmptyDirData bool) error
Drain(nodeName string, ignoreDaemonSets, deleteEmptyDirData bool, podTerminationGracePeriod int) error
}

type Client struct {
Expand Down Expand Up @@ -108,7 +108,7 @@ func (k *Client) UpdateNode(node *v1.Node) error {
}

// Drain gracefully deletes all pods from a given node
func (k *Client) Drain(nodeName string, ignoreDaemonSets, deleteEmptyDirData bool) error {
func (k *Client) Drain(nodeName string, ignoreDaemonSets, deleteEmptyDirData bool, podTerminationGracePeriod int) error {
node, err := k.client.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{})
if err != nil {
return err
Expand All @@ -118,7 +118,7 @@ func (k *Client) Drain(nodeName string, ignoreDaemonSets, deleteEmptyDirData boo
Force: true,
IgnoreAllDaemonSets: ignoreDaemonSets,
DeleteEmptyDirData: deleteEmptyDirData,
GracePeriodSeconds: -1,
GracePeriodSeconds: podTerminationGracePeriod,
Timeout: 5 * time.Minute,
Ctx: context.TODO(),
Out: drainLogger{NodeName: nodeName},
Expand Down
2 changes: 1 addition & 1 deletion k8s/client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import (
func TestClient_Drain(t *testing.T) {
fakeKubernetesClient := fakekubernetes.NewSimpleClientset(&v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "default"}})
kc := NewClient(fakeKubernetesClient)
err := kc.Drain("default", true, true)
err := kc.Drain("default", true, true, -1)
if err != nil {
t.Errorf("Unexpected error: %v", err)
}
Expand Down
2 changes: 1 addition & 1 deletion k8stest/k8stest.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ func (mock *MockClient) UpdateNode(node *v1.Node) error {
return nil
}

// Drain is the mock drain operation: it ignores all of its arguments, records
// the call by incrementing the "Drain" counter, and always returns nil.
func (mock *MockClient) Drain(nodeName string, ignoreDaemonSets, deleteLocalData bool) error {
func (mock *MockClient) Drain(nodeName string, ignoreDaemonSets, deleteLocalData bool, podTerminationGracePeriod int) error {
mock.Counter["Drain"]++
return nil
}
Expand Down
2 changes: 1 addition & 1 deletion main.go
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ func DoHandleRollingUpgrade(client k8s.ClientAPI, ec2Service ec2iface.EC2API, au
log.Printf("[%s][%s] Updated nodes have enough resources available", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId))
if minutesSinceDrained == -1 {
log.Printf("[%s][%s] Draining node", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId))
err := client.Drain(node.Name, config.Get().IgnoreDaemonSets, config.Get().DeleteEmptyDirData)
err := client.Drain(node.Name, config.Get().IgnoreDaemonSets, config.Get().DeleteEmptyDirData, config.Get().PodTerminationGracePeriod)
if err != nil {
metrics.Server.Errors.Inc()
log.Printf("[%s][%s] Skipping because ran into error while draining node: %v", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.StringValue(outdatedInstance.InstanceId), err.Error())
Expand Down

0 comments on commit 8f0207a

Please sign in to comment.