From dc896f930395d04f4ab7848723644413f47124c3 Mon Sep 17 00:00:00 2001 From: Alexey Makhov Date: Tue, 21 May 2024 14:01:02 +0300 Subject: [PATCH] K0sControlPlane update strategies Signed-off-by: Alexey Makhov --- .github/workflows/go.yml | 1 + .github/workflows/prepare-build-env.sh | 20 ++++ api/controlplane/v1beta1/k0s_types.go | 2 +- .../bootstrap/providerid_controller.go | 3 +- internal/controller/controlplane/helper.go | 16 ++- .../k0s_controlplane_controller.go | 32 ++--- internal/util/dynamic_config.go | 11 +- inttest/Makefile | 3 +- inttest/Makefile.variables | 2 +- ..._machine_template_update_recreate_test.go} | 111 +++++++++++++----- .../capi_remote_machine_template_test.go | 8 +- inttest/util/docker.go | 19 +++ 12 files changed, 165 insertions(+), 63 deletions(-) create mode 100755 .github/workflows/prepare-build-env.sh rename inttest/{capi-docker-machine-template-update-rollout/capi_docker_machine_template_update_rollout_test.go => capi-docker-machine-template-update-recreate/capi_docker_machine_template_update_recreate_test.go} (74%) create mode 100644 inttest/util/docker.go diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index d3daa7988..bbfd9eb26 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -159,6 +159,7 @@ jobs: - check-capi-controlplane-docker-worker - check-capi-remote-machine-template-update - check-capi-docker-machine-template-update + - check-capi-docker-machine-template-update-recreate - check-capi-remote-machine-job-provision uses: ./.github/workflows/capi-smoke-tests.yml diff --git a/.github/workflows/prepare-build-env.sh b/.github/workflows/prepare-build-env.sh new file mode 100755 index 000000000..f109421c5 --- /dev/null +++ b/.github/workflows/prepare-build-env.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env sh + +set -eu + +goVersion="$(./vars.sh go_version)" +golangciLintVersion="$(./vars.sh FROM=hack/tools golangci-lint_version)" +pythonVersion="$(./vars.sh FROM=docs python_version)" + +cat 
<<EOF >>"$GITHUB_ENV" +GO_VERSION=$goVersion +GOLANGCI_LINT_VERSION=$golangciLintVersion +PYTHON_VERSION=$pythonVersion +EOF + +# shellcheck disable=SC1090 +. "$GITHUB_ENV" + +echo ::group::OS Environment +env | sort +echo ::endgroup:: diff --git a/api/controlplane/v1beta1/k0s_types.go b/api/controlplane/v1beta1/k0s_types.go index eb131f620..ce8a3d59f 100644 --- a/api/controlplane/v1beta1/k0s_types.go +++ b/api/controlplane/v1beta1/k0s_types.go @@ -31,7 +31,7 @@ func init() { type UpdateStrategy string const ( - UpdateInPlace UpdateStrategy = "InPlace" + UpdateInPlace UpdateStrategy = "InPlace" UpdateRecreate UpdateStrategy = "Recreate" ) diff --git a/internal/controller/bootstrap/providerid_controller.go b/internal/controller/bootstrap/providerid_controller.go index 896de8096..31294f929 100644 --- a/internal/controller/bootstrap/providerid_controller.go +++ b/internal/controller/bootstrap/providerid_controller.go @@ -48,7 +48,8 @@ func (p *ProviderIDController) Reconcile(ctx context.Context, req ctrl.Request) } if machine.Spec.ProviderID == nil || *machine.Spec.ProviderID == "" { - return ctrl.Result{}, fmt.Errorf("waiting for providerID for the machine %s/%s", machine.Namespace, machine.Name) + log.Info("waiting for providerID for the machine " + machine.Name) + return ctrl.Result{RequeueAfter: time.Second * 10}, nil } cluster, err := capiutil.GetClusterByName(ctx, p.Client, machine.Namespace, machine.Spec.ClusterName) diff --git a/internal/controller/controlplane/helper.go b/internal/controller/controlplane/helper.go index d0042df19..01317048c 100644 --- a/internal/controller/controlplane/helper.go +++ b/internal/controller/controlplane/helper.go @@ -189,12 +189,20 @@ func (c *K0sController) markChildControlNodeToLeave(ctx context.Context, name st err := clientset.RESTClient(). Patch(types.MergePatchType). - AbsPath("/apis/autopilot.k0sproject.io/v1beta2/controlnodes/" + name). - Body([]byte(`{"metadata":{"annotations":{"k0smotron.io/leave":"true"}}}`)). 
+ AbsPath("/apis/etcd.k0sproject.io/v1beta1/etcdmembers/" + name). + Body([]byte(`{"spec":{"leave":"true"}}`)). Do(ctx). Error() - if err != nil && !apierrors.IsNotFound(err) { - return fmt.Errorf("error marking control node to leave: %w", err) + if err != nil { + err := clientset.RESTClient(). + Patch(types.MergePatchType). + AbsPath("/apis/autopilot.k0sproject.io/v1beta2/controlnodes/" + name). + Body([]byte(`{"metadata":{"annotations":{"k0smotron.io/leave":"true"}}}`)). + Do(ctx). + Error() + if err != nil && !apierrors.IsNotFound(err) { + return fmt.Errorf("error marking control node to leave: %w", err) + } } return nil diff --git a/internal/controller/controlplane/k0s_controlplane_controller.go b/internal/controller/controlplane/k0s_controlplane_controller.go index 6d8dd1a14..00a288b82 100644 --- a/internal/controller/controlplane/k0s_controlplane_controller.go +++ b/internal/controller/controlplane/k0s_controlplane_controller.go @@ -20,12 +20,12 @@ import ( "context" "errors" "fmt" - "strings" - "time" - "github.com/Masterminds/semver" "github.com/google/uuid" autopilot "github.com/k0sproject/k0s/pkg/apis/autopilot/v1beta2" + bootstrapv1 "github.com/k0sproject/k0smotron/api/bootstrap/v1beta1" + cpv1beta1 "github.com/k0sproject/k0smotron/api/controlplane/v1beta1" + "github.com/k0sproject/k0smotron/internal/controller/util" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" @@ -46,10 +46,7 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" - - bootstrapv1 "github.com/k0sproject/k0smotron/api/bootstrap/v1beta1" - cpv1beta1 "github.com/k0sproject/k0smotron/api/controlplane/v1beta1" - "github.com/k0sproject/k0smotron/internal/controller/util" + "strings" ) const ( @@ -230,10 +227,8 @@ func (c *K0sController) reconcileMachines(ctx context.Context, cluster *clusterv replicasToReport = kcp.Status.Replicas } - 
fmt.Println("asdfsadfsdafsdafa111") - if kcp.Status.Version != "" && kcp.Spec.Version != kcp.Status.Version { - if kcp.Spec.UpdateStrategy == "rollout" { + if kcp.Spec.UpdateStrategy == cpv1beta1.UpdateRecreate { desiredReplicas += kcp.Spec.Replicas machinesToDelete = int(kcp.Spec.Replicas) replicasToReport = desiredReplicas @@ -243,9 +238,10 @@ func (c *K0sController) reconcileMachines(ctx context.Context, cluster *clusterv return replicasToReport, fmt.Errorf("error getting cluster client set for machine update: %w", err) } - err = c.createAutopilotPlan(ctx, kcp, cluster, kubeClient) - if err != nil { - return replicasToReport, fmt.Errorf("error creating autopilot plan: %w", err) + err = c.createAutopilotPlan(ctx, kcp, cluster, kubeClient) + if err != nil { + return replicasToReport, fmt.Errorf("error creating autopilot plan: %w", err) + } } } @@ -278,11 +274,6 @@ func (c *K0sController) reconcileMachines(ctx context.Context, cluster *clusterv } } - fmt.Println("asdfsadfsdafsdafa") - fmt.Println("machines", machines) - fmt.Println("machinesToDelete", machinesToDelete) - - //var isNewMachineReady bool for _, m := range machines { ver := semver.MustParse(kcp.Spec.Version) fmt.Println("machines ver", machinesToDelete, *m.Spec.Version, fmt.Sprintf("v%d.%d.%d", ver.Major(), ver.Minor(), ver.Patch()), m.Spec.Version != nil && *m.Spec.Version != fmt.Sprintf("v%d.%d.%d", ver.Major(), ver.Minor(), ver.Patch())) @@ -291,13 +282,12 @@ func (c *K0sController) reconcileMachines(ctx context.Context, cluster *clusterv } if machinesToDelete > 0 { - kubeClient, err := c.getKubeClient(ctx, cluster) if err != nil { return replicasToReport, fmt.Errorf("error getting cluster client set for machine update: %w", err) } var cn autopilot.ControlNode - err = kubeClient.RESTClient().Get().AbsPath("/apis/autopilot.k0sproject.io/v1beta2/controlnodes").Name(m.Name).Do(ctx).Into(&cn) + err = kubeClient.RESTClient().Get().AbsPath("/apis/autopilot.k0sproject.io/v1beta2/controlnodes/" + 
m.Name).Do(ctx).Into(&cn) fmt.Println("machines !!!", cn.Name, cn.Status) if err != nil { if apierrors.IsNotFound(err) { @@ -340,7 +330,7 @@ func (c *K0sController) reconcileMachines(ctx context.Context, cluster *clusterv return kcp.Status.Replicas, fmt.Errorf("waiting for previous machine to be deleted") } - time.Sleep(time.Second * 10) + //time.Sleep(time.Second * 10) replicasToReport -= 1 name := machine.Name diff --git a/internal/util/dynamic_config.go b/internal/util/dynamic_config.go index 7a00ad0b5..4faef7ac9 100644 --- a/internal/util/dynamic_config.go +++ b/internal/util/dynamic_config.go @@ -3,8 +3,11 @@ package util import ( "context" "fmt" + "time" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/util/retry" "sigs.k8s.io/cluster-api/controllers/remote" "sigs.k8s.io/cluster-api/util" @@ -31,7 +34,12 @@ func ReconcileDynamicConfig(ctx context.Context, cluster metav1.Object, cli clie return fmt.Errorf("failed to create workload cluster client: %w", err) } - err = retry.OnError(retry.DefaultBackoff, func(err error) bool { + err = retry.OnError(wait.Backoff{ + Steps: 4, + Duration: 100 * time.Millisecond, + Factor: 5.0, + Jitter: 0.5, + }, func(err error) bool { return true }, func() error { return chCS.Patch(ctx, u, client.RawPatch(client.Merge.Type(), b), []client.PatchOption{}...) @@ -39,7 +47,6 @@ func ReconcileDynamicConfig(ctx context.Context, cluster metav1.Object, cli clie if err != nil { return fmt.Errorf("failed to patch k0s config: %w", err) } - //return chCS.Patch(ctx, u, client.RawPatch(client.Merge.Type(), b), []client.PatchOption{}...) 
return nil } diff --git a/inttest/Makefile b/inttest/Makefile index 3c9d763bd..83cda6257 100644 --- a/inttest/Makefile +++ b/inttest/Makefile @@ -48,4 +48,5 @@ check-capi-remote-machine-template: TIMEOUT=12m check-capi-remote-machine-template-update: TIMEOUT=10m check-capi-docker-machine-template-update: TIMEOUT=10m check-capi-remote-machine-job-provision: TIMEOUT=10m -check-capi-docker-machine-template-update-rollout: TIMEOUT=10m +check-capi-docker-machine-template-update: TIMEOUT=10m +check-capi-docker-machine-template-update-recreate: TIMEOUT=10m diff --git a/inttest/Makefile.variables b/inttest/Makefile.variables index 8b99c2ad7..31489cc05 100644 --- a/inttest/Makefile.variables +++ b/inttest/Makefile.variables @@ -30,4 +30,4 @@ smoketests := \ check-capi-remote-machine-template \ check-capi-remote-machine-template-update \ check-capi-docker-machine-template-update \ - check-capi-docker-machine-template-update-rollout \ + check-capi-docker-machine-template-update-recreate \ diff --git a/inttest/capi-docker-machine-template-update-rollout/capi_docker_machine_template_update_rollout_test.go b/inttest/capi-docker-machine-template-update-recreate/capi_docker_machine_template_update_recreate_test.go similarity index 74% rename from inttest/capi-docker-machine-template-update-rollout/capi_docker_machine_template_update_rollout_test.go rename to inttest/capi-docker-machine-template-update-recreate/capi_docker_machine_template_update_recreate_test.go index 45a04650e..f5612c28e 100644 --- a/inttest/capi-docker-machine-template-update-rollout/capi_docker_machine_template_update_rollout_test.go +++ b/inttest/capi-docker-machine-template-update-recreate/capi_docker_machine_template_update_recreate_test.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package capidockermachinetemplateupdaterollout +package capidockermachinetemplateupdaterecreate import ( "context" @@ -38,8 +38,8 @@ import ( "k8s.io/client-go/tools/clientcmd" ) -type CAPIDockerMachineTemplateUpdateRollout struct { - //type CAPIDockerMachineTemplateUpdateRollout struct { +type CAPIDockerMachineTemplateUpdateRecreate struct { + //type CAPIDockerMachineTemplateUpdateRecreate struct { suite.Suite client *kubernetes.Clientset restConfig *rest.Config @@ -48,12 +48,12 @@ type CAPIDockerMachineTemplateUpdateRollout struct { ctx context.Context } -func TestCAPIDockerMachineTemplateUpdateRollout(t *testing.T) { - s := CAPIDockerMachineTemplateUpdateRollout{} +func TestCAPIDockerMachineTemplateUpdateRecreate(t *testing.T) { + s := CAPIDockerMachineTemplateUpdateRecreate{} suite.Run(t, &s) } -func (s *CAPIDockerMachineTemplateUpdateRollout) SetupSuite() { +func (s *CAPIDockerMachineTemplateUpdateRecreate) SetupSuite() { kubeConfigPath := os.Getenv("KUBECONFIG") s.Require().NotEmpty(kubeConfigPath, "KUBECONFIG env var must be set and point to kind cluster") // Get kube client from kubeconfig @@ -77,7 +77,7 @@ func (s *CAPIDockerMachineTemplateUpdateRollout) SetupSuite() { s.ctx, _ = util.NewSuiteContext(s.T()) } -func (s *CAPIDockerMachineTemplateUpdateRollout) TestCAPIControlPlaneDockerDownScaling() { +func (s *CAPIDockerMachineTemplateUpdateRecreate) TestCAPIControlPlaneDockerDownScaling() { // Apply the child cluster objects s.applyClusterObjects() @@ -117,28 +117,83 @@ func (s *CAPIDockerMachineTemplateUpdateRollout) TestCAPIControlPlaneDockerDownS }) s.Require().NoError(err) - //for i := 0; i < 3; i++ { - // // nolint:staticcheck - // err = wait.PollImmediateUntilWithContext(s.ctx, 1*time.Second, func(ctx context.Context) (bool, error) { - // nodeName := fmt.Sprintf("docker-test-%d", i) - // output, err := exec.Command("docker", "exec", nodeName, "k0s", "status").Output() - // if err != nil { - // return false, nil - // } - // - // return 
strings.Contains(string(output), "Version:"), nil - // }) - // s.Require().NoError(err) - //} - time.Sleep(time.Minute * 3) + var nodeIDs []string + // nolint:staticcheck + err = wait.PollImmediateUntilWithContext(s.ctx, 1*time.Second, func(ctx context.Context) (bool, error) { + var err error + nodeIDs, err = util.GetControlPlaneNodesIDs("docker-test-") + + if err != nil { + return false, nil + } + + return len(nodeIDs) == 3, nil + }) + + for i := 0; i < 3; i++ { + // nolint:staticcheck + err = wait.PollImmediateUntilWithContext(s.ctx, 1*time.Second, func(ctx context.Context) (bool, error) { + nodeID := nodeIDs[i] + output, err := exec.Command("docker", "exec", nodeID, "k0s", "status").Output() + if err != nil { + return false, nil + } + + return strings.Contains(string(output), "Version:"), nil + }) + s.Require().NoError(err) + } s.T().Log("waiting for node to be ready") s.Require().NoError(k0stestutil.WaitForNodeReadyStatus(s.ctx, kmcKC, "docker-test-worker-0", corev1.ConditionTrue)) + s.T().Log("updating cluster objects") s.updateClusterObjects() + + // nolint:staticcheck + err = wait.PollImmediateUntilWithContext(s.ctx, 100*time.Millisecond, func(ctx context.Context) (bool, error) { + var err error + newNodeIDs, err := util.GetControlPlaneNodesIDs("docker-test-") + + if err != nil { + return false, nil + } + + return len(newNodeIDs) == 6, nil + }) + + //for i := range nodeIDs { + // out, err := exec.Command("docker", "stop", nodeIDs[i]).CombinedOutput() + // s.Require().NoError(err, "failed to stop node: %s", string(out)) + //} + + // nolint:staticcheck + err = wait.PollImmediateUntilWithContext(s.ctx, 100*time.Millisecond, func(ctx context.Context) (bool, error) { + var err error + nodeIDs, err = util.GetControlPlaneNodesIDs("docker-test-") + + if err != nil { + return false, nil + } + + return len(nodeIDs) == 3, nil + }) + + // nolint:staticcheck + err = wait.PollImmediateUntilWithContext(s.ctx, 1*time.Second, func(ctx context.Context) (bool, error) { + var 
err error + nodeIDs, err = util.GetControlPlaneNodesIDs("docker-test-") + + if err != nil { + return false, nil + } + + return len(nodeIDs) == 3, nil + }) + // nolint:staticcheck err = wait.PollImmediateUntilWithContext(s.ctx, 1*time.Second, func(ctx context.Context) (bool, error) { - output, err := exec.Command("docker", "exec", "docker-test-0", "k0s", "status").CombinedOutput() + output, err := exec.Command("docker", "exec", nodeIDs[0], "k0s", "status").CombinedOutput() if err != nil { return false, nil } @@ -146,23 +201,21 @@ func (s *CAPIDockerMachineTemplateUpdateRollout) TestCAPIControlPlaneDockerDownS return strings.Contains(string(output), "Version: v1.28"), nil }) s.Require().NoError(err) - - s.Require().NoError(k0stestutil.WaitForNodeReadyStatus(s.ctx, kmcKC, "docker-test-worker-0", corev1.ConditionTrue)) } -func (s *CAPIDockerMachineTemplateUpdateRollout) applyClusterObjects() { +func (s *CAPIDockerMachineTemplateUpdateRecreate) applyClusterObjects() { // Exec via kubectl out, err := exec.Command("kubectl", "apply", "-f", s.clusterYamlsPath).CombinedOutput() s.Require().NoError(err, "failed to apply cluster objects: %s", string(out)) } -func (s *CAPIDockerMachineTemplateUpdateRollout) updateClusterObjects() { +func (s *CAPIDockerMachineTemplateUpdateRecreate) updateClusterObjects() { // Exec via kubectl out, err := exec.Command("kubectl", "apply", "-f", s.clusterYamlsUpdatePath).CombinedOutput() s.Require().NoError(err, "failed to update cluster objects: %s", string(out)) } -func (s *CAPIDockerMachineTemplateUpdateRollout) deleteCluster() { +func (s *CAPIDockerMachineTemplateUpdateRecreate) deleteCluster() { // Exec via kubectl out, err := exec.Command("kubectl", "delete", "-f", s.clusterYamlsPath).CombinedOutput() s.Require().NoError(err, "failed to delete cluster objects: %s", string(out)) @@ -223,7 +276,7 @@ metadata: spec: replicas: 3 version: v1.27.1+k0s.0 - updateStrategy: rollout + updateStrategy: Recreate k0sConfigSpec: k0s: apiVersion: 
k0s.k0sproject.io/v1beta1 @@ -293,7 +346,7 @@ metadata: spec: replicas: 3 version: v1.28.7+k0s.0 - updateStrategy: rollout + updateStrategy: Recreate k0sConfigSpec: k0s: apiVersion: k0s.k0sproject.io/v1beta1 diff --git a/inttest/capi-remote-machine-template/capi_remote_machine_template_test.go b/inttest/capi-remote-machine-template/capi_remote_machine_template_test.go index abe9de585..92069bed5 100644 --- a/inttest/capi-remote-machine-template/capi_remote_machine_template_test.go +++ b/inttest/capi-remote-machine-template/capi_remote_machine_template_test.go @@ -156,7 +156,8 @@ func (s *RemoteMachineTemplateSuite) TestCAPIRemoteMachine() { err = wait.PollImmediateUntilWithContext(ctx, 1*time.Second, func(ctx context.Context) (bool, error) { rm, err := s.findRemoteMachines("default") if err != nil { - return false, err + s.T().Errorf("failed to list RemoteMachines: %v", err) + return false, nil } if len(rm) == 0 { @@ -183,13 +184,14 @@ func (s *RemoteMachineTemplateSuite) TestCAPIRemoteMachine() { s.T().Log("waiting for node to be ready") s.Require().NoError(common.WaitForNodeReadyStatus(ctx, kmcKC, rmName, corev1.ConditionTrue)) + s.T().Log("waiting for node to have the correct providerID") err = wait.PollUntilContextCancel(ctx, time.Second, true, func(ctx context.Context) (done bool, err error) { - node, err := kmcKC.CoreV1().Nodes().Get(ctx, "remote-test-0", metav1.GetOptions{}) + node, err := kmcKC.CoreV1().Nodes().Get(ctx, rmName, metav1.GetOptions{}) if err != nil { return false, err } - return node.Labels["k0smotron.io/machine-name"] == "remote-test-0" && node.Spec.ProviderID == expectedProviderID, nil + return node.Labels["k0smotron.io/machine-name"] == rmName && node.Spec.ProviderID == expectedProviderID, nil }) s.Require().NoError(err) diff --git a/inttest/util/docker.go b/inttest/util/docker.go new file mode 100644 index 000000000..bab6dcb59 --- /dev/null +++ b/inttest/util/docker.go @@ -0,0 +1,19 @@ +package util + +import ( + "fmt" + "os/exec" + 
"strings" +) + +func GetControlPlaneNodesIDs(prefix string) ([]string, error) { + out, err := exec.Command("/bin/sh", "-c", fmt.Sprintf(`docker ps | grep %s | grep -v "\-lb" | grep -v worker | awk '{print $1}'`, prefix)).Output() + if err != nil { + return nil, err + } + + if string(out) == "" { + return []string{}, nil + } + return strings.Split(strings.Trim(string(out), "\n "), "\n"), nil +}