From c685429be8d2e150c2b56ab1602a0edb83d102a6 Mon Sep 17 00:00:00 2001
From: Salvatore Daniele
Date: Wed, 16 Oct 2024 09:57:40 -0400
Subject: [PATCH] extraConfigDpu: start p4 pod w/ retry

Signed-off-by: Salvatore Daniele
---
Review notes (not part of the commit message):
- ensure_p4_pod_running() gives up after a fixed number of start attempts
  instead of retrying forever when the container never comes up.
- The extraConfigDpu.py post-image blob id on the index line predates that
  change.

 common.py         |  2 +-
 extraConfigDpu.py | 39 +++++++++++++++++++++++++++++++--------
 2 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/common.py b/common.py
index b3a74b2b..60c7af75 100644
--- a/common.py
+++ b/common.py
@@ -72,7 +72,7 @@ def wrapped_init(self, *args, **argv):  # type: ignore
             name = field.name
             value = getattr(self, name)
             type_hint = field.type
-            if not check_type(value, type_hint):
+            if not check_type(value, type_hint):  # type: ignore
                 raise TypeError(f"Expected type '{type_hint}' for attribute '{name}' but received type '{type(value)}')")
 
         # Normally, data classes support __post_init__(), which is called by __init__()
diff --git a/extraConfigDpu.py b/extraConfigDpu.py
index e9c0facf..a3eaa3b2 100644
--- a/extraConfigDpu.py
+++ b/extraConfigDpu.py
@@ -16,6 +16,7 @@ DPU_OPERATOR_REPO = "https://github.com/openshift/dpu-operator.git"
 MICROSHIFT_KUBECONFIG = "/root/kubeconfig.microshift"
 OSE_DOCKERFILE = "https://pkgs.devel.redhat.com/cgit/containers/dpu-operator/tree/Dockerfile?h=rhaos-4.17-rhel-9"
+P4_IMG = "wsfd-advnetlab239.anl.eng.bos2.dc.redhat.com:5000/intel-ipu-p4-sdk:10-9-2024"
 
 KERNEL_RPMS = [
     "https://download-01.beak-001.prod.iad2.dc.redhat.com/brewroot/vol/rhel-9/packages/kernel/5.14.0/427.2.1.el9_4/x86_64/kernel-5.14.0-427.2.1.el9_4.x86_64.rpm",
@@ -130,6 +131,7 @@ def dpu_operator_start(client: K8sClient, repo: Optional[str]) -> None:
     logger.info("Waiting for all dpu operator pods to become ready")
     time.sleep(30)
     client.oc_run_or_die("wait --for=condition=Ready pod --all -n openshift-dpu-operator --timeout=5m")
+    wait_vsp_ds_running(client)
 
 
 def wait_vsp_ds_running(client: K8sClient) -> None:
@@ -148,6 +150,33 @@ def wait_vsp_ds_running(client: K8sClient) -> None:
         logger.error_and_exit("Vsp pods failed to reach ready state")
 
 
+def ensure_p4_pod_running(lh: host.Host, acc: host.Host, imgReg: ImageRegistry) -> None:
+    """Push the P4 SDK image to the local registry and start it on the ACC.
+
+    The image is pulled, tagged and pushed through ``lh``; the container is
+    then started with podman on ``acc``.  The start is retried a bounded
+    number of times, and ``logger.error_and_exit`` is called if the
+    container never shows up in ``podman ps``.
+    """
+    lh.run_or_die(f"podman pull --tls-verify=false {P4_IMG}")
+    lh.run_or_die(f"podman tag {P4_IMG} {imgReg.url()}/intel-ipu-p4-sdk:10-9-2024")
+    lh.run_or_die(f"podman push {imgReg.url()}/intel-ipu-p4-sdk:10-9-2024")
+    uname = acc.run("uname -r").out.strip()
+    logger.info("Manually starting P4 container")
+    cmd = f"podman run --network host -d --privileged --entrypoint='[\"/bin/sh\", \"-c\", \"sleep 5; sh /entrypoint.sh\"]' -v /lib/modules/{uname}:/lib/modules/{uname} -v data1:/opt/p4 {imgReg.url()}/intel-ipu-p4-sdk:10-9-2024"
+    # Occasionally the P4 pod fails to start, so retry; the attempts are capped
+    # so that a container that can never start does not hang the deployment.
+    retries = 10
+    for attempt in range(retries):
+        if attempt > 0:
+            logger.info("Failed to start p4 container, retrying")
+        acc.run_or_die(cmd)
+        time.sleep(10)
+        if "intel-ipu-p4-sdk:10-9-2024" in acc.run("podman ps").out:
+            return
+    logger.error_and_exit("P4 container failed to start")
+
+
 def ExtraConfigDpu(cc: ClustersConfig, cfg: ExtraConfigArgs, futures: dict[str, Future[Optional[host.Result]]]) -> None:
     [f.result() for (_, f) in futures.items()]
     logger.info("Running post config step to start DPU operator on IPU")
@@ -171,14 +200,8 @@ def ExtraConfigDpu(cc: ClustersConfig, cfg: ExtraConfigArgs, futures: dict[str,
     if isinstance(vendor_plugin, IpuPlugin):
         # TODO: Remove when this container is properly started by the vsp
         # We need to manually start the p4 sdk container currently for the IPU plugin
-        p4_img = "wsfd-advnetlab239.anl.eng.bos2.dc.redhat.com:5000/intel-ipu-p4-sdk:10-9-2024"
-        lh.run_or_die(f"podman pull --tls-verify=false {p4_img}")
-        lh.run_or_die(f"podman tag {p4_img} {imgReg.url()}/intel-ipu-p4-sdk:10-9-2024")
-        lh.run_or_die(f"podman push {imgReg.url()}/intel-ipu-p4-sdk:10-9-2024")
-        uname = acc.run("uname -r").out.strip()
-        logger.info("Manually starting P4 container")
-        cmd = f"podman run --network host -d --privileged --entrypoint='[\"/bin/sh\", \"-c\", \"sleep 5; sh /entrypoint.sh\"]' -v /lib/modules/{uname}:/lib/modules/{uname} -v data1:/opt/p4 {imgReg.url()}/intel-ipu-p4-sdk:10-9-2024"
-        acc.run_or_die(cmd)
+        ensure_p4_pod_running(lh, acc, imgReg)
+
     # Build on the ACC since an aarch based server is needed for the build
     # (the Dockerfile needs to be fixed to allow layered multi-arch build
     # by removing the calls to pip)