
Commit

test
Signed-off-by: Salvatore Daniele <[email protected]>
SalDaniele committed Jul 8, 2024
1 parent 3a73ff2 commit 94daae2
Showing 2 changed files with 70 additions and 56 deletions.
121 changes: 65 additions & 56 deletions extraConfigDpu.py
@@ -322,63 +322,72 @@ def ExtraConfigDpuHost(cc: ClustersConfig, cfg: ExtraConfigArgs, futures: dict[s
logger.info("Ensuring local registry is trusted in OCP")
reglocal.ocp_trust(client, reglocal.get_local_registry_base_directory(lh), reglocal.get_local_registry_hostname(lh), 5000)
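# 5000 is presumably the registry's listen port (cf. listen_port in reglocal.ensure_running below)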

h = host.Host(cc.workers[0].node)
vendor_plugin = init_vendor_plugin(h)
vendor_plugin.build_and_start(lh, client, registry)

start_dpu_operator(lh, client, operator_image, daemon_image)
client.oc_run_or_die("wait --for=condition=Ready pod --all -n openshift-dpu-operator --timeout=2m")

def helper(h: host.Host, node: NodeConfig) -> Optional[host.Result]:
# Temporary workaround, remove once 4.16 installations are working
logger.info("Ensuring Rhel 9.4 kernel is installed")
ensure_rhel_9_4_kernel_is_installed(h)
# There is a bug with the idpf driver that causes the IPU to fail to be enumerated over PCIe on boot
# As a result, we will need to trigger cold boots of the node until the device is available
# TODO: Remove when no longer needed
retries = 3
h.ssh_connect("core")
ret = h.run(f"test -d /sys/class/net/{cfg.dpu_net_interface}")
while ret.returncode != 0:
logger.error(f"{h.hostname()} does not have a network device {cfg.dpu_net_interface} cold booting node to try to recover")
h.cold_boot()
logger.info("Cold boot triggered, waiting for host to reboot")
time.sleep(60)
h.ssh_connect("core")
retries = retries - 1
if retries == 0:
logger.error_and_exit(f"Failed to bring up IPU net device on {h.hostname()}")
ret = h.run(f"test -d /sys/class/net/{cfg.dpu_net_interface}")

# Label the node
logger.info(f"labeling node {h.hostname()} dpu=true")
client.oc_run_or_die(f"label no {e.name} dpu=true")
return None

executor = ThreadPoolExecutor(max_workers=len(cc.workers))
f = []
# Assuming that all workers have a DPU
for e in cc.workers:
logger.info(f"Calling helper function for node {e.node}")
bmc = host.BMC.from_bmc(e.bmc, e.bmc_user, e.bmc_password)
h = host.Host(e.node, bmc)
f.append(executor.submit(helper, h, e))

for thread in f:
logger.info(thread.result())
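# Note: Future.result() blocks until the corresponding helper call finishes and re-raises any exception raised in the worker thread.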

logger.info("Verified idpf is providing net-devs on DPU worker nodes")

# Create host nad
# TODO: Remove when this is automatically created by the dpu operator
logger.info("Creating dpu NAD")
client.oc("delete -f manifests/dpu/dpu_nad.yaml")
client.oc_run_or_die("create -f manifests/dpu/dpu_nad.yaml")
# Deploy dpu daemon and wait for dpu pods to come up
logger.info("Creating dpu operator config")
client.oc_run_or_die(f"create -f {REPO_DIR}/examples/dpu.yaml")
time.sleep(30)
client.oc_run_or_die("wait --for=condition=Ready pod --all -n openshift-dpu-operator --timeout=5m")
logger.info("creating test container")
image = "alpine:latest"
name = "ipu_host_test"
cmd = f"podman pull {image}"
lh.run_or_die(cmd)
cmd = f"podman run -d --name {name} {image} sh -c 'while true; do sleep 1; done'"
lh.run_or_die(cmd)
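
As a hedged follow-up sketch (not part of this commit), the test container could be checked and removed afterwards, reusing lh and name from above and assuming, per the surrounding code, that lh.run returns a result with .out:

status = lh.run(f"podman inspect --format '{{{{.State.Running}}}}' {name}")  # prints 'true' while the container is up
logger.info(f"Test container {name} running: {status.out.strip()}")
lh.run(f"podman rm -f {name}")  # clean up the test container when finished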

# h = host.Host(cc.workers[0].node)
# vendor_plugin = init_vendor_plugin(h)
# vendor_plugin.build_and_start(lh, client, registry)

# start_dpu_operator(lh, client, operator_image, daemon_image)
# client.oc_run_or_die("wait --for=condition=Ready pod --all -n openshift-dpu-operator --timeout=2m")

# def helper(h: host.Host, node: NodeConfig) -> Optional[host.Result]:
# # Temporary workaround, remove once 4.16 installations are working
# logger.info("Ensuring Rhel 9.4 kernel is installed")
# ensure_rhel_9_4_kernel_is_installed(h)
# # There is a bug with the idpf driver that causes the IPU to fail to be enumerated over PCIe on boot
# # As a result, we will need to trigger cold boots of the node until the device is available
# # TODO: Remove when no longer needed
# retries = 3
# h.ssh_connect("core")
# ret = h.run(f"test -d /sys/class/net/{cfg.dpu_net_interface}")
# while ret.returncode != 0:
# logger.error(f"{h.hostname()} does not have a network device {cfg.dpu_net_interface} cold booting node to try to recover")
# h.cold_boot()
# logger.info("Cold boot triggered, waiting for host to reboot")
# time.sleep(60)
# h.ssh_connect("core")
# retries = retries - 1
# if retries == 0:
# logger.error_and_exit(f"Failed to bring up IPU net device on {h.hostname()}")
# ret = h.run(f"test -d /sys/class/net/{cfg.dpu_net_interface}")

# # Label the node
# logger.info(f"labeling node {h.hostname()} dpu=true")
# client.oc_run_or_die(f"label no {e.name} dpu=true")
# return None

# executor = ThreadPoolExecutor(max_workers=len(cc.workers))
# f = []
# # Assuming that all workers have a DPU
# for e in cc.workers:
# logger.info(f"Calling helper function for node {e.node}")
# bmc = host.BMC.from_bmc(e.bmc, e.bmc_user, e.bmc_password)
# h = host.Host(e.node, bmc)
# f.append(executor.submit(helper, h, e))

# for thread in f:
# logger.info(thread.result())

# logger.info("Verified idpf is providing net-devs on DPU worker nodes")

# # Create host nad
# # TODO: Remove when this is automatically created by the dpu operator
# logger.info("Creating dpu NAD")
# client.oc("delete -f manifests/dpu/dpu_nad.yaml")
# client.oc_run_or_die("create -f manifests/dpu/dpu_nad.yaml")
# # Deploy dpu daemon and wait for dpu pods to come up
# logger.info("Creating dpu operator config")
# client.oc_run_or_die(f"create -f {REPO_DIR}/examples/dpu.yaml")
# time.sleep(30)
# client.oc_run_or_die("wait --for=condition=Ready pod --all -n openshift-dpu-operator --timeout=5m")
logger.info("Finished setting up dpu operator on host")


5 changes: 5 additions & 0 deletions reglocal.py
@@ -110,6 +110,11 @@ def ensure_running(rsh: host.Host, *, delete_all: bool = False, listen_port: int
)
)

# Log the registry container's status so startup failures are easier to diagnose
logger.info("Checking if the container is still running")
status_ret = rsh.run(shlex.join(["podman", "ps", "-a", "--filter", f"name={CONTAINER_NAME}"]))
logger.info(f"Container status: {status_ret.out}")

return dir_name, hostname, listen_port, ret.out.strip()


