def deploy_cluster_from_iso(self) -> None:
    """Validate the single master's settings and boot it from the install ISO.

    The first master must have a MAC address, an IP address and a name, and
    the cluster's network API port must be set to something other than
    "auto". Each failed check is reported via logger.error_and_exit. The
    actual boot is delegated to isoCluster.IPUIsoBoot.
    """
    cluster = self._cc
    master = cluster.masters[0]

    # (wording used in the error message, configured value), checked in order.
    required_fields = (
        ("MAC address", master.mac),
        ("IP address", master.ip),
        ("name", master.name),
    )
    for label, value in required_fields:
        if value is None:
            logger.error_and_exit(f"No {label} provided for cluster {self._cc.name}, exiting")

    api_port = cluster.network_api_port
    if api_port == "auto" or not api_port:
        logger.error_and_exit(f"Network API port with connection to {self._cc.name} must be specified, exiting")

    isoCluster.IPUIsoBoot(cluster, master, cluster.install_iso)
class HttpServerManager:
    """Serve a local directory over HTTP from a child process.

    Usable as a context manager: entering starts the server, leaving
    terminates it. ``port`` is the first port to try; once ``start_server``
    has run, ``self.port`` holds the port that was actually selected.
    """

    def __init__(self, path: str, port: int = 8000):
        self.path = path
        self.port = port
        self.process: Optional[Process] = None

    def __enter__(self) -> 'HttpServerManager':
        self.start_server()
        return self

    def __exit__(self, exc_type: Optional[Type[BaseException]], exc_value: Optional[BaseException], traceback: Optional[TracebackType]) -> None:
        self.stop_server()

    def start_server(self) -> None:
        """Pick a free port and start serving ``self.path`` in a child process."""

        def target() -> None:
            # SimpleHTTPRequestHandler serves the current working directory,
            # so the child changes into the directory to be served first.
            os.chdir(self.path)
            server_address = ('', self.port)
            httpd = http.server.HTTPServer(server_address, http.server.SimpleHTTPRequestHandler)
            httpd.serve_forever()

        # The port must be settled before the child is created so that the
        # child binds to the same port the parent reports.
        self.port = self.find_open_port()
        # NOTE(review): a nested function cannot be pickled, so this presumably
        # relies on the "fork" start method - confirm for non-Linux hosts.
        self.process = Process(target=target)
        self.process.start()
        logger.info(f"Http Server started on port {self.port}")

    def stop_server(self) -> None:
        """Terminate the server process, if one is running."""
        if self.process:
            self.process.terminate()
            self.process.join()
            # Drop the handle so a repeated stop is a no-op.
            self.process = None
            logger.info("Http Server stopped")

    def port_is_in_use(self, port: int) -> bool:
        """Return True if something accepts TCP connections on localhost:port."""
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            return s.connect_ex(('localhost', port)) == 0

    def find_open_port(self) -> int:
        """Return the first port >= ``self.port`` that nothing is listening on."""
        port = self.port
        while self.port_is_in_use(port):
            # Report the port that was just probed, not the starting port.
            logger.debug(f"port {port} in use, trying port + 1")
            port += 1
        return port
logger.info(iso_path) @@ -251,10 +250,9 @@ def ssh_connect_looped(self, logins: List[Login]) -> None: self._host = e.login() return except ssh_exception.AuthenticationException as e: - logger.info(type(e)) - raise e + logger.debug(type(e)) except Exception as e: - logger.info(type(e)) + logger.debug(type(e)) time.sleep(10) def _rsa_login(self) -> Optional[KeyLogin]: diff --git a/isoCluster.py b/isoCluster.py new file mode 100644 index 00000000..8a30f80a --- /dev/null +++ b/isoCluster.py @@ -0,0 +1,272 @@ +import sys +import os +from pathlib import Path +import shutil +import ipaddress +from dataclasses import dataclass +import re +import time +import urllib.parse +from concurrent.futures import Future +from concurrent.futures import ThreadPoolExecutor +from typing import Dict +from typing import List +from typing import Tuple +from logger import logger +from clustersConfig import ClustersConfig +from clustersConfig import NodeConfig +from clustersConfig import ExtraConfigArgs +import host +import common + + +""" +ExtraConfigIPU is used to provision and IPUs specified via Redfish through the IMC. 
def _ipv4_network(ipv4_address: str, subnet_mask: str) -> ipaddress.IPv4Network:
    """Return the IPv4 network that contains ipv4_address under subnet_mask.

    The netmask is parsed by the ipaddress module, so a malformed or
    non-contiguous mask raises ValueError instead of producing a wrong prefix.
    """
    # strict=False: the address is a host address, so host bits may be set.
    return ipaddress.IPv4Network(f"{ipv4_address}/{subnet_mask}", strict=False)


def get_subnet_ip(ipv4_address: str, subnet_mask: str) -> str:
    """Return the network address of the subnet containing ipv4_address."""
    return str(_ipv4_network(ipv4_address, subnet_mask).network_address)


def get_subnet_range(ipv4_address: str, subnet_mask: str) -> Tuple[str, str]:
    """Return the first and last usable host addresses of the subnet.

    The range excludes the network address and the broadcast address.
    """
    network = _ipv4_network(ipv4_address, subnet_mask)
    range_start = network.network_address + 1
    range_end = network.broadcast_address - 1
    return str(range_start), str(range_end)


def get_router_ip(ipv4_address: str, subnet_mask: str) -> str:
    """Return the first address after the network address, used as the router."""
    network = ipaddress.ip_network(f"{ipv4_address}/{subnet_mask}", strict=False)
    router_ip = network.network_address + 1
    return str(router_ip)
subnetConfigFromHostConfig(hc: DhcpdHostConfig) -> DhcpdSubnetConfig: + netmask = "255.255.255.0" + subnet_ip = get_subnet_ip(hc.fixed_address, netmask) + range_start, range_end = get_subnet_range(hc.fixed_address, netmask) + broadcast_address = str(ipaddress.ip_network(f"{hc.fixed_address}/{netmask}", strict=False).broadcast_address) + routers = get_router_ip(hc.fixed_address, netmask) + dns_servers = ["10.2.70.215", "10.11.5.160"] + return DhcpdSubnetConfig(subnet=subnet_ip, netmask=netmask, range_start=range_start, range_end=range_end, broadcast_address=broadcast_address, routers=routers, dns_servers=dns_servers + ) + + +def _convert_to_cidr(ipv4_address: str, subnet_mask: str) -> str: + network = ipaddress.ip_network(f"{ipv4_address}/{subnet_mask}", strict=False) + return str(network) + + +def extract_subnets_from_file(file_path: str) -> list[str]: + subnet_pattern = re.compile(r'subnet (\d+\.\d+\.\d+\.\d+) netmask (\d+\.\d+\.\d+\.\d+)') + with Path(file_path).open('r') as file: + file_contents = file.read() + + subnets = [] + for subnet, netmask in subnet_pattern.findall(file_contents): + subnets.append(_convert_to_cidr(subnet, netmask)) + + return subnets + + +def extract_hostnames_from_file(file_path: str) -> list[str]: + hostnames = [] + + with open(file_path, 'r') as file: + for line in file: + if line.strip().startswith('host'): + match = re.search(r'host\s+(\S+)', line.strip()) + if match: + hostnames.append(match.group(1)) + return hostnames + + +def render_dhcpd_conf(mac: str, ip: str, name: str) -> None: + logger.debug("Rendering dhcpd conf") + file_path = "/etc/dhcp/dhcpd.conf" + hostconfig = DhcpdHostConfig(hostname=name, hardware_ethernet=mac, fixed_address=ip) + subnetconfig = subnetConfigFromHostConfig(hostconfig) + + # If a config already exists, check if it was generated by CDA. 
def enable_acc_connectivity(node: NodeConfig) -> None:
    """Enable ACC physical port connectivity through the IMC, then connect to the ACC.

    Connects to the IMC at node.bmc over ssh, runs the cfg_acc_apf_x2.py
    script, reboots the IMC, and retries the script until it returns 0
    (30 attempts, 15 seconds apart). If it never succeeds the failure is
    reported via logger.error_and_exit. Finally the ACC at node.ip is pinged
    and connected to over ssh.
    """
    logger.info(f"Establishing connectivity to {node.name} via {node.bmc}")
    ipu_imc = host.RemoteHost(node.bmc)
    ipu_imc.ssh_connect(node.bmc_user, node.bmc_password)
    ipu_imc.run_or_die("/usr/bin/scripts/cfg_acc_apf_x2.py")

    # We need to ensure the ACC physical port connectivity is enabled during
    # reboot to ensure dhcp gets an ip. Trigger an acc reboot and try to run
    # python /usr/bin/scripts/cfg_acc_apf_x2.py. This will fail until the
    # ACC_LAN_APF_VPORTs are ready. Once this succeeds, we can try to connect
    # to the ACC.
    logger.info("Rebooting IMC to trigger ACC reboot")
    ipu_imc.run("systemctl reboot")
    time.sleep(30)
    ipu_imc.ssh_connect(node.bmc_user, node.bmc_password)
    logger.info(f"Attempting to enable ACC connectivity from IMC {node.bmc} on reboot")
    retries = 30
    for _ in range(retries):
        ret = ipu_imc.run("/usr/bin/scripts/cfg_acc_apf_x2.py")
        # Compare the exit status, not the result object itself; comparing
        # the object with 0 never matched, so the loop could not succeed.
        if ret.returncode == 0:
            logger.info("Enabled ACC physical port connectivity")
            break
        logger.debug(f"ACC SPF script failed with returncode {ret.returncode}")
        logger.debug(f"out: {ret.out}\n err: {ret.err}")
        time.sleep(15)
    else:
        # Only reached when every attempt failed (no break).
        logger.error_and_exit("Failed to enable ACC connectivity")

    ipu_acc = host.RemoteHost(node.ip)
    ipu_acc.ping()
    ipu_acc.ssh_connect("root", "redhat")
    logger.info(f"{node.name} connectivity established")
def IPUIsoBoot(cc: ClustersConfig, node: NodeConfig, iso: str) -> None:
    """Boot an IPU node from an ISO and establish connectivity to it.

    Runs the provisioning stages strictly in order: redfish boot from the
    ISO, addressing of the cluster's network API port, dhcpd configuration
    for the node, and finally enabling ACC connectivity.
    """
    # Each stage is deferred so its arguments are read only when it runs.
    stages = (
        lambda: _redfish_boot_ipu(cc, node, iso),
        lambda: configure_iso_network_port(cc.network_api_port, node.ip),
        lambda: configure_dhcpd(node),
        lambda: enable_acc_connectivity(node),
    )
    for run_stage in stages:
        run_stage()