From 0b2be3c7de25e8bff92cf12cda262bff5989c6b4 Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Fri, 1 Sep 2023 13:37:54 -0400 Subject: [PATCH 1/2] Rework SDK priority implementation + updated testing --- src/codeflare_sdk/cluster/cluster.py | 64 ++++---- src/codeflare_sdk/utils/generate_yaml.py | 14 +- tests/test-case-prio.yaml | 197 +++++++++++++++++++++++ tests/test-case.yaml | 3 - tests/unit_test.py | 20 ++- tests/unit_test_support.py | 2 +- 6 files changed, 257 insertions(+), 43 deletions(-) create mode 100644 tests/test-case-prio.yaml diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index 7b3d5d667..4d8637667 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -61,38 +61,25 @@ def __init__(self, config: ClusterConfiguration): self.app_wrapper_yaml = self.create_app_wrapper() self.app_wrapper_name = self.app_wrapper_yaml.split(".")[0] - def evaluate_config(self): - if not self.evaluate_dispatch_priority(): - return False - else: - return True - def evaluate_dispatch_priority(self): priority_class = self.config.dispatch_priority - if priority_class is None: - return True - else: - try: - config_check() - api_instance = client.CustomObjectsApi(api_config_handler()) - priority_classes = api_instance.list_cluster_custom_object( - group="scheduling.k8s.io", - version="v1", - plural="priorityclasses", - ) - available_priority_classes = [ - i["metadata"]["name"] for i in priority_classes["items"] - ] - except Exception as e: # pragma: no cover - return _kube_api_error_handling(e) - - if priority_class in available_priority_classes: - return True - else: - print( - f"Priority class {priority_class} is not available in the cluster" - ) - return False + + try: + config_check() + api_instance = client.CustomObjectsApi(api_config_handler()) + priority_classes = api_instance.list_cluster_custom_object( + group="scheduling.k8s.io", + version="v1", + plural="priorityclasses", + ) + except Exception as e: # pragma: no cover + return _kube_api_error_handling(e) + + for pc in priority_classes["items"]: + if pc["metadata"]["name"] == priority_class: + return pc["value"] + print(f"Priority class {priority_class} is not available in the cluster") + return None def create_app_wrapper(self): """ @@ -109,6 +96,16 @@ def create_app_wrapper(self): f"Namespace {self.config.namespace} is of type {type(self.config.namespace)}. Check your Kubernetes Authentication." ) + # Before attempting to create the cluster AW, let's evaluate the ClusterConfig + if self.config.dispatch_priority: + priority_val = self.evaluate_dispatch_priority() + if priority_val == None: + raise ValueError( + "Invalid Cluster Configuration, AppWrapper not generated" + ) + else: + priority_val = None + name = self.config.name namespace = self.config.namespace min_cpu = self.config.min_cpus @@ -142,6 +139,7 @@ def create_app_wrapper(self): local_interactive=local_interactive, image_pull_secrets=image_pull_secrets, dispatch_priority=dispatch_priority, + priority_val=priority_val, ) # creates a new cluster with the provided or default spec @@ -150,12 +148,6 @@ def up(self): Applies the AppWrapper yaml, pushing the resource request onto the MCAD queue. """ - - # Before attempting to bring up the cluster let's evaluate the ClusterConfig - if not self.evaluate_config(): - print("Invalid Cluster Configuration") - return False - namespace = self.config.namespace try: config_check() diff --git a/src/codeflare_sdk/utils/generate_yaml.py b/src/codeflare_sdk/utils/generate_yaml.py index ef343a29a..f128ef8b1 100755 --- a/src/codeflare_sdk/utils/generate_yaml.py +++ b/src/codeflare_sdk/utils/generate_yaml.py @@ -89,12 +89,21 @@ def update_labels(yaml, instascale, instance_types): metadata.pop("labels") -def update_priority(item, dispatch_priority): +def update_priority(yaml, item, dispatch_priority, priority_val): + spec = yaml.get("spec") if dispatch_priority is not None: + if priority_val: + spec["priority"] = priority_val + else: + raise ValueError( + "AW generation error: Priority value is None, while dispatch_priority is defined" + ) head = item.get("generictemplate").get("spec").get("headGroupSpec") worker = item.get("generictemplate").get("spec").get("workerGroupSpecs")[0] head["template"]["spec"]["priorityClassName"] = dispatch_priority worker["template"]["spec"]["priorityClassName"] = dispatch_priority + else: + spec.pop("priority") def update_custompodresources( @@ -355,6 +364,7 @@ def generate_appwrapper( local_interactive: bool, image_pull_secrets: list, dispatch_priority: str, + priority_val: int, ): user_yaml = read_template(template) appwrapper_name, cluster_name = gen_names(name) @@ -363,7 +373,7 @@ def generate_appwrapper( route_item = resources["resources"].get("GenericItems")[1] update_names(user_yaml, item, appwrapper_name, cluster_name, namespace) update_labels(user_yaml, instascale, instance_types) - update_priority(item, dispatch_priority) + update_priority(user_yaml, item, dispatch_priority, priority_val) update_custompodresources( item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers ) diff --git a/tests/test-case-prio.yaml b/tests/test-case-prio.yaml new file mode 100644 index 000000000..c8659efa0 --- /dev/null +++ b/tests/test-case-prio.yaml @@ -0,0 +1,197 @@ +apiVersion: mcad.ibm.com/v1beta1 +kind: AppWrapper +metadata: + labels: + orderedinstance: cpu.small_gpu.large + name: prio-test-cluster + namespace: ns +spec: + priority: 10 + resources: + GenericItems: + - custompodresources: + - limits: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + replicas: 1 + requests: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + - limits: + cpu: 4 + memory: 6G + nvidia.com/gpu: 7 + replicas: 2 + requests: + cpu: 3 + memory: 5G + nvidia.com/gpu: 7 + generictemplate: + apiVersion: ray.io/v1alpha1 + kind: RayCluster + metadata: + labels: + appwrapper.mcad.ibm.com: prio-test-cluster + controller-tools.k8s.io: '1.0' + name: prio-test-cluster + namespace: ns + spec: + autoscalerOptions: + idleTimeoutSeconds: 60 + imagePullPolicy: Always + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 500m + memory: 512Mi + upscalingMode: Default + enableInTreeAutoscaling: false + headGroupSpec: + rayStartParams: + block: 'true' + dashboard-host: 0.0.0.0 + num-gpus: '0' + serviceType: ClusterIP + template: + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: prio-test-cluster + operator: In + values: + - prio-test-cluster + containers: + - env: + - name: MY_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: RAY_USE_TLS + value: '0' + - name: RAY_TLS_SERVER_CERT + value: /home/ray/workspace/tls/server.crt + - name: RAY_TLS_SERVER_KEY + value: /home/ray/workspace/tls/server.key + - name: RAY_TLS_CA_CERT + value: /home/ray/workspace/tls/ca.crt + image: quay.io/project-codeflare/ray:2.5.0-py38-cu116 + imagePullPolicy: Always + lifecycle: + preStop: + exec: + command: + - /bin/sh + - -c + - ray stop + name: ray-head + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + resources: + limits: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + requests: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + imagePullSecrets: + - name: unit-test-pull-secret + priorityClassName: default + rayVersion: 2.5.0 + workerGroupSpecs: + - groupName: small-group-prio-test-cluster + maxReplicas: 2 + minReplicas: 2 + rayStartParams: + block: 'true' + num-gpus: '7' + replicas: 2 + template: + metadata: + annotations: + key: value + labels: + key: value + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: prio-test-cluster + operator: In + values: + - prio-test-cluster + containers: + - env: + - name: MY_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: RAY_USE_TLS + value: '0' + - name: RAY_TLS_SERVER_CERT + value: /home/ray/workspace/tls/server.crt + - name: RAY_TLS_SERVER_KEY + value: /home/ray/workspace/tls/server.key + - name: RAY_TLS_CA_CERT + value: /home/ray/workspace/tls/ca.crt + image: quay.io/project-codeflare/ray:2.5.0-py38-cu116 + lifecycle: + preStop: + exec: + command: + - /bin/sh + - -c + - ray stop + name: machine-learning + resources: + limits: + cpu: 4 + memory: 6G + nvidia.com/gpu: 7 + requests: + cpu: 3 + memory: 5G + nvidia.com/gpu: 7 + imagePullSecrets: + - name: unit-test-pull-secret + initContainers: + - command: + - sh + - -c + - until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; + do echo waiting for myservice; sleep 2; done + image: busybox:1.28 + name: init-myservice + priorityClassName: default + replicas: 1 + - generictemplate: + apiVersion: route.openshift.io/v1 + kind: Route + metadata: + labels: + odh-ray-cluster-service: prio-test-cluster-head-svc + name: ray-dashboard-prio-test-cluster + namespace: ns + spec: + port: + targetPort: dashboard + to: + kind: Service + name: prio-test-cluster-head-svc + replicas: 1 + Items: [] diff --git a/tests/test-case.yaml b/tests/test-case.yaml index cbfb998e9..9e224667a 100644 --- a/tests/test-case.yaml +++ b/tests/test-case.yaml @@ -6,7 +6,6 @@ metadata: name: unit-test-cluster namespace: ns spec: - priority: 9 resources: GenericItems: - custompodresources: @@ -109,7 +108,6 @@ spec: nvidia.com/gpu: 0 imagePullSecrets: - name: unit-test-pull-secret - priorityClassName: default rayVersion: 2.5.0 workerGroupSpecs: - groupName: small-group-unit-test-cluster @@ -177,7 +175,6 @@ spec: do echo waiting for myservice; sleep 2; done image: busybox:1.28 name: init-myservice - priorityClassName: default replicas: 1 - generictemplate: apiVersion: route.openshift.io/v1 diff --git a/tests/unit_test.py b/tests/unit_test.py index 28b44b41a..0a2fb555b 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -236,7 +236,7 @@ def test_config_creation(): assert config.instascale assert config.machine_types == ["cpu.small", "gpu.large"] assert config.image_pull_secrets == ["unit-test-pull-secret"] - assert config.dispatch_priority == "default" + assert config.dispatch_priority == None def test_cluster_creation(): @@ -248,6 +248,23 @@ def test_cluster_creation(): ) +def test_cluster_creation_priority(mocker): + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_cluster_custom_object", + return_value={"items": [{"metadata": {"name": "default"}, "value": 10}]}, + ) + config = createClusterConfig() + config.name = "prio-test-cluster" + config.dispatch_priority = "default" + cluster = Cluster(config) + assert cluster.app_wrapper_yaml == "prio-test-cluster.yaml" + assert cluster.app_wrapper_name == "prio-test-cluster" + assert filecmp.cmp( + "prio-test-cluster.yaml", f"{parent}/tests/test-case-prio.yaml", shallow=True + ) + + def test_default_cluster_creation(mocker): mocker.patch( "codeflare_sdk.cluster.cluster.get_current_namespace", @@ -2251,6 +2268,7 @@ def test_export_env(): # Make sure to always keep this function last def test_cleanup(): os.remove("unit-test-cluster.yaml") + os.remove("prio-test-cluster.yaml") os.remove("unit-test-default-cluster.yaml") os.remove("test.yaml") os.remove("raytest2.yaml") diff --git a/tests/unit_test_support.py b/tests/unit_test_support.py index f060e9cb3..87609dfa2 100644 --- a/tests/unit_test_support.py +++ b/tests/unit_test_support.py @@ -46,7 +46,7 @@ def createClusterConfig(): instascale=True, machine_types=["cpu.small", "gpu.large"], image_pull_secrets=["unit-test-pull-secret"], - dispatch_priority="default", + # dispatch_priority="default", ) return config From 62618d5fa497bc44040ff107ce1b6704c0dec528 Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Fri, 1 Sep 2023 16:04:54 -0400 Subject: [PATCH 2/2] remove leftover comment --- tests/unit_test_support.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/unit_test_support.py b/tests/unit_test_support.py index 87609dfa2..a4ea056a0 100644 --- a/tests/unit_test_support.py +++ b/tests/unit_test_support.py @@ -46,7 +46,6 @@ def createClusterConfig(): instascale=True, machine_types=["cpu.small", "gpu.large"], image_pull_secrets=["unit-test-pull-secret"], - # dispatch_priority="default", ) return config