From d582dbafdedda7e4ac819caba275fecf12179f30 Mon Sep 17 00:00:00 2001 From: Bobbins228 Date: Mon, 9 Sep 2024 15:03:08 +0100 Subject: [PATCH 1/4] feat: split head resources for limits and requests Signed-off-by: Bobbins228 --- src/codeflare_sdk/cluster/cluster.py | 44 +++++++++++++++++------- src/codeflare_sdk/cluster/config.py | 32 +++++++++++++---- src/codeflare_sdk/cluster/model.py | 10 +++--- src/codeflare_sdk/utils/generate_yaml.py | 24 ++++++------- src/codeflare_sdk/utils/pretty_print.py | 2 +- 5 files changed, 77 insertions(+), 35 deletions(-) diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index f9bcc84f3..7c652a186 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -462,6 +462,18 @@ def from_k8_cluster_object( name=rc["metadata"]["name"], namespace=rc["metadata"]["namespace"], machine_types=machine_types, + head_cpu_requests=rc["spec"]["headGroupSpec"]["template"]["spec"][ + "containers" + ][0]["resources"]["requests"]["cpu"], + head_cpu_limits=rc["spec"]["headGroupSpec"]["template"]["spec"][ + "containers" + ][0]["resources"]["limits"]["cpu"], + head_memory_requests=rc["spec"]["headGroupSpec"]["template"]["spec"][ + "containers" + ][0]["resources"]["requests"]["memory"], + head_memory_limits=rc["spec"]["headGroupSpec"]["template"]["spec"][ + "containers" + ][0]["resources"]["limits"]["memory"], num_workers=rc["spec"]["workerGroupSpecs"][0]["minReplicas"], worker_cpu_requests=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ "containers" @@ -851,10 +863,10 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]: status=status, # for now we are not using autoscaling so same replicas is fine workers=rc["spec"]["workerGroupSpecs"][0]["replicas"], - worker_mem_max=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ + worker_mem_limits=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ "containers" ][0]["resources"]["limits"]["memory"], - worker_mem_min=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ + worker_mem_requests=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ "containers" ][0]["resources"]["requests"]["memory"], worker_cpu=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ @@ -862,12 +874,18 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]: ]["resources"]["limits"]["cpu"], worker_extended_resources=worker_extended_resources, namespace=rc["metadata"]["namespace"], - head_cpus=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][ - "resources" - ]["limits"]["cpu"], - head_mem=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][ - "resources" - ]["limits"]["memory"], + head_cpu_requests=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][ + 0 + ]["resources"]["requests"]["cpu"], + head_cpu_limits=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][ + 0 + ]["resources"]["limits"]["cpu"], + head_mem_requests=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][ + 0 + ]["resources"]["requests"]["memory"], + head_mem_limits=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][ + 0 + ]["resources"]["limits"]["memory"], head_extended_resources=head_extended_resources, dashboard=dashboard_url, ) @@ -890,14 +908,16 @@ def _copy_to_ray(cluster: Cluster) -> RayCluster: name=cluster.config.name, status=cluster.status(print_to_console=False)[0], workers=cluster.config.num_workers, - worker_mem_min=cluster.config.worker_memory_requests, - worker_mem_max=cluster.config.worker_memory_limits, + worker_mem_requests=cluster.config.worker_memory_requests, + worker_mem_limits=cluster.config.worker_memory_limits, worker_cpu=cluster.config.worker_cpu_requests, worker_extended_resources=cluster.config.worker_extended_resource_requests, namespace=cluster.config.namespace, dashboard=cluster.cluster_dashboard_uri(), - head_cpus=cluster.config.head_cpus, - head_mem=cluster.config.head_memory, + head_mem_requests=cluster.config.head_memory_requests, + head_mem_limits=cluster.config.head_memory_limits, + head_cpu_requests=cluster.config.head_cpu_requests, + head_cpu_limits=cluster.config.head_cpu_limits, head_extended_resources=cluster.config.head_extended_resource_requests, ) if ray.status == CodeFlareClusterStatus.READY: diff --git a/src/codeflare_sdk/cluster/config.py b/src/codeflare_sdk/cluster/config.py index 610d53c44..3c59d593f 100644 --- a/src/codeflare_sdk/cluster/config.py +++ b/src/codeflare_sdk/cluster/config.py @@ -75,10 +75,16 @@ class ClusterConfiguration: name: str namespace: Optional[str] = None head_info: List[str] = field(default_factory=list) - head_cpus: Union[int, str] = 2 - head_memory: Union[int, str] = 8 + head_cpu_requests: Union[int, str] = 2 + head_cpu_limits: Union[int, str] = 2 + head_cpus: Optional[Union[int, str]] = None # Deprecating + head_memory_requests: Union[int, str] = 8 + head_memory_limits: Union[int, str] = 8 + head_memory: Optional[Union[int, str]] = None # Deprecating head_gpus: Optional[int] = None # Deprecating - head_extended_resource_requests: Dict[str, int] = field(default_factory=dict) + head_extended_resource_requests: Dict[str, Union[str, int]] = field( + default_factory=dict + ) machine_types: List[str] = field( default_factory=list ) # ["m4.xlarge", "g4dn.xlarge"] @@ -100,7 +106,9 @@ class ClusterConfiguration: write_to_file: bool = False verify_tls: bool = True labels: Dict[str, str] = field(default_factory=dict) - worker_extended_resource_requests: Dict[str, int] = field(default_factory=dict) + worker_extended_resource_requests: Dict[str, Union[str, int]] = field( + default_factory=dict + ) extended_resource_mapping: Dict[str, str] = field(default_factory=dict) overwrite_default_resource_mapping: bool = False local_queue: Optional[str] = None @@ -183,14 +191,21 @@ def _str_mem_no_unit_add_GB(self): self.worker_memory_limits = f"{self.worker_memory_limits}G" def _memory_to_string(self): - if isinstance(self.head_memory, int): - self.head_memory = f"{self.head_memory}G" + if isinstance(self.head_memory_requests, int): + self.head_memory_requests = f"{self.head_memory_requests}G" + if isinstance(self.head_memory_limits, int): + self.head_memory_limits = f"{self.head_memory_limits}G" if isinstance(self.worker_memory_requests, int): self.worker_memory_requests = f"{self.worker_memory_requests}G" if isinstance(self.worker_memory_limits, int): self.worker_memory_limits = f"{self.worker_memory_limits}G" def _cpu_to_resource(self): + if self.head_cpus: + warnings.warn( + "head_cpus is being deprecated, use head_cpu_requests and head_cpu_limits" + ) + self.head_cpu_requests = self.head_cpu_limits = self.head_cpus if self.min_cpus: warnings.warn("min_cpus is being deprecated, use worker_cpu_requests") self.worker_cpu_requests = self.min_cpus @@ -199,6 +214,11 @@ def _cpu_to_resource(self): self.worker_cpu_limits = self.max_cpus def _memory_to_resource(self): + if self.head_memory: + warnings.warn( + "head_memory is being deprecated, use head_memory_requests and head_memory_limits" + ) + self.head_memory_requests = self.head_memory_limits = self.head_memory if self.min_memory: warnings.warn("min_memory is being deprecated, use worker_memory_requests") self.worker_memory_requests = f"{self.min_memory}G" diff --git a/src/codeflare_sdk/cluster/model.py b/src/codeflare_sdk/cluster/model.py index 5d6e2ed2a..ab7b30ede 100644 --- a/src/codeflare_sdk/cluster/model.py +++ b/src/codeflare_sdk/cluster/model.py @@ -73,11 +73,13 @@ class RayCluster: name: str status: RayClusterStatus - head_cpus: int - head_mem: str + head_cpu_requests: int + head_cpu_limits: int + head_mem_requests: str + head_mem_limits: str workers: int - worker_mem_min: str - worker_mem_max: str + worker_mem_requests: str + worker_mem_limits: str worker_cpu: int namespace: str dashboard: str diff --git a/src/codeflare_sdk/utils/generate_yaml.py b/src/codeflare_sdk/utils/generate_yaml.py index 1644dc15e..c4e1755d8 100755 --- a/src/codeflare_sdk/utils/generate_yaml.py +++ b/src/codeflare_sdk/utils/generate_yaml.py @@ -115,22 +115,22 @@ def update_env(spec, env): def update_resources( spec, - worker_cpu_requests, - worker_cpu_limits, - worker_memory_requests, - worker_memory_limits, + cpu_requests, + cpu_limits, + memory_requests, + memory_limits, custom_resources, ): container = spec.get("containers") for resource in container: requests = resource.get("resources").get("requests") if requests is not None: - requests["cpu"] = worker_cpu_requests - requests["memory"] = worker_memory_requests + requests["cpu"] = cpu_requests + requests["memory"] = memory_requests limits = resource.get("resources").get("limits") if limits is not None: - limits["cpu"] = worker_cpu_limits - limits["memory"] = worker_memory_limits + limits["cpu"] = cpu_limits + limits["memory"] = memory_limits for k in custom_resources.keys(): limits[k] = custom_resources[k] requests[k] = custom_resources[k] @@ -210,10 +210,10 @@ def update_nodes( # TODO: Eventually add head node configuration outside of template update_resources( spec, - cluster.config.head_cpus, - cluster.config.head_cpus, - cluster.config.head_memory, - cluster.config.head_memory, + cluster.config.head_cpu_requests, + cluster.config.head_cpu_limits, + cluster.config.head_memory_requests, + cluster.config.head_memory_limits, cluster.config.head_extended_resource_requests, ) else: diff --git a/src/codeflare_sdk/utils/pretty_print.py b/src/codeflare_sdk/utils/pretty_print.py index 9431ffd75..4842c9cd2 100644 --- a/src/codeflare_sdk/utils/pretty_print.py +++ b/src/codeflare_sdk/utils/pretty_print.py @@ -136,7 +136,7 @@ def print_clusters(clusters: List[RayCluster]): name = cluster.name dashboard = cluster.dashboard workers = str(cluster.workers) - memory = f"{cluster.worker_mem_min}~{cluster.worker_mem_max}" + memory = f"{cluster.worker_mem_requests}~{cluster.worker_mem_limits}" cpu = str(cluster.worker_cpu) gpu = str(cluster.worker_extended_resources.get("nvidia.com/gpu", 0)) From 6826915a4ef078123f2ec67db30ba9180c090b9c Mon Sep 17 00:00:00 2001 From: Bobbins228 Date: Mon, 9 Sep 2024 15:05:59 +0100 Subject: [PATCH 2/4] test: update unit and e2e tests with split head resources Signed-off-by: Bobbins228 --- .github/workflows/guided_notebook_tests.yaml | 6 ++--- tests/e2e/local_interactive_sdk_kind_test.py | 6 +++-- .../e2e/mnist_raycluster_sdk_aw_kind_test.py | 6 +++-- tests/e2e/mnist_raycluster_sdk_kind_test.py | 6 +++-- tests/e2e/mnist_raycluster_sdk_oauth_test.py | 6 +++-- tests/e2e/start_ray_cluster.py | 6 +++-- tests/unit_test.py | 24 +++++++++++-------- .../raycluster_sdk_upgrade_sleep_test.py | 6 +++-- tests/upgrade/raycluster_sdk_upgrade_test.py | 6 +++-- 9 files changed, 45 insertions(+), 27 deletions(-) diff --git a/.github/workflows/guided_notebook_tests.yaml b/.github/workflows/guided_notebook_tests.yaml index 299c67ef5..eb6c5cd26 100644 --- a/.github/workflows/guided_notebook_tests.yaml +++ b/.github/workflows/guided_notebook_tests.yaml @@ -84,7 +84,7 @@ jobs: jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' 0_basic_ray.ipynb > 0_basic_ray.ipynb.tmp && mv 0_basic_ray.ipynb.tmp 0_basic_ray.ipynb jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' 0_basic_ray.ipynb > 0_basic_ray.ipynb.tmp && mv 0_basic_ray.ipynb.tmp 0_basic_ray.ipynb # Set explicit namespace as SDK need it (currently) to resolve local queues - sed -i "s/head_memory=2,/head_memory=2, namespace='default',/" 0_basic_ray.ipynb + sed -i "s/head_memory_limits=2,/head_memory_limits=2, namespace='default',/" 0_basic_ray.ipynb # Run notebook poetry run papermill 0_basic_ray.ipynb 0_basic_ray_out.ipynb --log-output --execution-timeout 600 working-directory: demo-notebooks/guided-demos @@ -206,7 +206,7 @@ jobs: JOB_WAIT=$(jq -r '.' ${GITHUB_WORKSPACE}/.github/resources/wait_for_job_cell.json) jq --argjson job_wait "$JOB_WAIT" -r '(.cells[] | select(.source[] | contains("async for lines in client.tail_job_logs"))) |= $job_wait' 1_cluster_job_client.ipynb > 1_cluster_job_client.ipynb.tmp && mv 1_cluster_job_client.ipynb.tmp 1_cluster_job_client.ipynb # Set explicit namespace as SDK need it (currently) to resolve local queues - sed -i "s/head_cpus=1,/head_cpus=1, namespace='default',/" 1_cluster_job_client.ipynb + sed -i "s/head_cpu_limits=1,/head_cpu_limits=1, namespace='default',/" 1_cluster_job_client.ipynb # Run notebook poetry run papermill 1_cluster_job_client.ipynb 1_cluster_job_client_out.ipynb --log-output --execution-timeout 1200 working-directory: demo-notebooks/guided-demos @@ -332,7 +332,7 @@ jobs: # Rewrite cluster_uri() to local_client_url() to retrieve client URL available out of cluster, as the test is executed outside of cluster sed -i "s/cluster_uri()/local_client_url()/" 2_basic_interactive.ipynb # Set explicit namespace as SDK need it (currently) to resolve local queues - sed -i "s/head_cpus=1,/head_cpus=1, namespace='default',/" 2_basic_interactive.ipynb + sed -i "s/head_cpu_limits=1,/head_cpu_limits=1, namespace='default',/" 2_basic_interactive.ipynb # Add MINIO related modules to runtime environment sed -i "s/\\\\\"transformers/\\\\\"s3fs\\\\\", \\\\\"pyarrow\\\\\", \\\\\"transformers/" 2_basic_interactive.ipynb # Replace markdown cell with remote configuration for MINIO diff --git a/tests/e2e/local_interactive_sdk_kind_test.py b/tests/e2e/local_interactive_sdk_kind_test.py index 6693117b3..c20fd8793 100644 --- a/tests/e2e/local_interactive_sdk_kind_test.py +++ b/tests/e2e/local_interactive_sdk_kind_test.py @@ -44,8 +44,10 @@ def run_local_interactives( name=cluster_name, namespace=self.namespace, num_workers=1, - head_cpus="500m", - head_memory=2, + head_cpu_requests="500m", + head_cpu_limits="500m", + head_memory_requests=2, + head_memory_limits=2, worker_cpu_requests="500m", worker_cpu_limits=1, worker_memory_requests=1, diff --git a/tests/e2e/mnist_raycluster_sdk_aw_kind_test.py b/tests/e2e/mnist_raycluster_sdk_aw_kind_test.py index 012098a40..d7949b8cb 100644 --- a/tests/e2e/mnist_raycluster_sdk_aw_kind_test.py +++ b/tests/e2e/mnist_raycluster_sdk_aw_kind_test.py @@ -42,8 +42,10 @@ def run_mnist_raycluster_sdk_kind( name="mnist", namespace=self.namespace, num_workers=1, - head_cpus="500m", - head_memory=2, + head_cpu_requests="500m", + head_cpu_limits="500m", + head_memory_requests=2, + head_memory_limits=2, worker_cpu_requests="500m", worker_cpu_limits=1, worker_memory_requests=1, diff --git a/tests/e2e/mnist_raycluster_sdk_kind_test.py b/tests/e2e/mnist_raycluster_sdk_kind_test.py index 2623b36c4..d81e9149b 100644 --- a/tests/e2e/mnist_raycluster_sdk_kind_test.py +++ b/tests/e2e/mnist_raycluster_sdk_kind_test.py @@ -42,8 +42,10 @@ def run_mnist_raycluster_sdk_kind( name="mnist", namespace=self.namespace, num_workers=1, - head_cpus="500m", - head_memory=2, + head_cpu_requests="500m", + head_cpu_limits="500m", + head_memory_requests=2, + head_memory_limits=2, worker_cpu_requests="500m", worker_cpu_limits=1, worker_memory_requests=1, diff --git a/tests/e2e/mnist_raycluster_sdk_oauth_test.py b/tests/e2e/mnist_raycluster_sdk_oauth_test.py index 3fe6177c6..7e4002adf 100644 --- a/tests/e2e/mnist_raycluster_sdk_oauth_test.py +++ b/tests/e2e/mnist_raycluster_sdk_oauth_test.py @@ -42,8 +42,10 @@ def run_mnist_raycluster_sdk_oauth(self): name="mnist", namespace=self.namespace, num_workers=1, - head_cpus="500m", - head_memory=4, + head_cpu_requests="500m", + head_cpu_limits="500m", + head_memory_requests=4, + head_memory_limits=4, worker_cpu_requests=1, worker_cpu_limits=1, worker_memory_requests=1, diff --git a/tests/e2e/start_ray_cluster.py b/tests/e2e/start_ray_cluster.py index b34f0331f..48ab604ff 100644 --- a/tests/e2e/start_ray_cluster.py +++ b/tests/e2e/start_ray_cluster.py @@ -13,8 +13,10 @@ name="mnist", namespace=namespace, num_workers=1, - head_cpus="500m", - head_memory=2, + head_cpu_requests="500m", + head_cpu_limits="500m", + head_memory_requests=2, + head_memory_limits=2, worker_cpu_requests="500m", worker_cpu_limits=1, worker_memory_requests=1, diff --git a/tests/unit_test.py b/tests/unit_test.py index 111f737c2..388723c50 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -942,13 +942,15 @@ def test_ray_details(mocker, capsys): name="raytest1", status=RayClusterStatus.READY, workers=1, - worker_mem_min="2G", - worker_mem_max="2G", + worker_mem_requests="2G", + worker_mem_limits="2G", worker_cpu=1, namespace="ns", dashboard="fake-uri", - head_cpus=2, - head_mem=8, + head_cpu_requests=2, + head_cpu_limits=2, + head_mem_requests=8, + head_mem_limits=8, ) mocker.patch( "codeflare_sdk.cluster.cluster.Cluster.status", @@ -978,8 +980,8 @@ def test_ray_details(mocker, capsys): assert ray2.name == "raytest2" assert ray1.namespace == ray2.namespace assert ray1.workers == ray2.workers - assert ray1.worker_mem_min == ray2.worker_mem_min - assert ray1.worker_mem_max == ray2.worker_mem_max + assert ray1.worker_mem_requests == ray2.worker_mem_requests + assert ray1.worker_mem_limits == ray2.worker_mem_limits assert ray1.worker_cpu == ray2.worker_cpu assert ray1.worker_extended_resources == ray2.worker_extended_resources try: @@ -2356,13 +2358,15 @@ def test_cluster_status(mocker): name="test", status=RayClusterStatus.UNKNOWN, workers=1, - worker_mem_min=2, - worker_mem_max=2, + worker_mem_requests=2, + worker_mem_limits=2, worker_cpu=1, namespace="ns", dashboard="fake-uri", - head_cpus=2, - head_mem=8, + head_cpu_requests=2, + head_cpu_limits=2, + head_mem_requests=8, + head_mem_limits=8, ) cf = Cluster( ClusterConfiguration( diff --git a/tests/upgrade/raycluster_sdk_upgrade_sleep_test.py b/tests/upgrade/raycluster_sdk_upgrade_sleep_test.py index c415b2fc8..cf21fee06 100644 --- a/tests/upgrade/raycluster_sdk_upgrade_sleep_test.py +++ b/tests/upgrade/raycluster_sdk_upgrade_sleep_test.py @@ -53,8 +53,10 @@ def run_mnist_raycluster_sdk_oauth(self): name="mnist", namespace=self.namespace, num_workers=1, - head_cpus=1, - head_memory=4, + head_cpu_requests=1, + head_cpu_limits=1, + head_memory_requests=4, + head_memory_limits=4, worker_cpu_requests=1, worker_cpu_limits=1, worker_memory_requests=4, diff --git a/tests/upgrade/raycluster_sdk_upgrade_test.py b/tests/upgrade/raycluster_sdk_upgrade_test.py index 640b1b5d2..05862d079 100644 --- a/tests/upgrade/raycluster_sdk_upgrade_test.py +++ b/tests/upgrade/raycluster_sdk_upgrade_test.py @@ -48,8 +48,10 @@ def run_mnist_raycluster_sdk_oauth(self): name="mnist", namespace=self.namespace, num_workers=1, - head_cpus=1, - head_memory=4, + head_cpu_requests=1, + head_cpu_limits=1, + head_memory_requests=4, + head_memory_limits=4, worker_cpu_requests=1, worker_cpu_limits=1, worker_memory_requests=4, From ae15cf60d58bf25e2f7c5bf3bbfacf0fe9c7e1e2 Mon Sep 17 00:00:00 2001 From: Bobbins228 Date: Mon, 9 Sep 2024 15:06:25 +0100 Subject: [PATCH 3/4] docs: update notebooks with split head resources Signed-off-by: Bobbins228 --- demo-notebooks/guided-demos/0_basic_ray.ipynb | 6 ++++-- demo-notebooks/guided-demos/1_cluster_job_client.ipynb | 6 ++++-- demo-notebooks/guided-demos/2_basic_interactive.ipynb | 6 ++++-- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/demo-notebooks/guided-demos/0_basic_ray.ipynb b/demo-notebooks/guided-demos/0_basic_ray.ipynb index 2a9ae48e4..58a52727b 100644 --- a/demo-notebooks/guided-demos/0_basic_ray.ipynb +++ b/demo-notebooks/guided-demos/0_basic_ray.ipynb @@ -62,8 +62,10 @@ "# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n", "cluster = Cluster(ClusterConfiguration(\n", " name='raytest', \n", - " head_cpus='500m',\n", - " head_memory=2,\n", + " head_cpu_requests='500m',\n", + " head_cpu_limits='500m',\n", + " head_memory_requests=2,\n", + " head_memory_limits=2,\n", " head_extended_resource_requests={'nvidia.com/gpu':0}, # For GPU enabled workloads set the head_extended_resource_requests and worker_extended_resource_requests\n", " worker_extended_resource_requests={'nvidia.com/gpu':0},\n", " num_workers=2,\n", diff --git a/demo-notebooks/guided-demos/1_cluster_job_client.ipynb b/demo-notebooks/guided-demos/1_cluster_job_client.ipynb index 0857042a1..05682d823 100644 --- a/demo-notebooks/guided-demos/1_cluster_job_client.ipynb +++ b/demo-notebooks/guided-demos/1_cluster_job_client.ipynb @@ -55,8 +55,10 @@ "# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n", "cluster = Cluster(ClusterConfiguration(\n", " name='jobtest',\n", - " head_cpus=1,\n", - " head_memory=4,\n", + " head_cpu_requests=1,\n", + " head_cpu_limits=1,\n", + " head_memory_requests=4,\n", + " head_memory_limits=4,\n", " head_extended_resource_requests={'nvidia.com/gpu':1}, # For GPU enabled workloads set the head_extended_resource_requests and worker_extended_resource_requests\n", " worker_extended_resource_requests={'nvidia.com/gpu':1},\n", " num_workers=2,\n", diff --git a/demo-notebooks/guided-demos/2_basic_interactive.ipynb b/demo-notebooks/guided-demos/2_basic_interactive.ipynb index 98fcafa21..1612af3f2 100644 --- a/demo-notebooks/guided-demos/2_basic_interactive.ipynb +++ b/demo-notebooks/guided-demos/2_basic_interactive.ipynb @@ -60,8 +60,10 @@ "cluster_name = \"interactivetest\"\n", "cluster = Cluster(ClusterConfiguration(\n", " name=cluster_name,\n", - " head_cpus=1,\n", - " head_memory=6,\n", + " head_cpu_requests=1,\n", + " head_cpu_limits=1,\n", + " head_memory_requests=6,\n", + " head_memory_limits=6,\n", " head_extended_resource_requests={'nvidia.com/gpu':1}, # For GPU enabled workloads set the head_extended_resource_requests and worker_extended_resource_requests\n", " worker_extended_resource_requests={'nvidia.com/gpu':1},\n", " num_workers=2,\n", From c740864acbd69f67794b7cda604aaf4c3623f5b1 Mon Sep 17 00:00:00 2001 From: Bobbins228 Date: Tue, 10 Sep 2024 14:47:53 +0100 Subject: [PATCH 4/4] docs: update documentation to include depreciating variables Signed-off-by: Bobbins228 --- docs/cluster-configuration.md | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/docs/cluster-configuration.md b/docs/cluster-configuration.md index 47110e4b1..97068b490 100644 --- a/docs/cluster-configuration.md +++ b/docs/cluster-configuration.md @@ -9,8 +9,10 @@ from codeflare_sdk import Cluster, ClusterConfiguration cluster = Cluster(ClusterConfiguration( name='ray-example', # Mandatory Field namespace='default', # Default None - head_cpus=1, # Default 2 - head_memory=1, # Default 8 + head_cpu_requests=1, # Default 2 + head_cpu_limits=1, # Default 2 + head_memory_requests=1, # Default 8 + head_memory_limits=1, # Default 8 head_extended_resource_requests={'nvidia.com/gpu':0}, # Default 0 worker_extended_resource_requests={'nvidia.com/gpu':0}, # Default 0 num_workers=1, # Default 1 @@ -28,3 +30,17 @@ Note: 'quay.io/modh/ray:2.35.0-py39-cu121' is the default image used by the Code The `labels={"exampleLabel": "example"}` parameter can be used to apply additional labels to the RayCluster resource. After creating their `cluster`, a user can call `cluster.up()` and `cluster.down()` to respectively create or remove the Ray Cluster. + + +## Deprecating Parameters +The following parameters of the `ClusterConfiguration` are being deprecated in release `v0.22.0`. +| Deprecated Parameter | Replaced By | +| :--------- | :-------- | +| `head_cpus` | `head_cpu_requests`, `head_cpu_limits` | +| `head_memory` | `head_memory_requests`, `head_memory_limits` | +| `min_cpus` | `worker_cpu_requests` | +| `max_cpus` | `worker_cpu_limits` | +| `min_memory` | `worker_memory_requests` | +| `max_memory` | `worker_memory_limits` | +| `head_gpus` | `head_extended_resource_requests` | +| `num_gpus` | `worker_extended_resource_requests` |