Split head memory and cpu requests/limits #579

Merged
6 changes: 3 additions & 3 deletions .github/workflows/guided_notebook_tests.yaml
@@ -84,7 +84,7 @@ jobs:
jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' 0_basic_ray.ipynb > 0_basic_ray.ipynb.tmp && mv 0_basic_ray.ipynb.tmp 0_basic_ray.ipynb
jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' 0_basic_ray.ipynb > 0_basic_ray.ipynb.tmp && mv 0_basic_ray.ipynb.tmp 0_basic_ray.ipynb
# Set explicit namespace as SDK need it (currently) to resolve local queues
sed -i "s/head_memory=2,/head_memory=2, namespace='default',/" 0_basic_ray.ipynb
sed -i "s/head_memory_limits=2,/head_memory_limits=2, namespace='default',/" 0_basic_ray.ipynb
# Run notebook
poetry run papermill 0_basic_ray.ipynb 0_basic_ray_out.ipynb --log-output --execution-timeout 600
working-directory: demo-notebooks/guided-demos
@@ -206,7 +206,7 @@ jobs:
JOB_WAIT=$(jq -r '.' ${GITHUB_WORKSPACE}/.github/resources/wait_for_job_cell.json)
jq --argjson job_wait "$JOB_WAIT" -r '(.cells[] | select(.source[] | contains("async for lines in client.tail_job_logs"))) |= $job_wait' 1_cluster_job_client.ipynb > 1_cluster_job_client.ipynb.tmp && mv 1_cluster_job_client.ipynb.tmp 1_cluster_job_client.ipynb
# Set explicit namespace as SDK need it (currently) to resolve local queues
sed -i "s/head_cpus=1,/head_cpus=1, namespace='default',/" 1_cluster_job_client.ipynb
sed -i "s/head_cpu_limits=1,/head_cpu_limits=1, namespace='default',/" 1_cluster_job_client.ipynb
# Run notebook
poetry run papermill 1_cluster_job_client.ipynb 1_cluster_job_client_out.ipynb --log-output --execution-timeout 1200
working-directory: demo-notebooks/guided-demos
@@ -332,7 +332,7 @@ jobs:
# Rewrite cluster_uri() to local_client_url() to retrieve client URL available out of cluster, as the test is executed outside of cluster
sed -i "s/cluster_uri()/local_client_url()/" 2_basic_interactive.ipynb
# Set explicit namespace as SDK need it (currently) to resolve local queues
sed -i "s/head_cpus=1,/head_cpus=1, namespace='default',/" 2_basic_interactive.ipynb
sed -i "s/head_cpu_limits=1,/head_cpu_limits=1, namespace='default',/" 2_basic_interactive.ipynb
# Add MINIO related modules to runtime environment
sed -i "s/\\\\\"transformers/\\\\\"s3fs\\\\\", \\\\\"pyarrow\\\\\", \\\\\"transformers/" 2_basic_interactive.ipynb
# Replace markdown cell with remote configuration for MINIO
6 changes: 4 additions & 2 deletions demo-notebooks/guided-demos/0_basic_ray.ipynb
@@ -62,8 +62,10 @@
"# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n",
"cluster = Cluster(ClusterConfiguration(\n",
" name='raytest', \n",
" head_cpus='500m',\n",
" head_memory=2,\n",
" head_cpu_requests='500m',\n",
" head_cpu_limits='500m',\n",
" head_memory_requests=2,\n",
" head_memory_limits=2,\n",
" head_extended_resource_requests={'nvidia.com/gpu':0}, # For GPU enabled workloads set the head_extended_resource_requests and worker_extended_resource_requests\n",
" worker_extended_resource_requests={'nvidia.com/gpu':0},\n",
" num_workers=2,\n",
6 changes: 4 additions & 2 deletions demo-notebooks/guided-demos/1_cluster_job_client.ipynb
@@ -55,8 +55,10 @@
"# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n",
"cluster = Cluster(ClusterConfiguration(\n",
" name='jobtest',\n",
" head_cpus=1,\n",
" head_memory=4,\n",
" head_cpu_requests=1,\n",
" head_cpu_limits=1,\n",
" head_memory_requests=4,\n",
" head_memory_limits=4,\n",
" head_extended_resource_requests={'nvidia.com/gpu':1}, # For GPU enabled workloads set the head_extended_resource_requests and worker_extended_resource_requests\n",
" worker_extended_resource_requests={'nvidia.com/gpu':1},\n",
" num_workers=2,\n",
6 changes: 4 additions & 2 deletions demo-notebooks/guided-demos/2_basic_interactive.ipynb
@@ -60,8 +60,10 @@
"cluster_name = \"interactivetest\"\n",
"cluster = Cluster(ClusterConfiguration(\n",
" name=cluster_name,\n",
" head_cpus=1,\n",
" head_memory=6,\n",
" head_cpu_requests=1,\n",
" head_cpu_limits=1,\n",
" head_memory_requests=6,\n",
" head_memory_limits=6,\n",
" head_extended_resource_requests={'nvidia.com/gpu':1}, # For GPU enabled workloads set the head_extended_resource_requests and worker_extended_resource_requests\n",
" worker_extended_resource_requests={'nvidia.com/gpu':1},\n",
" num_workers=2,\n",
20 changes: 18 additions & 2 deletions docs/cluster-configuration.md
@@ -9,8 +9,10 @@ from codeflare_sdk import Cluster, ClusterConfiguration
cluster = Cluster(ClusterConfiguration(
name='ray-example', # Mandatory Field
namespace='default', # Default None
head_cpus=1, # Default 2
head_memory=1, # Default 8
head_cpu_requests=1, # Default 2
head_cpu_limits=1, # Default 2
head_memory_requests=1, # Default 8
head_memory_limits=1, # Default 8
head_extended_resource_requests={'nvidia.com/gpu':0}, # Default 0
worker_extended_resource_requests={'nvidia.com/gpu':0}, # Default 0
num_workers=1, # Default 1
@@ -28,3 +30,17 @@ Note: 'quay.io/modh/ray:2.35.0-py39-cu121' is the default image used by the Code
The `labels={"exampleLabel": "example"}` parameter can be used to apply additional labels to the RayCluster resource.

After creating their `cluster`, a user can call `cluster.up()` and `cluster.down()` to respectively create or remove the Ray Cluster.


## Deprecating Parameters
The following parameters of the `ClusterConfiguration` are being deprecated in release `v0.22.0`. <!-- TODO: When removing deprecated parameters update this statement -->
| Deprecated Parameter | Replaced By |
| :--------- | :-------- |
| `head_cpus` | `head_cpu_requests`, `head_cpu_limits` |
| `head_memory` | `head_memory_requests`, `head_memory_limits` |
| `min_cpus` | `worker_cpu_requests` |
| `max_cpus` | `worker_cpu_limits` |
| `min_memory` | `worker_memory_requests` |
| `max_memory` | `worker_memory_limits` |
| `head_gpus` | `head_extended_resource_requests` |
| `num_gpus` | `worker_extended_resource_requests` |
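
For reference, a minimal migration sketch (illustrative only; the values are arbitrary and the defaults shown above still apply) of moving from the deprecated single-value keywords to the split request/limit keywords:

```python
from codeflare_sdk import Cluster, ClusterConfiguration

# Before (deprecated): one value served as both request and limit
# cluster = Cluster(ClusterConfiguration(
#     name='ray-example',
#     head_cpus=1,
#     head_memory=4,
#     min_cpus=1, max_cpus=1,
#     min_memory=2, max_memory=2,
# ))

# After: requests and limits are declared separately and may differ
cluster = Cluster(ClusterConfiguration(
    name='ray-example',
    head_cpu_requests=1,
    head_cpu_limits=1,
    head_memory_requests=4,
    head_memory_limits=4,
    worker_cpu_requests=1,
    worker_cpu_limits=1,
    worker_memory_requests=2,
    worker_memory_limits=2,
    num_workers=1,
))
```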
44 changes: 32 additions & 12 deletions src/codeflare_sdk/cluster/cluster.py
@@ -462,6 +462,18 @@ def from_k8_cluster_object(
name=rc["metadata"]["name"],
namespace=rc["metadata"]["namespace"],
machine_types=machine_types,
head_cpu_requests=rc["spec"]["headGroupSpec"]["template"]["spec"][
"containers"
][0]["resources"]["requests"]["cpu"],
head_cpu_limits=rc["spec"]["headGroupSpec"]["template"]["spec"][
"containers"
][0]["resources"]["limits"]["cpu"],
head_memory_requests=rc["spec"]["headGroupSpec"]["template"]["spec"][
"containers"
][0]["resources"]["requests"]["memory"],
head_memory_limits=rc["spec"]["headGroupSpec"]["template"]["spec"][
"containers"
][0]["resources"]["limits"]["memory"],
num_workers=rc["spec"]["workerGroupSpecs"][0]["minReplicas"],
worker_cpu_requests=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
"containers"
@@ -851,23 +863,29 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]:
status=status,
# for now we are not using autoscaling so same replicas is fine
workers=rc["spec"]["workerGroupSpecs"][0]["replicas"],
worker_mem_max=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
worker_mem_limits=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
"containers"
][0]["resources"]["limits"]["memory"],
worker_mem_min=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
worker_mem_requests=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
"containers"
][0]["resources"]["requests"]["memory"],
worker_cpu=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][
0
]["resources"]["limits"]["cpu"],
worker_extended_resources=worker_extended_resources,
namespace=rc["metadata"]["namespace"],
head_cpus=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
"resources"
]["limits"]["cpu"],
head_mem=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
"resources"
]["limits"]["memory"],
head_cpu_requests=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][
0
]["resources"]["requests"]["cpu"],
head_cpu_limits=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][
0
]["resources"]["limits"]["cpu"],
head_mem_requests=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][
0
]["resources"]["requests"]["memory"],
head_mem_limits=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][
0
]["resources"]["limits"]["memory"],
head_extended_resources=head_extended_resources,
dashboard=dashboard_url,
)
@@ -890,14 +908,16 @@ def _copy_to_ray(cluster: Cluster) -> RayCluster:
name=cluster.config.name,
status=cluster.status(print_to_console=False)[0],
workers=cluster.config.num_workers,
worker_mem_min=cluster.config.worker_memory_requests,
worker_mem_max=cluster.config.worker_memory_limits,
worker_mem_requests=cluster.config.worker_memory_requests,
worker_mem_limits=cluster.config.worker_memory_limits,
worker_cpu=cluster.config.worker_cpu_requests,
worker_extended_resources=cluster.config.worker_extended_resource_requests,
namespace=cluster.config.namespace,
dashboard=cluster.cluster_dashboard_uri(),
head_cpus=cluster.config.head_cpus,
head_mem=cluster.config.head_memory,
head_mem_requests=cluster.config.head_memory_requests,
head_mem_limits=cluster.config.head_memory_limits,
head_cpu_requests=cluster.config.head_cpu_requests,
head_cpu_limits=cluster.config.head_cpu_limits,
head_extended_resources=cluster.config.head_extended_resource_requests,
)
if ray.status == CodeFlareClusterStatus.READY:
32 changes: 26 additions & 6 deletions src/codeflare_sdk/cluster/config.py
@@ -75,10 +75,16 @@ class ClusterConfiguration:
name: str
namespace: Optional[str] = None
head_info: List[str] = field(default_factory=list)
head_cpus: Union[int, str] = 2
head_memory: Union[int, str] = 8
head_cpu_requests: Union[int, str] = 2
head_cpu_limits: Union[int, str] = 2
head_cpus: Optional[Union[int, str]] = None # Deprecating
head_memory_requests: Union[int, str] = 8
head_memory_limits: Union[int, str] = 8
head_memory: Optional[Union[int, str]] = None # Deprecating
head_gpus: Optional[int] = None # Deprecating
head_extended_resource_requests: Dict[str, int] = field(default_factory=dict)
head_extended_resource_requests: Dict[str, Union[str, int]] = field(
default_factory=dict
)
machine_types: List[str] = field(
default_factory=list
) # ["m4.xlarge", "g4dn.xlarge"]
@@ -100,7 +106,9 @@ class ClusterConfiguration:
write_to_file: bool = False
verify_tls: bool = True
labels: Dict[str, str] = field(default_factory=dict)
worker_extended_resource_requests: Dict[str, int] = field(default_factory=dict)
worker_extended_resource_requests: Dict[str, Union[str, int]] = field(
default_factory=dict
)
extended_resource_mapping: Dict[str, str] = field(default_factory=dict)
overwrite_default_resource_mapping: bool = False
local_queue: Optional[str] = None
@@ -183,14 +191,21 @@ def _str_mem_no_unit_add_GB(self):
self.worker_memory_limits = f"{self.worker_memory_limits}G"

def _memory_to_string(self):
if isinstance(self.head_memory, int):
self.head_memory = f"{self.head_memory}G"
if isinstance(self.head_memory_requests, int):
self.head_memory_requests = f"{self.head_memory_requests}G"
if isinstance(self.head_memory_limits, int):
self.head_memory_limits = f"{self.head_memory_limits}G"
if isinstance(self.worker_memory_requests, int):
self.worker_memory_requests = f"{self.worker_memory_requests}G"
if isinstance(self.worker_memory_limits, int):
self.worker_memory_limits = f"{self.worker_memory_limits}G"

def _cpu_to_resource(self):
if self.head_cpus:
warnings.warn(
"head_cpus is being deprecated, use head_cpu_requests and head_cpu_limits"
)
self.head_cpu_requests = self.head_cpu_limits = self.head_cpus
if self.min_cpus:
warnings.warn("min_cpus is being deprecated, use worker_cpu_requests")
self.worker_cpu_requests = self.min_cpus
@@ -199,6 +214,11 @@ def _cpu_to_resource(self):
self.worker_cpu_limits = self.max_cpus

def _memory_to_resource(self):
if self.head_memory:
warnings.warn(
"head_memory is being deprecated, use head_memory_requests and head_memory_limits"
)
self.head_memory_requests = self.head_memory_limits = self.head_memory
if self.min_memory:
warnings.warn("min_memory is being deprecated, use worker_memory_requests")
self.worker_memory_requests = f"{self.min_memory}G"
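
A short illustration of the deprecation shims above (assumed behavior: these conversions are expected to run when `ClusterConfiguration` is constructed, which is not shown in this hunk):

```python
import warnings
from codeflare_sdk import ClusterConfiguration

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # Deprecated single-value keywords are expected to still work, emit a
    # warning, and populate both the request and the limit fields.
    config = ClusterConfiguration(name="raytest", head_cpus=1, head_memory=4)

print([str(w.message) for w in caught])   # deprecation messages for head_cpus / head_memory
print(config.head_cpu_requests, config.head_cpu_limits)        # both taken from head_cpus
print(config.head_memory_requests, config.head_memory_limits)  # both taken from head_memory
```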
10 changes: 6 additions & 4 deletions src/codeflare_sdk/cluster/model.py
@@ -73,11 +73,13 @@ class RayCluster:

name: str
status: RayClusterStatus
head_cpus: int
head_mem: str
head_cpu_requests: int
head_cpu_limits: int
head_mem_requests: str
head_mem_limits: str
workers: int
worker_mem_min: str
worker_mem_max: str
worker_mem_requests: str
worker_mem_limits: str
worker_cpu: int
namespace: str
dashboard: str
24 changes: 12 additions & 12 deletions src/codeflare_sdk/utils/generate_yaml.py
@@ -115,22 +115,22 @@ def update_env(spec, env):

def update_resources(
spec,
worker_cpu_requests,
worker_cpu_limits,
worker_memory_requests,
worker_memory_limits,
cpu_requests,
cpu_limits,
memory_requests,
memory_limits,
custom_resources,
):
container = spec.get("containers")
for resource in container:
requests = resource.get("resources").get("requests")
if requests is not None:
requests["cpu"] = worker_cpu_requests
requests["memory"] = worker_memory_requests
requests["cpu"] = cpu_requests
requests["memory"] = memory_requests
limits = resource.get("resources").get("limits")
if limits is not None:
limits["cpu"] = worker_cpu_limits
limits["memory"] = worker_memory_limits
limits["cpu"] = cpu_limits
limits["memory"] = memory_limits
for k in custom_resources.keys():
limits[k] = custom_resources[k]
requests[k] = custom_resources[k]
@@ -210,10 +210,10 @@ def update_nodes(
# TODO: Eventually add head node configuration outside of template
update_resources(
spec,
cluster.config.head_cpus,
cluster.config.head_cpus,
cluster.config.head_memory,
cluster.config.head_memory,
cluster.config.head_cpu_requests,
cluster.config.head_cpu_limits,
cluster.config.head_memory_requests,
cluster.config.head_memory_limits,
cluster.config.head_extended_resource_requests,
)
else:
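
With the rename, `update_resources` no longer implies worker-only values, so the same helper serves the head-group call above. A sketch of its effect on a hand-built container spec (the `spec` dict here is fabricated for illustration):

```python
from codeflare_sdk.utils.generate_yaml import update_resources

# Minimal container spec shaped like the head/worker group pod templates.
spec = {"containers": [{"resources": {"requests": {}, "limits": {}}}]}

update_resources(
    spec,
    cpu_requests="500m",
    cpu_limits=1,
    memory_requests="2G",
    memory_limits="2G",
    custom_resources={"nvidia.com/gpu": 0},
)

# Both the requests and limits maps should now carry cpu, memory and the
# extended resource entry.
print(spec["containers"][0]["resources"])
```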
2 changes: 1 addition & 1 deletion src/codeflare_sdk/utils/pretty_print.py
@@ -136,7 +136,7 @@ def print_clusters(clusters: List[RayCluster]):
name = cluster.name
dashboard = cluster.dashboard
workers = str(cluster.workers)
memory = f"{cluster.worker_mem_min}~{cluster.worker_mem_max}"
memory = f"{cluster.worker_mem_requests}~{cluster.worker_mem_limits}"
cpu = str(cluster.worker_cpu)
gpu = str(cluster.worker_extended_resources.get("nvidia.com/gpu", 0))

6 changes: 4 additions & 2 deletions tests/e2e/local_interactive_sdk_kind_test.py
@@ -44,8 +44,10 @@ def run_local_interactives(
name=cluster_name,
namespace=self.namespace,
num_workers=1,
head_cpus="500m",
head_memory=2,
head_cpu_requests="500m",
head_cpu_limits="500m",
head_memory_requests=2,
head_memory_limits=2,
worker_cpu_requests="500m",
worker_cpu_limits=1,
worker_memory_requests=1,
6 changes: 4 additions & 2 deletions tests/e2e/mnist_raycluster_sdk_aw_kind_test.py
@@ -42,8 +42,10 @@ def run_mnist_raycluster_sdk_kind(
name="mnist",
namespace=self.namespace,
num_workers=1,
head_cpus="500m",
head_memory=2,
head_cpu_requests="500m",
head_cpu_limits="500m",
head_memory_requests=2,
head_memory_limits=2,
worker_cpu_requests="500m",
worker_cpu_limits=1,
worker_memory_requests=1,
6 changes: 4 additions & 2 deletions tests/e2e/mnist_raycluster_sdk_kind_test.py
@@ -42,8 +42,10 @@ def run_mnist_raycluster_sdk_kind(
name="mnist",
namespace=self.namespace,
num_workers=1,
head_cpus="500m",
head_memory=2,
head_cpu_requests="500m",
head_cpu_limits="500m",
head_memory_requests=2,
head_memory_limits=2,
worker_cpu_requests="500m",
worker_cpu_limits=1,
worker_memory_requests=1,
6 changes: 4 additions & 2 deletions tests/e2e/mnist_raycluster_sdk_oauth_test.py
@@ -42,8 +42,10 @@ def run_mnist_raycluster_sdk_oauth(self):
name="mnist",
namespace=self.namespace,
num_workers=1,
head_cpus="500m",
head_memory=4,
head_cpu_requests="500m",
head_cpu_limits="500m",
head_memory_requests=4,
head_memory_limits=4,
worker_cpu_requests=1,
worker_cpu_limits=1,
worker_memory_requests=1,
6 changes: 4 additions & 2 deletions tests/e2e/start_ray_cluster.py
@@ -13,8 +13,10 @@
name="mnist",
namespace=namespace,
num_workers=1,
head_cpus="500m",
head_memory=2,
head_cpu_requests="500m",
head_cpu_limits="500m",
head_memory_requests=2,
head_memory_limits=2,
worker_cpu_requests="500m",
worker_cpu_limits=1,
worker_memory_requests=1,