Skip to content

Commit fc4f46d

Browse files
committed
Create Ray Cluster SDK upgrade scenarios
1 parent 0afa252 commit fc4f46d

File tree

4 files changed

+499
-4
lines changed

4 files changed

+499
-4
lines changed

.github/workflows/olm_tests.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -122,12 +122,12 @@ jobs:
122122
BUNDLE_PUSH_OPT: "--tls-verify=false"
123123
CATALOG_PUSH_OPT: "--tls-verify=false"
124124

125-
- name: Run OLM Upgrade e2e AppWrapper creation test
125+
- name: Run OLM Pre Upgrade test scenarios
126126
run: |
127127
export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
128128
echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV
129129
set -euo pipefail
130-
go test -timeout 30m -v ./test/upgrade -run TestMNISTCreateAppWrapper -json 2>&1 | tee ${CODEFLARE_TEST_OUTPUT_DIR}/gotest.log | gotestfmt
130+
go test -timeout 30m -v ./test/upgrade -run 'TestMNISTCreateAppWrapper|TestMNISTRayClusterUp' -json 2>&1 | tee ${CODEFLARE_TEST_OUTPUT_DIR}/gotest.log | gotestfmt
131131
132132
- name: Update Operator to the built version
133133
run: |
@@ -158,12 +158,12 @@ jobs:
158158
SUBSCRIPTION_NAME: "codeflare-operator"
159159
SUBSCRIPTION_NAMESPACE: "openshift-operators"
160160

161-
- name: Run OLM Upgrade e2e Appwrapper Job status test to monitor training
161+
- name: Run OLM Post Upgrade test scenarios
162162
run: |
163163
export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
164164
echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV
165165
set -euo pipefail
166-
go test -timeout 30m -v ./test/upgrade -run TestMNISTCheckAppWrapperStatus -json 2>&1 | tee ${CODEFLARE_TEST_OUTPUT_DIR}/gotest.log | gotestfmt
166+
go test -timeout 30m -v ./test/upgrade -run 'TestMNISTCheckAppWrapperStatus|TestMnistJobSubmit' -json 2>&1 | tee ${CODEFLARE_TEST_OUTPUT_DIR}/gotest.log | gotestfmt
167167
168168
- name: Run e2e tests against built operator
169169
run: |

test/e2e/mnist_rayjob.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
"""Submit the MNIST DDP job to the 'mnist' Ray cluster and wait for the result.

Usage:
    python mnist_rayjob.py <namespace>

Environment variables:
    RAY_IMAGE         image passed to the cluster configuration (may be unset).
    CLUSTER_HOSTNAME  when set, a "ray-dashboard" ingress is configured for
                      this host.

The process exits with status 0 when the job ends in AppState.SUCCEEDED and
with status 1 for any other terminal state. A TimeoutError is raised if the
job has not reached a terminal state within JOB_TIMEOUT_SECONDS.
"""
import os
import sys

from time import sleep

from torchx.specs.api import AppState, is_terminal

from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration
from codeflare_sdk.job.jobs import DDPJobDefinition

# How long to wait for the job to reach a terminal state, and how often to poll.
JOB_TIMEOUT_SECONDS = 300
POLL_INTERVAL_SECONDS = 5


def _ingress_options(host):
    """Return the ingress options for the Ray dashboard, or {} without a host."""
    if host is None:
        return {}
    return {
        "ingresses": [
            {
                "ingressName": "ray-dashboard",
                "port": 8265,
                "pathType": "Prefix",
                "path": "/",
                "host": host,
            },
        ]
    }


def _wait_for_terminal_status(job, timeout=JOB_TIMEOUT_SECONDS,
                              poll_interval=POLL_INTERVAL_SECONDS):
    """Poll ``job.status()`` until its state is terminal and return that status.

    Each non-terminal status is printed so progress shows up in the test log.

    Raises:
        TimeoutError: if ``timeout`` is truthy and that many seconds of polling
            elapse without the job reaching a terminal state.
    """
    elapsed = 0
    while True:
        status = job.status()
        if is_terminal(status.state):
            return status
        print(status)
        if timeout and elapsed >= timeout:
            raise TimeoutError(f"job has timed out after waiting {timeout}s")
        sleep(poll_interval)
        elapsed += poll_interval


def main():
    """Run the job against the cluster and exit with its success status."""
    if len(sys.argv) < 2:
        # Fail with a readable message instead of a bare IndexError.
        sys.exit(f"usage: {sys.argv[0]} <namespace>")
    namespace = sys.argv[1]
    ray_image = os.getenv('RAY_IMAGE')
    host = os.getenv('CLUSTER_HOSTNAME')

    # NOTE(review): the cluster is never brought up here, so this presumably
    # targets a 'mnist' cluster that is already running (e.g. one started by
    # start_ray_cluster.py) -- confirm against the upgrade test flow.
    cluster = Cluster(ClusterConfiguration('mnist', namespace, image=ray_image,
                                           ingress_options=_ingress_options(host)))
    print(cluster.details())

    jobdef = DDPJobDefinition(
        name="mnist",
        script="mnist.py",
        scheduler_args={"requirements": "requirements.txt"},
    )
    job = jobdef.submit(cluster)

    status = _wait_for_terminal_status(job)
    print(f"Job has completed: {status.state}")

    print(job.logs())

    cluster.down()

    sys.exit(0 if status.state == AppState.SUCCEEDED else 1)


if __name__ == "__main__":
    main()

test/e2e/start_ray_cluster.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
import sys
import os

from time import sleep

from torchx.specs.api import AppState, is_terminal

from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration

# Script inputs: the target namespace comes from the first CLI argument, the
# Ray image and the optional dashboard hostname from the environment.
target_namespace = sys.argv[1]
image_ref = os.getenv('RAY_IMAGE')
dashboard_host = os.getenv('CLUSTER_HOSTNAME')

# An ingress for the Ray dashboard is only configured when a hostname is given.
if dashboard_host is None:
    ingress_settings = {}
else:
    dashboard_ingress = {
        "ingressName": "ray-dashboard",
        "port": 8265,
        "pathType": "Prefix",
        "path": "/",
        "host": dashboard_host,
    }
    ingress_settings = {"ingresses": [dashboard_ingress]}

# Configuration of the single-worker, CPU-only 'mnist' cluster.
cluster_settings = dict(
    name='mnist',
    namespace=target_namespace,
    num_workers=1,
    head_cpus='500m',
    head_memory=2,
    min_cpus='500m',
    max_cpus=1,
    min_memory=1,
    max_memory=2,
    num_gpus=0,
    instascale=False,
    image=image_ref,
    ingress_options=ingress_settings,
)
mnist_cluster = Cluster(ClusterConfiguration(**cluster_settings))

# Bring the cluster up and query its status before and after waiting for it
# to become ready, then query its details.
mnist_cluster.up()
mnist_cluster.status()

mnist_cluster.wait_ready()
mnist_cluster.status()

mnist_cluster.details()

0 commit comments

Comments
 (0)