Skip to content

Commit 7b50b19

Browse files
committed
Create Ray Cluster SDK upgrade scenarios
1 parent 0afa252 commit 7b50b19

File tree

5 files changed

+507
-5
lines changed

5 files changed

+507
-5
lines changed

.github/workflows/olm_tests.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -122,12 +122,12 @@ jobs:
122122
BUNDLE_PUSH_OPT: "--tls-verify=false"
123123
CATALOG_PUSH_OPT: "--tls-verify=false"
124124

125-
- name: Run OLM Upgrade e2e AppWrapper creation test
125+
- name: Run OLM Pre Upgrade test scenarios
126126
run: |
127127
export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
128128
echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV
129129
set -euo pipefail
130-
go test -timeout 30m -v ./test/upgrade -run TestMNISTCreateAppWrapper -json 2>&1 | tee ${CODEFLARE_TEST_OUTPUT_DIR}/gotest.log | gotestfmt
130+
go test -timeout 30m -v ./test/upgrade -run 'TestMNISTCreateAppWrapper|TestMNISTRayClusterUp' -json 2>&1 | tee ${CODEFLARE_TEST_OUTPUT_DIR}/gotest.log | gotestfmt
131131
132132
- name: Update Operator to the built version
133133
run: |
@@ -158,12 +158,12 @@ jobs:
158158
SUBSCRIPTION_NAME: "codeflare-operator"
159159
SUBSCRIPTION_NAMESPACE: "openshift-operators"
160160

161-
- name: Run OLM Upgrade e2e Appwrapper Job status test to monitor training
161+
- name: Run OLM Post Upgrade test scenarios
162162
run: |
163163
export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
164164
echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV
165165
set -euo pipefail
166-
go test -timeout 30m -v ./test/upgrade -run TestMNISTCheckAppWrapperStatus -json 2>&1 | tee ${CODEFLARE_TEST_OUTPUT_DIR}/gotest.log | gotestfmt
166+
go test -timeout 30m -v ./test/upgrade -run 'TestMNISTCheckAppWrapperStatus|TestMnistJobSubmit' -json 2>&1 | tee ${CODEFLARE_TEST_OUTPUT_DIR}/gotest.log | gotestfmt
167167
168168
- name: Run e2e tests against built operator
169169
run: |

test/e2e/install-codeflare-sdk.sh

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,11 @@ poetry config virtualenvs.create false
99

1010
cd codeflare-sdk
1111
# Clone the CodeFlare SDK repository
12-
git clone --branch main https://github.com/project-codeflare/codeflare-sdk.git
12+
#git clone --branch main https://github.com/project-codeflare/codeflare-sdk.git
1313

14+
git clone --branch fix-get-cluster-fun https://github.com/Bobbins228/codeflare-sdk.git
1415
cd codeflare-sdk
16+
git branch
1517

1618
# Lock dependencies and install them
1719
poetry lock --no-update

test/e2e/mnist_rayjob.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
"""Submit the MNIST DDP job to the 'mnist' Ray cluster and wait for the result.

Usage: python mnist_rayjob.py <namespace>

The script looks up the Ray cluster named 'mnist' in the given namespace via
``get_cluster``, submits a DDP job running ``mnist.py``, and polls the job
status every 5 seconds until it is terminal or 900 seconds have elapsed.
It then prints the job logs, takes the cluster down, and exits with status 0
only when the job state is ``AppState.SUCCEEDED`` (1 otherwise).

Raises:
    TimeoutError: if the job has not reached a terminal state in time.
"""
import sys

from time import sleep

from torchx.specs.api import AppState, is_terminal

from codeflare_sdk.cluster.cluster import get_cluster
from codeflare_sdk.job.jobs import DDPJobDefinition

# Polling cadence and overall budget for the job, in seconds.
POLL_INTERVAL_SECONDS = 5
TIMEOUT_SECONDS = 900

# Fail with a readable message instead of an IndexError traceback when the
# namespace argument is missing; the exit status is still non-zero.
if len(sys.argv) < 2:
    sys.exit("usage: mnist_rayjob.py <namespace>")
namespace = sys.argv[1]

cluster = get_cluster('mnist', namespace)

jobdef = DDPJobDefinition(
    name="mnist",
    script="mnist.py",
    scheduler_args={"requirements": "requirements.txt"},
)
job = jobdef.submit(cluster)

# Poll until the job reaches a terminal state. `elapsed` counts only the time
# spent sleeping, so the effective timeout is approximate.
elapsed = 0
while True:
    status = job.status()
    if is_terminal(status.state):
        break
    print(status)
    if elapsed >= TIMEOUT_SECONDS:
        raise TimeoutError(f"job has timed out after waiting {TIMEOUT_SECONDS}s")
    sleep(POLL_INTERVAL_SECONDS)
    elapsed += POLL_INTERVAL_SECONDS

print(f"Job has completed: {status.state}")

print(job.logs())

cluster.down()

# sys.exit is used rather than the interactive `exit` helper, which is only
# injected by the `site` module and is not guaranteed to exist.
sys.exit(0 if status.state == AppState.SUCCEEDED else 1)

test/e2e/start_ray_cluster.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
"""Bring up the 'mnist' Ray cluster used by the upgrade test scenarios.

Usage: python start_ray_cluster.py <namespace>

Environment variables read:
    RAY_IMAGE         image passed to the cluster configuration.
    CLUSTER_HOSTNAME  when set, a "ray-dashboard" ingress on port 8265 is
                      requested for this host; when unset, no ingress
                      options are passed.
"""
import sys
import os

from time import sleep

from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration

namespace = sys.argv[1]
ray_image = os.getenv('RAY_IMAGE')
host = os.getenv('CLUSTER_HOSTNAME')

# Ingress options stay empty unless a hostname was provided.
if host is None:
    ingress_options = {}
else:
    dashboard_ingress = {
        "ingressName": "ray-dashboard",
        "port": 8265,
        "pathType": "Prefix",
        "path": "/",
        "host": host,
    }
    ingress_options = {"ingresses": [dashboard_ingress]}

# One-worker, CPU-only cluster definition.
cluster_config = ClusterConfiguration(
    name='mnist',
    namespace=namespace,
    num_workers=1,
    head_cpus='500m',
    head_memory=2,
    min_cpus='500m',
    max_cpus=1,
    min_memory=1,
    max_memory=2,
    num_gpus=0,
    instascale=False,
    image=ray_image,
    ingress_options=ingress_options,
)
cluster = Cluster(cluster_config)

# Create the cluster, then report status before and after waiting for it
# to become ready, and finally print its details.
cluster.up()
cluster.status()
cluster.wait_ready()
cluster.status()
cluster.details()

0 commit comments

Comments
 (0)