[RHOAIENG-9004] Adjust e2e tests to use GPU #583

Merged · 2 commits · Jul 11, 2024

37 changes: 12 additions & 25 deletions .github/workflows/e2e_tests.yaml
@@ -24,29 +24,15 @@ concurrency:
  group: ${{ github.head_ref }}-${{ github.workflow }}
  cancel-in-progress: true

+env:
+  CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
+
jobs:
  kubernetes:

-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-20.04-4core-gpu

    steps:
-      - name: Cleanup
-        run: |
-          ls -lart
-          echo "Initial status:"
-          df -h
-          echo "Cleaning up resources:"
-          sudo swapoff -a
-          sudo rm -f /swapfile
-          sudo apt clean
-          sudo rm -rf /usr/share/dotnet
-          sudo rm -rf /opt/ghc
-          sudo rm -rf "/usr/local/share/boost"
-          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
-          docker rmi $(docker image ls -aq)
-          echo "Final status:"
-          df -h
-
      - name: Checkout code
        uses: actions/checkout@v4
        with:
@@ -82,19 +68,23 @@ jobs:
          python-version: '3.9'
          cache: 'pip' # caching pip dependencies

+      - name: Setup NVidia GPU environment for KinD
+        uses: ./common/github-actions/nvidia-gpu-setup
+
      - name: Setup and start KinD cluster
        uses: ./common/github-actions/kind

+      - name: Install NVidia GPU operator for KinD
+        uses: ./common/github-actions/nvidia-gpu-operator
+
      - name: Deploy CodeFlare stack
        id: deploy
        run: |
          cd codeflare-operator
          echo Setting up CodeFlare stack
          make setup-e2e
          echo Deploying CodeFlare operator
-          IMG="${REGISTRY_ADDRESS}"/codeflare-operator
-          make image-push -e IMG="${IMG}"
-          make deploy -e IMG="${IMG}" -e ENV="e2e"
+          make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
          kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
          cd ..

@@ -103,9 +93,6 @@ jobs:
        with:
          user-name: sdk-user

-      - name: Add kueue resources
-        run: kubectl apply --server-side -f "https://github.com/kubernetes-sigs/kueue/releases/download/v0.6.2/manifests.yaml"
-
      - name: Configure RBAC for sdk user with limited permissions
        run: |
          kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses
@@ -135,7 +122,7 @@ jobs:
          pip install poetry
          poetry install --with test,docs
          echo "Running e2e tests..."
-          poetry run pytest -v -s ./tests/e2e -m kind > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
+          poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
        env:
          GRPC_DNS_RESOLVER: "native"

20 changes: 18 additions & 2 deletions docs/e2e.md
@@ -5,6 +5,9 @@
## On KinD clusters
Pre-requisite for KinD clusters: please add in your local `/etc/hosts` file `127.0.0.1 kind`. This will map your localhost IP address to the KinD cluster's hostname. This is already performed on [GitHub Actions](https://github.com/project-codeflare/codeflare-common/blob/1edd775e2d4088a5a0bfddafb06ff3a773231c08/github-actions/kind/action.yml#L70-L72)

+If the system you run on contains an NVidia GPU, you can enable GPU support in KinD; this also allows you to run the GPU tests.
+To enable GPU on KinD, follow [these instructions](https://www.substratus.ai/blog/kind-with-gpus).
+
- Setup Phase:
  - Pull the [codeflare-operator repo](https://github.com/project-codeflare/codeflare-operator) and run the following make targets:
  ```
@@ -64,9 +67,13 @@
- Once we have the codeflare-operator, kuberay-operator and kueue running and ready, we can run the e2e test on the codeflare-sdk repository:
  ```
  poetry install --with test,docs
-  poetry run pytest -v -s ./tests/e2e/mnist_raycluster_sdk_test.py
+  poetry run pytest -v -s ./tests/e2e/mnist_raycluster_sdk_kind_test.py
  ```
+- If the cluster doesn't have NVidia GPU support, disable the NVidia GPU tests by providing the proper marker:
+  ```
+  poetry install --with test,docs
+  poetry run pytest -v -s ./tests/e2e/mnist_raycluster_sdk_kind_test.py -m 'kind and not nvidia_gpu'
+  ```



## On OpenShift clusters
@@ -83,6 +90,10 @@
  kubectl apply --server-side -k "github.com/opendatahub-io/kueue/config/rhoai?ref=dev"
  ```

+If the system you run on contains an NVidia GPU, you can enable GPU support on OpenShift; this also allows you to run the GPU tests.
+To enable GPU on OpenShift, follow [these instructions](https://docs.nvidia.com/datacenter/cloud-native/openshift/latest/introduction.html).
+Currently the SDK doesn't support tolerations, so e2e tests can't be executed on nodes with a taint (e.g. a GPU taint).
+
- Test Phase:
  - Once we have the codeflare-operator, kuberay-operator and kueue running and ready, we can run the e2e test on the codeflare-sdk repository:
  ```
@@ -97,3 +108,8 @@
  ```
  poetry run pytest -v -s ./tests/e2e -m openshift --timeout=1200
  ```
+- If the cluster doesn't have NVidia GPU support, or the GPU nodes are tainted, disable the NVidia GPU tests by providing the proper marker:
+  ```
+  poetry install --with test,docs
+  poetry run pytest -v -s ./tests/e2e -m 'openshift and not nvidia_gpu' --timeout=1200
+  ```
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -52,6 +52,7 @@ filterwarnings = [
]
markers = [
    "kind",
-    "openshift"
+    "openshift",
+    "nvidia_gpu"
]
addopts = "--timeout=900"
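
Registering `nvidia_gpu` alongside `kind` and `openshift` is what lets the `-m` expressions used in the workflow and docs above select or skip the GPU tests without pytest warning about unknown marks. A minimal sketch of how the marker drives collection (these test names are hypothetical, not part of this PR):

```
import pytest

@pytest.mark.kind
def test_cpu_only():
    assert True  # collected by -m 'kind and not nvidia_gpu'

@pytest.mark.kind
@pytest.mark.nvidia_gpu
def test_needs_gpu():
    assert True  # collected by -m 'kind and nvidia_gpu'
```

With these markers, `pytest -m 'kind and nvidia_gpu'` runs only `test_needs_gpu`, while `-m 'kind and not nvidia_gpu'` runs only `test_cpu_only`.
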
16 changes: 13 additions & 3 deletions tests/e2e/local_interactive_sdk_kind_test.py
@@ -27,7 +27,16 @@ def test_local_interactives(self):
        create_kueue_resources(self)
        self.run_local_interactives()

-    def run_local_interactives(self):
+    @pytest.mark.nvidia_gpu
+    def test_local_interactives_nvidia_gpu(self):
+        self.setup_method()
+        create_namespace(self)
+        create_kueue_resources(self)
+        self.run_local_interactives(number_of_gpus=1)
+
+    def run_local_interactives(
+        self, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
+    ):
        ray_image = get_ray_image()

        cluster_name = "test-ray-cluster-li"
@@ -43,6 +52,7 @@ def run_local_interactives(self):
            worker_cpu_limits=1,
            worker_memory_requests=1,
            worker_memory_limits=2,
+            worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
            image=ray_image,
            write_to_file=True,
            verify_tls=False,
@@ -59,7 +69,7 @@ def run_local_interactives(self):
        ray.shutdown()
        ray.init(address=cluster.local_client_url(), logging_level="DEBUG")

-        @ray.remote
+        @ray.remote(num_gpus=number_of_gpus / 2)
        def heavy_calculation_part(num_iterations):
            result = 0.0
            for i in range(num_iterations):
@@ -68,7 +78,7 @@ def heavy_calculation_part(num_iterations):
                    result += math.sin(i) * math.cos(j) * math.tan(k)
            return result

-        @ray.remote
+        @ray.remote(num_gpus=number_of_gpus / 2)
        def heavy_calculation(num_iterations):
            results = ray.get(
                [heavy_calculation_part.remote(num_iterations // 30) for _ in range(30)]
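
A note on the `num_gpus=number_of_gpus / 2` arguments above: in the GPU variant each of the two task types claims half a GPU, so both fit onto the single GPU the runner provides (with the default `number_of_gpus=0` the argument is a no-op). Ray treats GPU requests as logical scheduling tokens rather than hardware partitions; a rough sketch of that behaviour, assuming a Ray cluster that advertises one GPU:

```
import ray

ray.init()  # assumes the connected cluster advertises one GPU

@ray.remote(num_gpus=0.5)
def half_gpu_task():
    # Ray sets CUDA_VISIBLE_DEVICES for the assigned GPU; a fractional
    # request is a scheduling hint, not hardware isolation.
    return ray.get_gpu_ids()

# Both 0.5-GPU tasks can be placed on the same physical GPU concurrently.
print(ray.get([half_gpu_task.remote(), half_gpu_task.remote()]))
```
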
5 changes: 4 additions & 1 deletion tests/e2e/mnist.py
Expand Up @@ -32,6 +32,9 @@
print("MASTER_ADDR: is ", os.getenv("MASTER_ADDR"))
print("MASTER_PORT: is ", os.getenv("MASTER_PORT"))

print("ACCELERATOR: is ", os.getenv("ACCELERATOR"))
ACCELERATOR = os.getenv("ACCELERATOR")


class LitMNIST(LightningModule):
def __init__(self, data_dir=PATH_DATASETS, hidden_size=64, learning_rate=2e-4):
@@ -149,7 +152,7 @@ def test_dataloader(self):

    # Initialize a trainer
    trainer = Trainer(
-        accelerator="auto",
+        accelerator=ACCELERATOR,
        # devices=1 if torch.cuda.is_available() else None, # limiting got iPython runs
        max_epochs=3,
        callbacks=[TQDMProgressBar(refresh_rate=20)],
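
Replacing `accelerator="auto"` with the `ACCELERATOR` environment variable moves the CPU/GPU decision from Lightning's autodetection to the test that submits the job, so a CPU run cannot silently grab a GPU and a GPU run fails loudly when none is available. A minimal sketch of the pattern (the `"auto"` fallback here is a hypothetical safety net, not part of this change):

```
import os
from pytorch_lightning import Trainer

# "cpu" or "gpu", injected through the Ray job's runtime_env
# by the tests later in this diff.
ACCELERATOR = os.getenv("ACCELERATOR", "auto")

trainer = Trainer(accelerator=ACCELERATOR, max_epochs=3)
```
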
29 changes: 20 additions & 9 deletions tests/e2e/mnist_raycluster_sdk_aw_kind_test.py
@@ -24,9 +24,18 @@ def test_mnist_ray_cluster_sdk_kind(self):
        self.setup_method()
        create_namespace(self)
        create_kueue_resources(self)
-        self.run_mnist_raycluster_sdk_kind()
+        self.run_mnist_raycluster_sdk_kind(accelerator="cpu")

-    def run_mnist_raycluster_sdk_kind(self):
+    @pytest.mark.nvidia_gpu
+    def test_mnist_ray_cluster_sdk_kind_nvidia_gpu(self):
+        self.setup_method()
+        create_namespace(self)
+        create_kueue_resources(self)
+        self.run_mnist_raycluster_sdk_kind(accelerator="gpu", number_of_gpus=1)
+
+    def run_mnist_raycluster_sdk_kind(
+        self, accelerator, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
+    ):
        ray_image = get_ray_image()

        cluster = Cluster(
@@ -36,11 +45,11 @@ def run_mnist_raycluster_sdk_kind(self):
            num_workers=1,
            head_cpus="500m",
            head_memory=2,
-            min_cpus="500m",
-            max_cpus=1,
-            min_memory=1,
-            max_memory=2,
-            num_gpus=0,
+            worker_cpu_requests="500m",
+            worker_cpu_limits=1,
+            worker_memory_requests=1,
+            worker_memory_limits=4,
+            worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
            image=ray_image,
            write_to_file=True,
            verify_tls=False,
@@ -58,11 +67,11 @@ def run_mnist_raycluster_sdk_kind(self):

        cluster.details()

-        self.assert_jobsubmit_withoutlogin_kind(cluster)
+        self.assert_jobsubmit_withoutlogin_kind(cluster, accelerator, number_of_gpus)

    # Assertions

-    def assert_jobsubmit_withoutlogin_kind(self, cluster):
+    def assert_jobsubmit_withoutlogin_kind(self, cluster, accelerator, number_of_gpus):
        ray_dashboard = cluster.cluster_dashboard_uri()
        client = RayJobClient(address=ray_dashboard, verify=False)

@@ -71,7 +80,9 @@ def assert_jobsubmit_withoutlogin_kind(self, cluster):
            runtime_env={
                "working_dir": "./tests/e2e/",
                "pip": "./tests/e2e/mnist_pip_requirements.txt",
+                "env_vars": {"ACCELERATOR": accelerator},
            },
+            entrypoint_num_gpus=number_of_gpus,
        )
        print(f"Submitted job with ID: {submission_id}")
        done = False
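
The submission change above does two things at once: `env_vars` forwards the accelerator choice into `mnist.py`, and `entrypoint_num_gpus` asks Ray to reserve GPU capacity for the driver command. The SDK's `RayJobClient` passes these through to Ray's job submission API; a trimmed sketch of the equivalent call against plain Ray (the dashboard address is a placeholder):

```
from ray.job_submission import JobSubmissionClient

client = JobSubmissionClient("http://127.0.0.1:8265")  # placeholder address

submission_id = client.submit_job(
    entrypoint="python mnist.py",
    runtime_env={
        "working_dir": "./tests/e2e/",
        "pip": "./tests/e2e/mnist_pip_requirements.txt",
        "env_vars": {"ACCELERATOR": "gpu"},  # read by mnist.py at startup
    },
    entrypoint_num_gpus=1,  # reserve one GPU for the entrypoint process
)
print(f"Submitted job with ID: {submission_id}")
```
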
22 changes: 17 additions & 5 deletions tests/e2e/mnist_raycluster_sdk_kind_test.py
@@ -25,9 +25,18 @@ def test_mnist_ray_cluster_sdk_kind(self):
        self.setup_method()
        create_namespace(self)
        create_kueue_resources(self)
-        self.run_mnist_raycluster_sdk_kind()
+        self.run_mnist_raycluster_sdk_kind(accelerator="cpu")

-    def run_mnist_raycluster_sdk_kind(self):
+    @pytest.mark.nvidia_gpu
+    def test_mnist_ray_cluster_sdk_kind_nvidia_gpu(self):
+        self.setup_method()
+        create_namespace(self)
+        create_kueue_resources(self)
+        self.run_mnist_raycluster_sdk_kind(accelerator="gpu", number_of_gpus=1)
+
+    def run_mnist_raycluster_sdk_kind(
+        self, accelerator, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
+    ):
        ray_image = get_ray_image()

        cluster = Cluster(
@@ -40,7 +49,8 @@ def run_mnist_raycluster_sdk_kind(self):
            worker_cpu_requests="500m",
            worker_cpu_limits=1,
            worker_memory_requests=1,
-            worker_memory_limits=2,
+            worker_memory_limits=4,
+            worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
            image=ray_image,
            write_to_file=True,
            verify_tls=False,
@@ -57,11 +67,11 @@ def run_mnist_raycluster_sdk_kind(self):

        cluster.details()

-        self.assert_jobsubmit_withoutlogin_kind(cluster)
+        self.assert_jobsubmit_withoutlogin_kind(cluster, accelerator, number_of_gpus)

    # Assertions

-    def assert_jobsubmit_withoutlogin_kind(self, cluster):
+    def assert_jobsubmit_withoutlogin_kind(self, cluster, accelerator, number_of_gpus):
        ray_dashboard = cluster.cluster_dashboard_uri()
        client = RayJobClient(address=ray_dashboard, verify=False)

@@ -70,7 +80,9 @@ def assert_jobsubmit_withoutlogin_kind(self, cluster):
            runtime_env={
                "working_dir": "./tests/e2e/",
                "pip": "./tests/e2e/mnist_pip_requirements.txt",
+                "env_vars": {"ACCELERATOR": accelerator},
            },
+            entrypoint_num_gpus=number_of_gpus,
        )
        print(f"Submitted job with ID: {submission_id}")
        done = False
2 changes: 1 addition & 1 deletion tests/e2e/support.py
@@ -90,7 +90,7 @@ def create_cluster_queue(self, cluster_queue, flavor):
                    "resources": [
                        {"name": "cpu", "nominalQuota": 9},
                        {"name": "memory", "nominalQuota": "36Gi"},
-                        {"name": "nvidia.com/gpu", "nominalQuota": 0},
+                        {"name": "nvidia.com/gpu", "nominalQuota": 1},
                    ],
                }
            ],
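
Raising the `nvidia.com/gpu` quota from 0 to 1 is what allows Kueue to admit the GPU tests at all: a workload is admitted only when every resource it requests fits within the ClusterQueue's nominal quota, so with a quota of 0 any RayCluster requesting a GPU would stay queued indefinitely. For context, the resource group built by `create_cluster_queue` looks roughly like this (the flavor name is illustrative):

```
resource_group = {
    "coveredResources": ["cpu", "memory", "nvidia.com/gpu"],
    "flavors": [
        {
            "name": "default-flavor",  # illustrative flavor name
            "resources": [
                {"name": "cpu", "nominalQuota": 9},
                {"name": "memory", "nominalQuota": "36Gi"},
                # One GPU of quota is enough to admit a single worker
                # requesting nvidia.com/gpu: 1.
                {"name": "nvidia.com/gpu", "nominalQuota": 1},
            ],
        }
    ],
}
```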