[DO NOT MERGE] Log test results to file #6627

Status: Draft. tohtana wants to merge 66 commits into base branch master from tohtana/log_run_tests.

Commits (66)
16a8cb6  log test run (tohtana, Oct 15, 2024)
3ff9cea  enable logging in workflow (tohtana, Oct 15, 2024)
101bab7  run grep regardless of pytest return code (tohtana, Oct 15, 2024)
c04d6c1  fix return code from grep (tohtana, Oct 15, 2024)
d58b427  exclude skipped tests from failure logging (tohtana, Oct 15, 2024)
5434f53  fix handling return code (tohtana, Oct 15, 2024)
3d27593  Merge branch 'master' into tohtana/log_run_tests (tohtana, Oct 16, 2024)
75fe4ad  add logging in tests (tohtana, Oct 16, 2024)
18d2da1  Merge branch 'tohtana/log_run_tests' of github.com:microsoft/DeepSpee… (tohtana, Oct 16, 2024)
1e6b3e5  Merge branch 'master' into tohtana/log_run_tests (tohtana, Oct 16, 2024)
a1c766b  disable NCCL_SOCKET_IFNAME (tohtana, Oct 16, 2024)
56febde  fix args for test func (tohtana, Oct 16, 2024)
7fab557  pin torch version (tohtana, Oct 16, 2024)
b0091a9  Merge branch 'master' into tohtana/log_run_tests (tohtana, Oct 23, 2024)
409ed6d  unpin torch version (tohtana, Oct 23, 2024)
969b7f7  Merge branch 'master' into tohtana/log_run_tests (tohtana, Oct 24, 2024)
8c4cd1d  set file path for filestore (tohtana, Oct 24, 2024)
6a7b640  use /dev/shm for filestore (tohtana, Oct 24, 2024)
9d0216a  Merge branch 'tohtana/log_run_tests' of github.com:microsoft/DeepSpee… (tohtana, Oct 24, 2024)
7508150  add info to tag (tohtana, Oct 25, 2024)
e52ca96  shorten process group timeout (tohtana, Oct 25, 2024)
58cb5a9  set device (tohtana, Oct 25, 2024)
9e64183  Run on specialized runner (loadams, Oct 25, 2024)
3fad973  set blank to NCCL_SOCKET_IFNAME (tohtana, Oct 25, 2024)
2096a1a  Merge branch 'tohtana/log_run_tests' of github.com:microsoft/DeepSpee… (tohtana, Oct 25, 2024)
6669f93  Merge branch 'master' into tohtana/log_run_tests (loadams, Oct 28, 2024)
6bef245  pass error in test to parent process (tohtana, Oct 28, 2024)
95a6426  Merge branch 'tohtana/log_run_tests' of github.com:microsoft/DeepSpee… (tohtana, Oct 28, 2024)
b143903  set timeout of closing pool (tohtana, Oct 28, 2024)
4357a6e  recreate pool when test fails (tohtana, Oct 28, 2024)
07c18c8  add log outputs (tohtana, Oct 28, 2024)
b221b5f  fix flag (tohtana, Oct 28, 2024)
fafb2d9  handle nccl error (tohtana, Oct 29, 2024)
dcb3bbd  init pg exclusively (tohtana, Oct 29, 2024)
48561fa  fix lock (tohtana, Oct 29, 2024)
616eb4d  fix removal of lock file (tohtana, Oct 29, 2024)
fa4bcec  use O_EXCL for lock (tohtana, Oct 29, 2024)
acc77d9  simplify lock (tohtana, Oct 29, 2024)
c8612d8  add random wait (tohtana, Oct 29, 2024)
65111c1  increase retry count (tohtana, Oct 29, 2024)
44fb6fe  stop using init_process_group_exclusively (tohtana, Oct 29, 2024)
a1e4eee  catch nccl init error (tohtana, Oct 29, 2024)
a1c0123  change timeout (tohtana, Oct 29, 2024)
0afe7d1  enable reuse_dist_env (tohtana, Oct 29, 2024)
3649914  set reuse_dist_env=True as default (tohtana, Oct 29, 2024)
ecc93f9  do not reuse dist env for non-daemonic process (tohtana, Oct 29, 2024)
96d520f  fix device selection for reuse dist env (tohtana, Oct 29, 2024)
f7573d1  record pool cache at every test (tohtana, Oct 29, 2024)
91fc68a  fix teadown (tohtana, Oct 29, 2024)
54bb4e6  fix condition to clean process pool (tohtana, Oct 29, 2024)
46a4ac8  fix teardown (tohtana, Oct 29, 2024)
85fa337  add condition of cleaning (tohtana, Oct 29, 2024)
4dbfb51  add test (tohtana, Oct 30, 2024)
3d6b7ea  move call to set device (tohtana, Oct 30, 2024)
65ffac9  fix world size (tohtana, Oct 30, 2024)
61409dd  Merge branch 'master' into tohtana/log_run_tests (loadams, Oct 30, 2024)
c420d42  add cleaning of global state (tohtana, Oct 30, 2024)
35ccf6c  Merge branch 'tohtana/log_run_tests' of github.com:microsoft/DeepSpee… (tohtana, Oct 30, 2024)
bebb59c  Switch version back to run on non-debug runners (loadams, Oct 31, 2024)
3a880da  Merge branch 'master' into tohtana/log_run_tests (tohtana, Oct 31, 2024)
b6da93d  Merge branch 'master' into tohtana/log_run_tests (loadams, Nov 1, 2024)
89f03af  Fix after merge (loadams, Nov 1, 2024)
5f3b63f  Fix function signature from merge conflicts (loadams, Nov 1, 2024)
e6a6705  Add mpi4py (loadams, Nov 1, 2024)
5e59b82  Merge branch 'master' into tohtana/log_run_tests (loadams, Nov 4, 2024)
bb4c5b6  Merge branch 'master' into tohtana/log_run_tests (loadams, Nov 11, 2024)
.github/workflows/cpu-torch-latest.yml (3 changes: 2 additions & 1 deletion)

@@ -48,7 +48,8 @@ jobs:
 
       - name: Unit tests
        run: |
+          TEST_LOG_FILE="/tmp/test_log_cpu_${GITHUB_RUN_ID}.log"
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
           cd tests
-          HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="2.5"
+          RUNNING_TEST_LOG_FILE=${TEST_LOG_FILE} DS_UNITTEST_FILE_STORE_DIR=/dev/shm HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="2.5"
           HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="2.5"
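`DS_UNITTEST_FILE_STORE_DIR=/dev/shm` points the tests' rendezvous file store at tmpfs (see the "use /dev/shm for filestore" commit above). The diff doesn't show how DeepSpeed's harness consumes this variable, so the following is only a minimal sketch of the `torch.distributed.FileStore` mechanism the name suggests; the function name, backend choice, and file naming are assumptions:

```python
# Sketch only, not DeepSpeed's actual harness code: rendezvous through a
# torch.distributed.FileStore rooted at DS_UNITTEST_FILE_STORE_DIR.
import os
import torch.distributed as dist

def init_pg_via_file_store(rank: int, world_size: int, run_id: str):
    store_dir = os.environ.get("DS_UNITTEST_FILE_STORE_DIR", "/tmp")
    # A unique file per test run keeps concurrent runs from colliding.
    store = dist.FileStore(os.path.join(store_dir, f"ds_store_{run_id}"), world_size)
    dist.init_process_group(backend="gloo", store=store, rank=rank, world_size=world_size)
```

Backing the store with /dev/shm keeps the rendezvous file off disk and avoids the TCP-based init path, which is plausibly why the PR switched to it after the NCCL_SOCKET_IFNAME experiments in the commit history.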
.github/workflows/nv-torch-latest-v100.yml (29 changes: 25 additions & 4 deletions)

@@ -19,7 +19,7 @@ concurrency:
 
 jobs:
   unit-tests:
-    runs-on: [self-hosted, nvidia, cu121, v100]
+    runs-on: [self-hosted, nvidia, cu121, v100] # Modified to run on the test runner
 
     steps:
       - uses: actions/checkout@v4
@@ -44,7 +44,7 @@ jobs:
 
      - name: Install deepspeed
        run: |
-          pip install .[dev,1bit,autotuning]
+          pip install .[dev,1bit,1bit-mpi,autotuning]
          ds_report
 
      - name: Python environment
@@ -55,5 +55,26 @@ jobs:
        run: |
          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
          cd tests
-          pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="2.5" --cuda_ver="12.1"
-          pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.5" --cuda_ver="12.1"
+          TEST_LOG_FILE="/tmp/test_log_${GITHUB_RUN_ID}.log"
+          echo "Running tests and logging to ${TEST_LOG_FILE}"
+          # Don't abort on a failing pytest here, so we can inspect $? and grep the log for "Failed"
+          set +e
+          pytest -s unit/comm/test_dist.py::TestDistInferenceAllReduce
+          NCCL_SOCKET_IFNAME="" DS_UNITTEST_FILE_STORE_DIR=/dev/shm RUNNING_TEST_LOG_FILE=${TEST_LOG_FILE} pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="2.5" --cuda_ver="12.1"
+          PYTEST_EXIT_CODE=$?
+          if [ $PYTEST_EXIT_CODE -ne 0 ]; then
+            # We don't clean up the log file here, to aid debugging
+            echo "pytest failed with exit code $PYTEST_EXIT_CODE"
+            exit $PYTEST_EXIT_CODE
+          fi
+          grep "Failed" ${TEST_LOG_FILE}
+          rm -f ${TEST_LOG_FILE}
+          # Do the same as above for the sequential tests
+          DS_UNITTEST_FILE_STORE_DIR=/dev/shm RUNNING_TEST_LOG_FILE=${TEST_LOG_FILE} pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.5" --cuda_ver="12.1"
+          PYTEST_EXIT_CODE=$?
+          grep "Failed" ${TEST_LOG_FILE}
+          if [ $PYTEST_EXIT_CODE -ne 0 ]; then
+            echo "pytest failed with exit code $PYTEST_EXIT_CODE"
+            exit $PYTEST_EXIT_CODE
+          fi
+          rm -f ${TEST_LOG_FILE}
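The `grep "Failed" ${TEST_LOG_FILE}` call surfaces any failure records the test hooks appended to the shared log. A standalone Python equivalent of that scan is sketched below; it assumes failure records are single lines containing the literal token "Failed", since the exact record format isn't shown in this diff:

```python
# Sketch of the "grep Failed" step. Note the semantics: grep exits nonzero when
# nothing matches, which is the pitfall the "fix return code from grep" commit
# above worked around; here a clean log is treated as success instead.
import sys

def report_failures(log_path: str) -> int:
    failures = [line.rstrip() for line in open(log_path) if "Failed" in line]
    for record in failures:
        print(record)
    return 1 if failures else 0  # nonzero only when failure records exist

if __name__ == "__main__":
    sys.exit(report_failures(sys.argv[1]))
```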
tests/conftest.py (36 changes: 35 additions & 1 deletion)

@@ -70,13 +70,47 @@ def pytest_runtest_call(item):
     item.runtest = lambda: True  # Dummy function so test is not run twice
 
 
+def write_to_log_with_lock(log_file_path: str, header: str, msg: str):
+    import fcntl
+    with open(log_file_path, 'a+') as f:
+        try:
+            fcntl.flock(f, fcntl.LOCK_EX)
+            f.write(f"{header} {msg}\n")
+            f.flush()
+        finally:
+            fcntl.flock(f, fcntl.LOCK_UN)
+
+
+dist_test_class = None
+
+
 # We allow DistributedTest to reuse distributed environments. When the last
 # test for a class is run, we want to make sure those distributed environments
 # are destroyed.
 def pytest_runtest_teardown(item, nextitem):
-    if getattr(item.cls, "reuse_dist_env", False) and not nextitem:
+    RUNNING_TEST_LOG_FILE = os.environ.get("RUNNING_TEST_LOG_FILE", "/tmp/running_test.log")
+
+    global dist_test_class
+    # The last test might not have .cls, so we record the pool_cache here
+    if item.cls is not None:
+        dist_test_class = item.cls()
+
+    def get_xdist_worker_id():
+        xdist_worker = os.environ.get('PYTEST_XDIST_WORKER', None)
+        if xdist_worker is not None:
+            xdist_worker_id = xdist_worker.replace('gw', '')
+            return int(xdist_worker_id)
+        return None
+
+    if RUNNING_TEST_LOG_FILE:
+        reuse_dist_env = getattr(item.cls, "reuse_dist_env", False)
+        write_to_log_with_lock(RUNNING_TEST_LOG_FILE, f"pytest_runtest_teardown,xdist={get_xdist_worker_id()}",
+                               f"reuse_dist_env={reuse_dist_env} nextitem={nextitem}")
+
+    if not nextitem and dist_test_class is not None and dist_test_class._pool_cache is not None:
+        for num_procs, pool in dist_test_class._pool_cache.items():
+            write_to_log_with_lock(RUNNING_TEST_LOG_FILE, f"pytest_runtest_teardown,xdist={get_xdist_worker_id()}",
+                                   f"closing pool num_procs={num_procs} nextitem={nextitem}")
+            dist_test_class._close_pool(pool, num_procs, force=True)
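With pytest-xdist, several worker processes append to the same RUNNING_TEST_LOG_FILE, so the exclusive `fcntl.flock` above keeps each record an intact line. The following is a self-contained demo of that locking pattern (demo code, not part of the PR) that hammers one file from multiple processes:

```python
# Standalone sketch of the locking pattern in write_to_log_with_lock: several
# processes append to one file under fcntl.flock, so records never interleave.
import fcntl
import os
from multiprocessing import Process

LOG = "/tmp/lock_demo.log"

def write_record(header: str, msg: str):
    with open(LOG, 'a+') as f:
        try:
            fcntl.flock(f, fcntl.LOCK_EX)  # block until we hold the exclusive lock
            f.write(f"{header} {msg}\n")
            f.flush()
        finally:
            fcntl.flock(f, fcntl.LOCK_UN)

if __name__ == "__main__":
    procs = [Process(target=write_record, args=(f"worker={i}", "teardown")) for i in range(8)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    print(open(LOG).read())  # 8 intact records, one per line
    os.remove(LOG)
```

The same reasoning explains the header string in the hook: tagging each record with the xdist worker id makes the interleaved teardown log attributable per worker when a pool hangs.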
tests/unit/comm/test_dist.py (16 changes: 4 additions & 12 deletions)

@@ -112,12 +112,7 @@ def test(self, distributed_fixture, class_tmpdir, val1, val2):
 
 class TestDistAllReduce(DistributedTest):
     device_count = get_accelerator().device_count()
-    if device_count >= 4:
-        world_size = [1, 2, 4]
-    elif device_count >= 2:
-        world_size = [1, 2]
-    else:
-        world_size = [1]
+    world_size = 2
 
     def test(self):
         x = torch.ones(1, 3).to(get_accelerator().device_name()) * (dist.get_rank() + 1)
@@ -130,20 +125,17 @@ def test(self):
 @pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16, torch.float16])
 class TestDistInferenceAllReduce(DistributedTest):
     device_count = get_accelerator().device_count()
-    if device_count >= 4:
-        world_size = [1, 2, 4]
-    elif device_count >= 2:
-        world_size = [1, 2]
-    else:
-        world_size = [1]
+    world_size = 2
 
     def test(self, dtype):
         x = torch.ones(1, 3).to(get_accelerator().device_name()) * (dist.get_rank() + 1)
         sum_of_ranks = (dist.get_world_size() * (dist.get_world_size() + 1)) // 2
         result = torch.ones(1, 3).to(get_accelerator().device_name()) * sum_of_ranks
         result = result.to(dtype)
         x = x.to(dtype)
+        print(f"Rank {dist.get_rank()} x: {x}")
         dist.inference_all_reduce(x)
+        print(f"AR Rank {dist.get_rank()} x: {x}")
         assert torch.all(x == result)
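The assertion encodes the all-reduce expectation directly: rank r contributes a tensor filled with r + 1, so summing over n ranks yields n(n+1)/2 in every element. A quick standalone check of that arithmetic for the pinned world size (and the larger size the deleted parametrization used to cover):

```python
# Sanity-check the expected all-reduce value used in the assertion above.
def expected_sum(world_size: int) -> int:
    # ranks contribute 1, 2, ..., world_size, so the sum is n(n+1)/2
    return world_size * (world_size + 1) // 2

assert expected_sum(2) == 1 + 2          # == 3, the pinned world_size in this PR
assert expected_sum(4) == 1 + 2 + 3 + 4  # == 10, covered by the old parametrization
```

Pinning `world_size = 2` trades that coverage for deterministic process counts, which is consistent with this PR's goal of making hangs and pool teardown reproducible enough to log.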