From 9c593619f363fda422375b386674d558fa48d437 Mon Sep 17 00:00:00 2001 From: Neo Zhang Date: Wed, 3 Jul 2024 11:20:54 +0800 Subject: [PATCH] fix multiple gpu, add device choose mode, update the guide for usages --- README-sycl.md | 81 ++++-- examples/sycl/CMakeLists.txt | 4 + examples/sycl/win-run-llama2.bat | 2 +- ggml/include/ggml-sycl.h | 4 +- ggml/src/ggml-sycl.cpp | 332 +++++---------------- ggml/src/ggml-sycl/common.cpp | 448 +++++++++++++++++++++++++++++ ggml/src/ggml-sycl/common.hpp | 261 ++++------------- ggml/src/ggml-sycl/dpct/helper.hpp | 78 ++--- src/llama.cpp | 11 +- 9 files changed, 704 insertions(+), 517 deletions(-) diff --git a/README-sycl.md b/README-sycl.md index 885983e92277e..e7b3d7752d35b 100644 --- a/README-sycl.md +++ b/README-sycl.md @@ -296,15 +296,25 @@ Similar to the native `sycl-ls`, available SYCL devices can be queried as follow A example of such log in a system with 1 *intel CPU* and 1 *intel GPU* can look like the following: ``` found 6 SYCL devices: -| | | |Compute |Max compute|Max work|Max sub| | -|ID| Device Type| Name|capability|units |group |group |Global mem size| -|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------| -| 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136| -| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216| -| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136| -| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216| -| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616| -| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616| +Part1: +|ID| Device Type| Ver| Name|Global mem size| +|--|-------------------|----|---------------------------------------|---------------| +| 0| [level_zero:gpu:0]| 1.3| Intel Data Center GPU Flex 170| 16225M| +| 1| 
[level_zero:gpu:1]| 1.3| Intel Data Center GPU Flex 170| 16225M| +| 2| [opencl:gpu:0]| 3.0| Intel Data Center GPU Flex 170| 16225M| +| 3| [opencl:gpu:1]| 3.0| Intel Data Center GPU Flex 170| 16225M| +| 4| [opencl:cpu:0]| 3.0| Intel Xeon Gold 6346 CPU @ 3.10GHz| 540700M| +| 5| [opencl:acc:0]| 1.2| Intel FPGA Emulation Device| 540700M| +Part2: +|ID|Max compute units|Max work group|Max subgroup| Driver version| +|--|-----------------|--------------|------------|----------------------------------| +| 0| 512| 1024| 32| 1.3.27642| +| 1| 512| 1024| 32| 1.3.27642| +| 2| 512| 1024| 32| 23.43.27642.40| +| 3| 512| 1024| 32| 23.43.27642.40| +| 4| 64| 8192| 64|2024.17.5.0.08_160000.xmain-hotfix| +| 5| 64| 67108864| 64|2024.17.5.0.08_160000.xmain-hotfix| + ``` | Attribute | Note | @@ -477,15 +487,24 @@ build\bin\ls-sycl-device.exe The output of this command in a system with 1 *intel CPU* and 1 *intel GPU* would look like the following: ``` found 6 SYCL devices: -| | | |Compute |Max compute|Max work|Max sub| | -|ID| Device Type| Name|capability|units |group |group |Global mem size| -|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------| -| 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136| -| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216| -| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136| -| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216| -| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616| -| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616| +Part1: +|ID| Device Type| Ver| Name|Global mem size| +|--|-------------------|----|---------------------------------------|---------------| +| 0| [level_zero:gpu:0]| 1.3| Intel Data Center GPU Flex 170| 16225M| +| 1| [level_zero:gpu:1]| 1.3| Intel Data Center GPU 
Flex 170| 16225M| +| 2| [opencl:gpu:0]| 3.0| Intel Data Center GPU Flex 170| 16225M| +| 3| [opencl:gpu:1]| 3.0| Intel Data Center GPU Flex 170| 16225M| +| 4| [opencl:cpu:0]| 3.0| Intel Xeon Gold 6346 CPU @ 3.10GHz| 540700M| +| 5| [opencl:acc:0]| 1.2| Intel FPGA Emulation Device| 540700M| +Part2: +|ID|Max compute units|Max work group|Max subgroup| Driver version| +|--|-----------------|--------------|------------|----------------------------------| +| 0| 512| 1024| 32| 1.3.27642| +| 1| 512| 1024| 32| 1.3.27642| +| 2| 512| 1024| 32| 23.43.27642.40| +| 3| 512| 1024| 32| 23.43.27642.40| +| 4| 64| 8192| 64|2024.17.5.0.08_160000.xmain-hotfix| +| 5| 64| 67108864| 64|2024.17.5.0.08_160000.xmain-hotfix| ``` @@ -556,6 +575,32 @@ use 1 SYCL GPUs: [0] with Max compute units:512 |-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------| | GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG | | ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.
Recommended to use when --split-mode = layer |
+| GGML_SYCL_VISIBLE_DEVICES|id1,id2,...|Like `CUDA_VISIBLE_DEVICES`, defines the list of SYCL device IDs that are visible. Like "0", "0,2", "2,1" |
+| ONEAPI_DEVICE_SELECTOR|Refer to [oneapi-device-selector](https://intel.github.io/llvm-docs/EnvironmentVariables.html#oneapi-device-selector)|Used to limit the choice of devices available when the SYCL-using application is run|
+
+##### Choose SYCL Devices in Running Time
+
+At SYCL runtime, a physical device can be mapped to two logical devices by different runtimes: Level-Zero and OpenCL. So more devices will show up in the SYCL view. But we need to avoid running code on the two logical devices of the same physical device at the same time.
+
+The SYCL backend supports dGPU and iGPU in the same machine.
+
+##### SYCL Backend Rule:
+
+|Mode|Explain|Example|Recommended Cases|Note|
+|-|-|-|-|-|
+|Normal|Use the most powerful devices. Default mode. No special setting.
The SYCL backend will detect and choose the **Level-Zero** devices with the highest `Max compute units`.<br>
||Most cases for normal users.||
+|Advanced|Allows the user to choose one or more SYCL devices, which could be Level-Zero, OpenCL, or both.<br>
Set the device list via the environment variable **GGML_SYCL_VISIBLE_DEVICES**, like `CUDA_VISIBLE_DEVICES`.<br>
SYCL backend will choose all devices by it.| `set/export GGML_SYCL_VISIBLE_DEVICES=1`
`set/export GGML_SYCL_VISIBLE_DEVICES=0,1`
`set/export GGML_SYCL_VISIBLE_DEVICES=2,1`|Use iGPU or both in dGPU + iGPU environment
Use a dGPU in a multiple-dGPU environment.<br>
Use one or more OpenCL devices|There is a known issue with OpenCL devices. WIP.|
+|Developer|Allows a SYCL developer to choose one or more SYCL devices via the environment variable **ONEAPI_DEVICE_SELECTOR** with a flexible grammar.<br>
Refer to [oneapi-device-selector](https://intel.github.io/llvm-docs/EnvironmentVariables.html#oneapi-device-selector).|`set/export ONEAPI_DEVICE_SELECTOR=level_zero:1`
`set/export ONEAPI_DEVICE_SELECTOR=opencl:*`
`set/export ONEAPI_DEVICE_SELECTOR=opencl:gpu;level_zero:gpu`
|Covers the Advanced mode. It will impact the **Normal** and **Advanced** modes because it operates at a lower level.<br>
The flexible grammar supports more complex device environments.|There is a known issue with OpenCL devices. WIP.|
+
+##### Parameters of Llama.cpp
+
+The device-selection parameters of llama.cpp work together with the SYCL backend rule to decide the final result. The user could use one or all of the devices chosen by the SYCL backend rule.
+
+|Device|Values|Note|
+|-|-|-|
+|Single Device|`--split-mode=none` and `--main-gpu=id`|The value of `main-gpu` must be in the chosen device list printed out during llama.cpp startup. Like:<br>
`detect 2 SYCL level-zero GPUs:[0,1]`.
`main-gpu` should be set to `0` or `1`.| +|Multiple Device|`--split-mode=layer`|Default| + ## Known Issues diff --git a/examples/sycl/CMakeLists.txt b/examples/sycl/CMakeLists.txt index e4d5083e6e502..ab26691fe7863 100644 --- a/examples/sycl/CMakeLists.txt +++ b/examples/sycl/CMakeLists.txt @@ -2,6 +2,10 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: MIT +add_compile_options(-I${PROJECT_SOURCE_DIR}/ggml) +add_compile_options(-I${PROJECT_SOURCE_DIR}/ggml/src) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl") + set(TARGET llama-ls-sycl-device) add_executable(${TARGET} ls-sycl-device.cpp) install(TARGETS ${TARGET} RUNTIME) diff --git a/examples/sycl/win-run-llama2.bat b/examples/sycl/win-run-llama2.bat index 1d4d7d2cdcb6f..45414863c2dd4 100644 --- a/examples/sycl/win-run-llama2.bat +++ b/examples/sycl/win-run-llama2.bat @@ -6,6 +6,6 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:" @call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force -.\build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 33 -s 0 +.\build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 33 -s 0 diff --git a/ggml/include/ggml-sycl.h b/ggml/include/ggml-sycl.h index 652de95209da6..a50086afae9eb 100644 --- a/ggml/include/ggml-sycl.h +++ b/ggml/include/ggml-sycl.h @@ -35,11 +35,9 @@ GGML_API GGML_CALL int ggml_backend_sycl_get_device_count(); GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total); GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id); +GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int index); GGML_API GGML_CALL void ggml_sycl_set_single_device(int main_gpu_id); -// GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id); -// GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode(); - // SYCL doesn't support registering host memory, keep here for reference // GGML_API 
GGML_CALL bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size); // GGML_API GGML_CALL void ggml_backend_sycl_unregister_host_buffer(void * buffer); diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp index 68ebe8bf939e4..71e2b59ec36bb 100644 --- a/ggml/src/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl.cpp @@ -39,7 +39,7 @@ #include "ggml-sycl/backend.hpp" #include "ggml-sycl/presets.hpp" -bool ggml_sycl_loaded(void); + void ggml_sycl_free_data(struct ggml_tensor * tensor); void ggml_sycl_copy_to_device(struct ggml_tensor * tensor); void ggml_sycl_set_main_device(int main_device); @@ -48,92 +48,7 @@ void ggml_sycl_get_device_description(int device, char * description, size_t d bool ggml_backend_is_sycl(ggml_backend_t backend); int ggml_backend_sycl_get_device(ggml_backend_t backend); static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer); -static inline int get_sycl_env(const char *env_name, int default_val); -static inline int get_work_group_size(const sycl::device& device); - -static bool g_sycl_loaded = false; - -bool ggml_sycl_loaded(void) { - return g_sycl_loaded; -} - -int get_sycl_env(const char *env_name, int default_val) { - char *user_device_string = getenv(env_name); - int user_number = default_val; - - unsigned n; - if (user_device_string != NULL && - sscanf(user_device_string, " %u", &n) == 1) { - user_number = (int)n; - } else { - user_number = default_val; - } - return user_number; -} - -static ggml_sycl_device_info ggml_sycl_init() try { - static bool initialized = false; - - if (!initialized) { - fprintf(stderr, "[SYCL] call ggml_init_sycl\n"); - - g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0); - fprintf(stderr, "%s: GGML_SYCL_DEBUG: %d\n", __func__, - g_ggml_sycl_debug); - -#if defined(GGML_SYCL_F16) - fprintf(stderr, "%s: GGML_SYCL_F16: yes\n", __func__); -#else - fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__); -#endif - -#if defined(GGML_SYCL_FORCE_MMQ) - fprintf(stderr, "%s: 
GGML_SYCL_FORCE_MMQ: yes\n", __func__); -#else - fprintf(stderr, "%s: GGML_SYCL_FORCE_MMQ: no\n", __func__); -#endif -#if defined(SYCL_USE_XMX) - fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__); -#else - fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__); -#endif - - if (CHECK_TRY_ERROR(g_all_sycl_device_count = - dpct::dev_mgr::instance().device_count()) != - 0) { - initialized = true; - g_sycl_loaded = false; - return; - } - GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES); - ggml_backend_sycl_print_sycl_devices(); - initialized = true; - g_sycl_loaded = true; - } - - static ggml_sycl_device_info info = {}; - info.refresh_device(SYCL_MUL_GPU_MODE); - - if (info.device_count == 0) { - fprintf(stderr, "%s: failed to initialize " GGML_SYCL_NAME ": %s\n", - __func__); - return info; - } - GGML_ASSERT(info.device_count <= GGML_SYCL_MAX_DEVICES); - - return info; -} catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - - -ggml_sycl_device_info &ggml_sycl_info() { - static ggml_sycl_device_info info = ggml_sycl_init(); - return info; -} void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst, const void *ptr_src, size_t size) { @@ -2099,125 +2014,17 @@ static void im2col_sycl(const float *x, T *dst, int IW, int IH, } } - -void print_device_detail(int id, sycl::device &device, std::string device_type) { - - dpct::device_info prop; - SYCL_CHECK(CHECK_TRY_ERROR( - dpct::get_device_info(prop, device))); - - std::string version; - version += std::to_string(prop.get_major_version()); - version += "."; - version += std::to_string(prop.get_minor_version()); - - device_type = std::regex_replace(device_type, std::regex("ext_oneapi_"), ""); - std::string name = std::string(prop.get_name()); - name = std::regex_replace(name, std::regex("\\(R\\)"), ""); - name = std::regex_replace(name, std::regex("\\(TM\\)"), ""); - - auto 
global_mem_size = prop.get_global_mem_size()/1000000; - - fprintf(stderr, "|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|\n", id, device_type.c_str(), - name.c_str(), version.c_str(), prop.get_max_compute_units(), - prop.get_max_work_group_size(), prop.get_max_sub_group_size(), - global_mem_size, device.get_info().c_str()); -} - -void ggml_backend_sycl_print_sycl_devices() { - GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_print_sycl_devices\n"); - int device_count = dpct::dev_mgr::instance().device_count(); - std::map DeviceNums; - fprintf(stderr, "found %d SYCL devices:\n", device_count); - fprintf(stderr, "| | | | |Max | |Max |Global | |\n"); - fprintf(stderr, "| | | | |compute|Max work|sub |mem | |\n"); - fprintf(stderr, "|ID| Device Type| Name|Version|units |group |group|size | Driver version|\n"); - fprintf(stderr, "|--|-------------------|---------------------------------------|-------|-------|--------|-----|-------|---------------------|\n"); - for (int id = 0; id < device_count; ++id) { - sycl::device device = dpct::dev_mgr::instance().get_device(id); - sycl::backend backend = device.get_backend(); - std::string backend_type = get_device_backend_and_type(device); - int type_id=DeviceNums[backend_type]++; - std::stringstream device_type; - device_type << "[" << backend_type << ":" << std::to_string(type_id) << "]"; - print_device_detail(id, device, device_type.str()); - } -} - -static inline int get_sycl_env(const char *env_name, int default_val) { - char *user_device_string = getenv(env_name); - int user_number = default_val; - - unsigned n; - if (user_device_string != NULL && - sscanf(user_device_string, " %u", &n) == 1) { - user_number = (int)n; - } else { - user_number = default_val; +inline void check_allow_device_id(const int device_id) { + if (ggml_sycl_info().device_count<1) { + fprintf(stderr, "%s: not detect any SYCL devices, check GPU driver or unset GGML_SYCL_VISIBLE_DEVICES and ONEAPI_DEVICE_SELECTOR\n", __func__); + exit(1); } - return 
user_number; -} - -static void ggml_check_sycl() try { - static bool initialized = false; - - if (!initialized) { - fprintf(stderr, "[SYCL] call ggml_check_sycl\n"); - g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0); - - fprintf(stderr, "%s: GGML_SYCL_DEBUG: %d\n", __func__, g_ggml_sycl_debug); - -#if defined(GGML_SYCL_F16) - fprintf(stderr, "%s: GGML_SYCL_F16: yes\n", __func__); -#else - fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__); -#endif - -/* NOT REMOVE, keep it for next optimize for XMX. -#if defined(SYCL_USE_XMX) - fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__); -#else - fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__); -#endif -*/ - - if (CHECK_TRY_ERROR(g_all_sycl_device_count = - dpct::dev_mgr::instance().device_count()) != 0) { - initialized = true; - g_sycl_loaded = false; - return; - } - GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES); - ggml_backend_sycl_print_sycl_devices(); - initialized = true; - g_sycl_loaded = true; + if (!ggml_sycl_info().is_allowed_device(device_id)) { + fprintf(stderr, "%s: device_id:%d is out of range [%s]. 
To use any SYCL devices, set/export GGML_SYCL_VISIBLE_DEVICES or ONEAPI_DEVICE_SELECTOR\n", + __func__, device_id, ggml_sycl_info().devices_list()); + exit_with_stack_print(); } } -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -GGML_API GGML_CALL void ggml_sycl_set_single_device(int device_id) { - ggml_sycl_info().refresh_device(SYCL_SINGLE_GPU_MODE, device_id); - ggml_sycl_set_main_device(device_id); -} - -inline void check_allow_device_id(const int device_id) { - if (device_id >= ggml_sycl_info().device_count) { - char error_buf[256]; - snprintf( - error_buf, - sizeof(error_buf), - "%s error: device_id:%d is out of range: [0-%d]", - __func__, - device_id, - ggml_sycl_info().device_count - 1); - fprintf(stderr, "%s\n", error_buf); - assert(false); - } -} // buffer pool for sycl (legacy) struct ggml_sycl_pool_leg : public ggml_sycl_pool { @@ -2735,8 +2542,9 @@ inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, const ggml_tensor static int64_t get_row_rounding(ggml_type type, const std::array & tensor_split) { int64_t min_compute_capability = INT_MAX; int64_t max_compute_capability = INT_MIN; - for (int id = 0; id < ggml_sycl_info().device_count; ++id) { - if (tensor_split[id] < (id + 1 < ggml_sycl_info().device_count ? tensor_split[id + 1] : 1.0f)) { + for (int i = 0; i < ggml_sycl_info().device_count; ++i) { + int id = ggml_backend_sycl_get_device_id(i); + if (tensor_split[i] < (i + 1 < ggml_sycl_info().device_count ? 
tensor_split[i + 1] : 1.0f)) { if (min_compute_capability > ggml_sycl_info().devices[id].cc) { min_compute_capability = ggml_sycl_info().devices[id].cc; } @@ -3163,14 +2971,17 @@ static void ggml_sycl_set_peer_access(const int n_tokens, int main_device) { } #ifdef NDEBUG - for (int id = 0; id < ggml_sycl_info().device_count; ++id) { + for (int i = 0; i < ggml_sycl_info().device_count; ++i) { + int id = ggml_backend_sycl_get_device_id(i); SYCL_CHECK(ggml_sycl_set_device(id)); } - for (int id = 0; id < ggml_sycl_info().device_count; ++id) { + for (int i = 0; i < ggml_sycl_info().device_count; ++i) { + int id = ggml_backend_sycl_get_device_id(i); SYCL_CHECK(ggml_sycl_set_device(id)); - for (int id_other = 0; id_other < ggml_sycl_info().device_count; ++id_other) { + for (int i_other = 0; i_other < ggml_sycl_info().device_count; ++i_other) { + int id_other = ggml_backend_sycl_get_device_id(i_other); if (id == id_other) { continue; } @@ -3271,7 +3082,8 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten int used_devices = 0; queue_ptr main_stream = ctx.stream(); - for (int id = 0; id < ggml_sycl_info().device_count; ++id) { + for (int i = 0; i < ggml_sycl_info().device_count; ++i) { + int id = ggml_backend_sycl_get_device_id(i); // by default, use all rows dev[id].row_low = 0; dev[id].row_high = ne01; @@ -3281,15 +3093,15 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten if (split) { const int64_t rounding = get_row_rounding(src0->type, tensor_split); - if (id != 0) { - dev[id].row_low = ne01*tensor_split[id]; + if (i != 0) { + dev[id].row_low = ne01*tensor_split[i]; if (dev[id].row_low < ne01) { dev[id].row_low -= dev[id].row_low % rounding; } } - if (id != ggml_sycl_info().device_count - 1) { - dev[id].row_high = ne01*tensor_split[id + 1]; + if (i != ggml_sycl_info().device_count - 1) { + dev[id].row_high = ne01*tensor_split[i + 1]; if (dev[id].row_high < ne01) { dev[id].row_high -= dev[id].row_high % 
rounding; } @@ -3297,7 +3109,8 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten } } - for (int id = 0; id < ggml_sycl_info().device_count; ++id) { + for (int i = 0; i < ggml_sycl_info().device_count; ++i) { + int id = ggml_backend_sycl_get_device_id(i); if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) { continue; } @@ -3363,7 +3176,8 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten const int64_t is = split ? (src1_col_0/src1_col_stride) % GGML_SYCL_MAX_STREAMS : 0; const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride; - for (int id = 0; id < ggml_sycl_info().device_count; ++id) { + for (int i = 0; i < ggml_sycl_info().device_count; ++i) { + int id = ggml_backend_sycl_get_device_id(i); if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) { continue; } @@ -3505,7 +3319,8 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten is_max = is_max <= GGML_SYCL_MAX_STREAMS ? is_max : GGML_SYCL_MAX_STREAMS; ggml_sycl_set_device(ctx.device); - for (int id = 0; id < ggml_sycl_info().device_count; ++id) { + for (int i = 0; i < ggml_sycl_info().device_count; ++i) { + int id = ggml_backend_sycl_get_device_id(i); if (dev[id].row_low == dev[id].row_high) { continue; } @@ -3922,9 +3737,10 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor if (split) { ggml_backend_sycl_split_buffer_type_context * buft_ctx = (ggml_backend_sycl_split_buffer_type_context *) src0->buffer->buft->context; auto & tensor_split = buft_ctx->tensor_split; - for (int id = 0; id < ggml_sycl_info().device_count; ++id) { + for (int i = 0; i < ggml_sycl_info().device_count; ++i) { + int id = ggml_backend_sycl_get_device_id(i); // skip devices that are not going to do any work: - if (tensor_split[id] >= (id + 1 < ggml_sycl_info().device_count ? 
tensor_split[id + 1] : 1.0f)) { + if (tensor_split[i] >= (i + 1 < ggml_sycl_info().device_count ? tensor_split[i + 1] : 1.0f)) { continue; } @@ -4361,7 +4177,6 @@ catch (sycl::exception const &exc) { } bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tensor * tensor) { - if (!g_sycl_loaded) return false; ggml_sycl_func_t func; @@ -4502,13 +4317,19 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens return true; } +GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int index) { + GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_id\n"); + return ggml_sycl_info().get_device_id(index); +} + GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len) try { GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_gpu_list\n"); for(int i=0;i=max_len) break; - id_list[id] = id; + for (int i=0;i< ggml_sycl_info().device_count;i++){ + if (i>=max_len) break; + int id = ggml_backend_sycl_get_device_id(i); + id_list[i] = id; } return; } @@ -4868,17 +4689,15 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_id) { GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n"); - if (device_id>=ggml_sycl_info().device_count or device_id<0) { - printf("ggml_backend_sycl_buffer_type error: device_id:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n", - device_id, ggml_sycl_info().device_count-1); - GGML_ASSERT(device_iddevice; - if (device_id>=ggml_sycl_info().device_count or device_id<0) { - printf("ggml_backend_sycl_buffer_type error: device_id:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n", - device_id, ggml_sycl_info().device_count-1); - GGML_ASSERT(device_iddevice); + static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_types[GGML_SYCL_MAX_DEVICES]; static bool ggml_backend_sycl_buffer_type_initialized = false; if (!ggml_backend_sycl_buffer_type_initialized) { - for (int id = 0; id < 
ggml_sycl_info().device_count; id++) { + for (int i = 0; i < ggml_sycl_info().device_count; i++) { + int id = ggml_backend_sycl_get_device_id(i); ggml_backend_sycl_buffer_types[id] = { /* .iface = */ ggml_backend_sycl_buffer_type_interface, /* .context = */ new ggml_backend_sycl_buffer_type_context{id, GGML_SYCL_NAME + std::to_string(id), ctx->stream(id, 0)}, @@ -4913,20 +4729,20 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_conte } ggml_backend_sycl_buffer_type_initialized = true; } - return &ggml_backend_sycl_buffer_types[device_id]; + return &ggml_backend_sycl_buffer_types[ctx->device]; } // sycl split buffer type -static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array & tensor_split, int id) { +static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array & tensor_split, int i) { const int64_t nrows = ggml_nrows(tensor); const int64_t rounding = get_row_rounding(tensor->type, tensor_split); - *row_low = id == 0 ? 0 : nrows*tensor_split[id]; + *row_low = i == 0 ? 
0 : nrows*tensor_split[i]; *row_low -= *row_low % rounding; - if (id == ggml_sycl_info().device_count - 1) { + if (i == ggml_sycl_info().device_count - 1) { *row_high = nrows; } else { - *row_high = nrows*tensor_split[id + 1]; + *row_high = nrows*tensor_split[i + 1]; *row_high -= *row_high % rounding; } } @@ -4934,7 +4750,8 @@ static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tens struct ggml_backend_sycl_split_buffer_context { ~ggml_backend_sycl_split_buffer_context() try { for (ggml_tensor_extra_gpu * extra : tensor_extras) { - for (int id = 0; id < ggml_sycl_info().device_count; ++id) { + for (int i = 0; i < ggml_sycl_info().device_count; ++i) { + int id = ggml_backend_sycl_get_device_id(i); for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) { if (extra->events[id][is] != nullptr) { /* @@ -5009,9 +4826,10 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ctx->tensor_extras.push_back(extra); ctx->streams.push_back(&(dpct::get_current_device().default_queue())); - for (int id = 0; id < ggml_sycl_info().device_count; ++id) { + for (int i = 0; i < ggml_sycl_info().device_count; ++i) { + int id = ggml_backend_sycl_get_device_id(i); int64_t row_low, row_high; - get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); + get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, i); int64_t nrows_split = row_high - row_low; if (nrows_split == 0) { @@ -5088,9 +4906,10 @@ ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer, const size_t nb1 = tensor->nb[1]; ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra; - for (int id = 0; id < ggml_sycl_info().device_count; ++id) { + for (int i = 0; i < ggml_sycl_info().device_count; ++i) { + int id = ggml_backend_sycl_get_device_id(i); int64_t row_low, row_high; - get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); + get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, i); int64_t 
nrows_split = row_high - row_low; if (nrows_split == 0) { @@ -5141,9 +4960,10 @@ ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const size_t nb1 = tensor->nb[1]; ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra; - for (int id = 0; id < ggml_sycl_info().device_count; ++id) { + for (int i = 0; i < ggml_sycl_info().device_count; ++i) { + int id = ggml_backend_sycl_get_device_id(i); int64_t row_low, row_high; - get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); + get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, i); int64_t nrows_split = row_high - row_low; if (nrows_split == 0) { @@ -5224,9 +5044,9 @@ GGML_CALL static size_t ggml_backend_sycl_split_buffer_type_get_alloc_size(ggml_ const int64_t ne0 = tensor->ne[0]; - for (int id = 0; id < ggml_sycl_info().device_count; ++id) { + for (int i = 0; i < ggml_sycl_info().device_count; ++i) { int64_t row_low, row_high; - get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, id); + get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, i); int64_t nrows_split = row_high - row_low; if (nrows_split == 0) { @@ -5264,7 +5084,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const f std::lock_guard lock(mutex); GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_split_buffer_type\n"); - ggml_check_sycl(); + // FIXME: this is not thread safe static std::map, struct ggml_backend_buffer_type> buft_map; @@ -5275,12 +5095,12 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const f tensor_split_arr = ggml_sycl_info().default_tensor_split; } else { float split_sum = 0.0f; - for (int id = 0; id < ggml_sycl_info().device_count; ++id) { - tensor_split_arr[id] = split_sum; - split_sum += tensor_split[id]; + for (int i = 0; i < ggml_sycl_info().device_count; ++i) { + tensor_split_arr[i] = split_sum; + split_sum += tensor_split[i]; } - for (int id = 0; id < ggml_sycl_info().device_count; ++id) { 
- tensor_split_arr[id] /= split_sum; + for (int i = 0; i < ggml_sycl_info().device_count; ++i) { + tensor_split_arr[i] /= split_sum; } } @@ -5381,6 +5201,7 @@ GGML_CALL static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend, GGML_ASSERT(buf->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type"); const queue_ptr stream = sycl_ctx->stream(sycl_ctx->device, 0); + SYCL_CHECK(CHECK_TRY_ERROR((stream)->memcpy( (char *)tensor->data + offset, data, size).wait())); } @@ -5682,7 +5503,8 @@ extern "C" int ggml_backend_sycl_reg_devices(); int ggml_backend_sycl_reg_devices() { assert(ggml_sycl_info().device_count>0); - for (int id = 0; id < ggml_sycl_info().device_count; id++) { + for (int i = 0; i < ggml_sycl_info().device_count; i++) { + int id = ggml_backend_sycl_get_device_id(i); char name[128]; snprintf(name, sizeof(name), "%s%d", GGML_SYCL_NAME, id); ggml_backend_register(name, ggml_backend_reg_sycl_init, ggml_backend_sycl_buffer_type(id), (void *) (intptr_t) id); diff --git a/ggml/src/ggml-sycl/common.cpp b/ggml/src/ggml-sycl/common.cpp index e878f4f50f09e..c0214329998b8 100644 --- a/ggml/src/ggml-sycl/common.cpp +++ b/ggml/src/ggml-sycl/common.cpp @@ -51,3 +51,451 @@ void ggml_sycl_host_free(void* ptr) try { << ", line:" << __LINE__ << std::endl; std::exit(1); } + +static inline int get_sycl_env(const char *env_name, int default_val) { + char *user_device_string = getenv(env_name); + int user_number = default_val; + + unsigned n; + if (user_device_string != NULL && + sscanf(user_device_string, " %u", &n) == 1) { + user_number = (int)n; + } else { + user_number = default_val; + } + return user_number; +} + +static inline bool env_existed(const char *env_name) { + char *user_device_string = getenv(env_name); + return user_device_string!=NULL; +} + +static std::vector get_sycl_visible_devices() { + static std::vector device_ids; + char *devices_env = getenv("GGML_SYCL_VISIBLE_DEVICES"); + if (devices_env != nullptr) { + 
std::string devices(devices_env); + std::replace(devices.begin(), devices.end(), ',', ' '); + + std::stringstream ss(devices); + int tmp; + while (ss >> tmp) { + device_ids.push_back(tmp); + } + } + return device_ids; +} + +void print_device_detail_part1(int id, sycl::device &device, std::string device_type) { + + dpct::device_info prop; + SYCL_CHECK(CHECK_TRY_ERROR( + dpct::get_device_info(prop, device))); + + std::string version; + version += std::to_string(prop.get_major_version()); + version += "."; + version += std::to_string(prop.get_minor_version()); + + device_type = std::regex_replace(device_type, std::regex("ext_oneapi_"), ""); + std::string name = std::string(prop.get_name()); + name = std::regex_replace(name, std::regex("\\(R\\)"), ""); + name = std::regex_replace(name, std::regex("\\(TM\\)"), ""); + + auto global_mem_size = prop.get_global_mem_size()/1000000; + + fprintf(stderr, "|%2d|%19s|%4s|%39s|%14luM|\n", id, device_type.c_str(), version.c_str(), + name.c_str(), global_mem_size); +} + +void print_device_detail_part2(int id, sycl::device &device, std::string device_type) { + + dpct::device_info prop; + SYCL_CHECK(CHECK_TRY_ERROR( + dpct::get_device_info(prop, device))); + + fprintf(stderr, "|%2d|%17d|%14d|%12d|%34s|\n", id, + prop.get_max_compute_units(), + prop.get_max_work_group_size(), prop.get_max_sub_group_size(), + device.get_info().c_str()); +} + +void ggml_backend_sycl_print_sycl_devices() { + GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_print_sycl_devices\n"); + int device_count = dpct::dev_mgr::instance().device_count(); + std::map DeviceNums; + fprintf(stderr, "found %d SYCL devices:\n", device_count); + fprintf(stderr, "Part1:\n"); + fprintf(stderr, "|ID| Device Type| Ver| Name|Global mem size|\n"); + fprintf(stderr, "|--|-------------------|----|---------------------------------------|---------------|\n"); + for (int id = 0; id < device_count; ++id) { + sycl::device device = dpct::dev_mgr::instance().get_device(id); + sycl::backend 
backend = device.get_backend(); + std::string backend_type = get_device_backend_and_type(device); + int type_id=DeviceNums[backend_type]++; + std::stringstream device_type; + device_type << "[" << backend_type << ":" << std::to_string(type_id) << "]"; + print_device_detail_part1(id, device, device_type.str()); + } + + std::map DeviceNums2; + fprintf(stderr, "\nPart2:\n"); + fprintf(stderr, "|ID|Max compute units|Max work group|Max subgroup| Driver version|\n"); + fprintf(stderr, "|--|-----------------|--------------|------------|----------------------------------|\n"); + for (int id = 0; id < device_count; ++id) { + sycl::device device = dpct::dev_mgr::instance().get_device(id); + sycl::backend backend = device.get_backend(); + std::string backend_type = get_device_backend_and_type(device); + int type_id=DeviceNums2[backend_type]++; + std::stringstream device_type; + device_type << "[" << backend_type << ":" << std::to_string(type_id) << "]"; + print_device_detail_part2(id, device, device_type.str()); + } +} + +static ggml_sycl_device_info ggml_sycl_init() try { + static bool initialized = false; + + if (!initialized) { + fprintf(stderr, "[SYCL] call ggml_init_sycl\n"); + + g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0); + fprintf(stderr, "%s: GGML_SYCL_DEBUG: %d\n", __func__, + g_ggml_sycl_debug); + +#if defined(GGML_SYCL_F16) + fprintf(stderr, "%s: GGML_SYCL_F16: yes\n", __func__); +#else + fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__); +#endif + +#if defined(GGML_SYCL_FORCE_MMQ) + fprintf(stderr, "%s: GGML_SYCL_FORCE_MMQ: yes\n", __func__); +#else + fprintf(stderr, "%s: GGML_SYCL_FORCE_MMQ: no\n", __func__); +#endif + +#if defined(SYCL_USE_XMX) + fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__); +#else + fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__); +#endif + + if (CHECK_TRY_ERROR(g_all_sycl_device_count = + dpct::dev_mgr::instance().device_count()) != + 0) { + initialized = true; + return; + } + GGML_ASSERT(g_all_sycl_device_count <= 
GGML_SYCL_MAX_DEVICES); + ggml_backend_sycl_print_sycl_devices(); + initialized = true; + } + + static ggml_sycl_device_info info = {}; + info.refresh_device(); + + if (info.device_count == 0) { + fprintf(stderr, "%s: failed to initialize " GGML_SYCL_NAME ": no available device found\n", + __func__); + return info; + } + GGML_ASSERT(info.device_count <= GGML_SYCL_MAX_DEVICES); + + return info; +} catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +ggml_sycl_device_info &ggml_sycl_info() { + static ggml_sycl_device_info info = ggml_sycl_init(); + return info; +} + +//--sycl_device_mgr-- + +sycl_device_mgr::sycl_device_mgr( + ggml_sycl_backend_device_filter device_filter) { + switch (device_filter) { + case SYCL_DEVICES_TOP_LEVEL_ZERO: + detect_sycl_gpu_list_with_max_cu(); + create_context_for_group_gpus(); + break; + case SYCL_ALL_DEVICES: + detect_all_sycl_device_list(); + create_context_for_devices(); + break; + case SYCL_VISIBLE_DEVICES: + detect_sycl_visible_device_list(); + create_context_for_devices(); + break; + default: + std::cerr << "sycl_device_mgr: Invalid device_filter " << device_filter + << std::endl; + } + init_allow_devices(); +} + +/* +Bind all gpus in same host with same context, for better performance in +device-to-device copy in the future. 
+*/ +void sycl_device_mgr::create_context_for_group_gpus() { + sycl::context ctx = sycl::context(devices); + assert(device_ids.size() > 0); + first_queue = _create_queue_ptr(devices[0]); + sycl::context ctx0 = first_queue->get_context(); + for (int i = 0; i < device_ids.size(); i++) { + ctxs.push_back(ctx0); + } +} + +sycl::queue *sycl_device_mgr::_create_queue_ptr(sycl::device device) { + auto q = dpct::get_current_device().create_queue(device); + return q; + // _queues.push_back(q); + // return & _queues.back(); +} + +sycl::queue *sycl_device_mgr::create_queue_for_device(sycl::device &device) { + dpct::select_device(dpct::dev_mgr::instance().get_device_id(device)); + auto qptr = _create_queue_ptr(device); + return qptr; +} + +sycl::queue *sycl_device_mgr::create_queue_for_device_id(int device_id) { + int i = get_device_index(device_id); + sycl::device device = dpct::dev_mgr::instance().get_device(device_id); + return create_queue_for_device(device); +} + +int sycl_device_mgr::get_device_index(int device_id) { + for (int i = 0; i < device_ids.size(); i++) { + if (device_ids[i] == device_id) + return i; + } + return -1; +} + +void sycl_device_mgr::create_context_for_devices() { + for (int i = 0; i < device_ids.size(); i++) { + sycl::context ctx = sycl::context(devices[i]); + ctxs.push_back(ctx); + } +} + +void sycl_device_mgr::init_allow_devices() { + device_list = ""; + for (size_t i = 0; i < device_ids.size(); ++i) { + device_list += std::to_string(device_ids[i]); + device_list += ","; + } + if (device_list.length() > 1) { + device_list.pop_back(); + } +} + +bool sycl_device_mgr::is_allowed_device(int device_id) { + return std::find(device_ids.begin(), device_ids.end(), device_id) != + device_ids.end(); +} + +void sycl_device_mgr::detect_all_sycl_device_list() try { + int device_count = dpct::dev_mgr::instance().device_count(); + + for (int id = 0; id < device_count; id++) { + sycl::device device = dpct::dev_mgr::instance().get_device(id); + 
device_ids.push_back(id); + devices.push_back(device); + dpct::device_info prop; + dpct::get_device_info(prop, device); + work_group_sizes.push_back(prop.get_max_work_group_size()); + max_compute_units.push_back(prop.get_max_compute_units()); + } + return; +} catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +void sycl_device_mgr::detect_sycl_visible_device_list() try { + std::vector sycl_devices = get_sycl_visible_devices(); + int device_count = dpct::dev_mgr::instance().device_count(); + + for (int i = 0; i < sycl_devices.size(); i++) { + int id = sycl_devices[i]; + if (id >= device_count) { + std::cerr << __func__ << ": invalid device_id:" << id + << " from GGML_SYCL_VISIBLE_DEVICES=" + << getenv("GGML_SYCL_VISIBLE_DEVICES") + << ", available IDs: "; + if (device_count > 1) { + std::cerr << "[0, " << device_count - 1 << "]"; + } else if (device_count == 1) { + std::cerr << "[0]"; + } else { + std::cerr << "[]"; + } + std::cerr << std::endl; + } + sycl::device device = dpct::dev_mgr::instance().get_device(id); + device_ids.push_back(id); + devices.push_back(device); + dpct::device_info prop; + dpct::get_device_info(prop, device); + work_group_sizes.push_back(prop.get_max_work_group_size()); + max_compute_units.push_back(prop.get_max_compute_units()); + } + return; +} catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +/* +Use all GPUs with same top max compute units +*/ +void sycl_device_mgr::detect_sycl_gpu_list_with_max_cu() try { + int device_count = dpct::dev_mgr::instance().device_count(); + int local_max_compute_units = 0; + for (int id = 0; id < device_count; id++) { + sycl::device device = dpct::dev_mgr::instance().get_device(id); + if (!device.is_gpu()) + continue; + dpct::device_info prop; + dpct::get_device_info(prop, 
device); + if (local_max_compute_units < prop.get_max_compute_units()) + local_max_compute_units = prop.get_max_compute_units(); + } + + for (int id = 0; id < device_count; id++) { + sycl::device device = dpct::dev_mgr::instance().get_device(id); + if (!device.is_gpu()) + continue; + dpct::device_info prop; + dpct::get_device_info(prop, device); + if (local_max_compute_units == prop.get_max_compute_units() && + is_ext_oneapi_device(device)) { + device_ids.push_back(id); + devices.push_back(device); + work_group_sizes.push_back(prop.get_max_work_group_size()); + max_compute_units.push_back(prop.get_max_compute_units()); + } + } + return; +} catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +int sycl_device_mgr::get_device_count() { return (int)device_ids.size(); } + +bool sycl_device_mgr::is_ext_oneapi_device(const sycl::device &dev) { + sycl::backend dev_backend = dev.get_backend(); + if (dev_backend == sycl::backend::ext_oneapi_level_zero || + dev_backend == sycl::backend::ext_oneapi_cuda || + dev_backend == sycl::backend::ext_oneapi_hip) + return true; + return false; +} +//--sycl_device_mgr-- + +//--ggml_sycl_device_info-- +void ggml_sycl_device_info::print_gpu_device_list() { + GGML_ASSERT(device_mgr); + + char *hint = NULL; + if (oneapi_device_selector_existed && sycl_visible_devices_existed) { + hint = "detect %d SYCL devices:[%s] by ONEAPI_DEVICE_SELECTOR=%s and " + "GGML_SYCL_VISIBLE_DEVICES=%s\n"; + fprintf(stderr, hint, device_mgr->get_device_count(), devices_list(), + getenv("ONEAPI_DEVICE_SELECTOR"), + getenv("GGML_SYCL_VISIBLE_DEVICES")); + } else if (oneapi_device_selector_existed) { + hint = "detect %d SYCL devices:[%s] by ONEAPI_DEVICE_SELECTOR=%s\n"; + fprintf(stderr, hint, device_mgr->get_device_count(), devices_list(), + getenv("ONEAPI_DEVICE_SELECTOR")); + } else if (sycl_visible_devices_existed) { + hint = "detect %d SYCL 
devices:[%s] by GGML_SYCL_VISIBLE_DEVICES=%s\n"; + fprintf(stderr, hint, device_mgr->get_device_count(), devices_list(), + getenv("GGML_SYCL_VISIBLE_DEVICES")); + } else { + hint = "detect %d SYCL level-zero GPUs:[%s] with top Max compute " + "units:%d, to use any SYCL devices, set/export " + "GGML_SYCL_VISIBLE_DEVICES or ONEAPI_DEVICE_SELECTOR\n"; + fprintf(stderr, hint, device_mgr->get_device_count(), devices_list(), + device_mgr->max_compute_units[0]); + } +} + +int ggml_sycl_device_info::work_group_size(int device_id) { + GGML_ASSERT(device_mgr); + return device_mgr->work_group_sizes[device_id]; +} + +void ggml_sycl_device_info::refresh_device() { + oneapi_device_selector_existed = env_existed("ONEAPI_DEVICE_SELECTOR"); + sycl_visible_devices_existed = env_existed("GGML_SYCL_VISIBLE_DEVICES"); + if (device_mgr) + delete device_mgr; + + if (sycl_visible_devices_existed) { + device_mgr = new sycl_device_mgr(SYCL_VISIBLE_DEVICES); + } else if (oneapi_device_selector_existed) { + device_mgr = new sycl_device_mgr(SYCL_ALL_DEVICES); + } else { + device_mgr = new sycl_device_mgr(SYCL_DEVICES_TOP_LEVEL_ZERO); + } + + device_count = device_mgr->get_device_count(); + + int64_t total_vram = 0; + + for (int i = 0; i < device_count; ++i) { + int id = get_device_id(i); + devices[id].vmm = 0; + dpct::device_info prop; + SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info( + prop, dpct::dev_mgr::instance().get_device(id)))); + + default_tensor_split[i] = + total_vram; // contiguous data, so use device index + total_vram += prop.get_global_mem_size(); + + devices[id].cc = + 100 * prop.get_major_version() + 10 * prop.get_minor_version(); + } + + for (int i = 0; i < device_count; ++i) { + default_tensor_split[i] /= + total_vram; // contiguous data, so use device index + } + + print_gpu_device_list(); +} + +bool ggml_sycl_device_info::is_allowed_device(int device_id) { + return device_mgr->is_allowed_device(device_id); +} + +const char *ggml_sycl_device_info::devices_list() { + return 
device_mgr->device_list.c_str(); +} + +int ggml_sycl_device_info::get_device_id(int device_index) { + if (device_index < device_mgr->device_ids.size()) { + return device_mgr->device_ids.at(device_index); + } else { + std::cerr << __func__ << ":SYCL device:" << device_index + << " is out of range:[" << devices_list() << "]" << std::endl; + std::exit(1); + } +} + +//--ggml_sycl_device_info-- diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 518b4e4c117bf..81652047d9f0b 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -15,6 +15,7 @@ #include #include +#include #include "dpct/helper.hpp" #include "ggml-sycl.h" @@ -47,11 +48,6 @@ static int g_ggml_sycl_debug = 0; } \ }() -// #define DEBUG_SYCL_MALLOC - -static int g_work_group_size = -1; -// typedef sycl::half ggml_fp16_t; - #define __SYCL_ARCH__ DPCT_COMPATIBILITY_TEMP #define VER_4VEC 610 // todo for hardward optimize. #define VER_GEN9 700 // todo for hardward optimize. @@ -90,8 +86,9 @@ enum ggml_sycl_backend_gpu_mode { }; enum ggml_sycl_backend_device_filter { - SYCL_DEVICE_FILTER_ALL = 0, - SYCL_DEVICE_FILTER_TOP_LEVEL_ZERO + SYCL_ALL_DEVICES = 0, + SYCL_DEVICES_TOP_LEVEL_ZERO, + SYCL_VISIBLE_DEVICES }; static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size"); @@ -112,6 +109,8 @@ static void crash() { GGML_ASSERT(!"SYCL error"); } +#define SYCL_RETURN_ERROR 1 + #define SYCL_CHECK(err) \ do { \ auto err_ = (err); \ @@ -124,6 +123,7 @@ static void crash() { "Meet error in this line code!"); \ } while (0) + #if DPCT_COMPAT_RT_VERSION >= 11100 #define GGML_SYCL_ASSUME(x) __builtin_assume(x) #else @@ -152,6 +152,8 @@ static void* g_scratch_buffer = nullptr; static size_t g_scratch_size = 0; // disabled by default static size_t g_scratch_offset = 0; +int get_current_device_id(); + [[noreturn]] static inline void bad_arch(const sycl::stream& stream_ct1) { stream_ct1 << "ERROR: ggml-sycl was compiled without support for the " "current GPU 
architecture.\n"; @@ -161,8 +163,6 @@ static size_t g_scratch_offset = 0; (void)bad_arch; // suppress unused function warning } -int get_current_device_id(); - inline dpct::err0 ggml_sycl_set_device(const int device_id) try { int current_device_id; @@ -189,210 +189,53 @@ class sycl_device_mgr { std::vector max_compute_units; std::vector work_group_sizes; sycl::queue *first_queue; - std::vector queues; + std::vector _queues; std::vector ctxs; std::string device_list = ""; - sycl_device_mgr(ggml_sycl_backend_device_filter device_filter) { - if (device_filter == SYCL_DEVICE_FILTER_TOP_LEVEL_ZERO) { - detect_sycl_gpu_list_with_max_cu(); - create_context_for_group_gpus(); - } else { - detect_all_sycl_device_list(); - create_context_queue_for_devices(); - } - get_allow_devices(); - } - - /* - Bind all gpus in same host with same context, for better performance in - device-to-device copy in the future. - */ - void create_context_for_group_gpus() { - sycl::context ctx = sycl::context(devices); - assert(device_ids.size() > 0); - first_queue = dpct::get_current_device().create_queue(ctx, devices[0]); - sycl::context ctx0 = first_queue->get_context(); - for (int i = 0; i < device_ids.size(); i++) { - ctxs.push_back(ctx0); - dpct::select_device(device_ids[i]); - queues.push_back( - dpct::get_current_device().create_queue(ctx0, devices[i])); - } - } - - void create_context_queue_for_devices() { - for (int i = 0; i < device_ids.size(); i++) { - sycl::context ctx = sycl::context(devices[i]); - ctxs.push_back(ctx); - dpct::select_device(device_ids[i]); - queues.push_back( - dpct::get_current_device().create_queue(ctx, devices[i])); - } - } - - void get_allow_devices() { - device_list = ""; - for (size_t i = 0; i < device_ids.size(); ++i) { - device_list += std::to_string(device_ids[i]); - device_list += ","; - } - if (device_list.length() > 1) { - device_list.pop_back(); - } - } - - bool is_allowed_device(int device_id) { - return std::find(device_ids.begin(), device_ids.end(), 
device_id) != device_ids.end(); - } - - void detect_all_sycl_device_list() try { - int device_count = dpct::dev_mgr::instance().device_count(); - - for (int id = 0; id < device_count; id++) { - sycl::device device = dpct::dev_mgr::instance().get_device(id); - device_ids.push_back(id); - devices.push_back(device); - dpct::device_info prop; - dpct::get_device_info(prop, device); - work_group_sizes.push_back(prop.get_max_work_group_size()); - max_compute_units.push_back(prop.get_max_compute_units()); - } - return; - } catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); - } - - /* - Use all GPUs with same top max compute units - */ - void detect_sycl_gpu_list_with_max_cu() try { - int device_count = dpct::dev_mgr::instance().device_count(); - int local_max_compute_units = 0; - for (int id = 0; id < device_count; id++) { - sycl::device device = dpct::dev_mgr::instance().get_device(id); - if (!device.is_gpu()) - continue; - dpct::device_info prop; - dpct::get_device_info(prop, device); - if (local_max_compute_units < prop.get_max_compute_units()) - local_max_compute_units = prop.get_max_compute_units(); - } - - for (int id = 0; id < device_count; id++) { - sycl::device device = dpct::dev_mgr::instance().get_device(id); - if (!device.is_gpu()) - continue; - dpct::device_info prop; - dpct::get_device_info(prop, device); - if (local_max_compute_units == prop.get_max_compute_units() && - is_ext_oneapi_device(device)) { - device_ids.push_back(id); - devices.push_back(device); - work_group_sizes.push_back(prop.get_max_work_group_size()); - max_compute_units.push_back(prop.get_max_compute_units()); - } - } - return; - } catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); - } - - int get_device_count() { return (int)device_ids.size(); } - - bool 
is_ext_oneapi_device(const sycl::device &dev) { - sycl::backend dev_backend = dev.get_backend(); - if (dev_backend == sycl::backend::ext_oneapi_level_zero || - dev_backend == sycl::backend::ext_oneapi_cuda || - dev_backend == sycl::backend::ext_oneapi_hip) - return true; - return false; - } + sycl_device_mgr(ggml_sycl_backend_device_filter device_filter); + + sycl::queue *_create_queue_ptr(sycl::device device); //internal API to hide dpct API. + void create_context_for_group_gpus(); + sycl::queue *create_queue_for_device(sycl::device &device); + sycl::queue *create_queue_for_device_id(int device_id); + int get_device_index(int device_id); + void create_context_for_devices(); + void init_allow_devices(); + bool is_allowed_device(int device_id); + void detect_all_sycl_device_list(); + void detect_sycl_visible_device_list(); + void detect_sycl_gpu_list_with_max_cu(); + int get_device_count(); + bool is_ext_oneapi_device(const sycl::device &dev); }; + struct ggml_sycl_device_info { int device_count; - int main_gpu_id = -1; - ggml_sycl_backend_gpu_mode use_gpu_mode = SYCL_MUL_GPU_MODE; + bool oneapi_device_selector_existed = false; + bool sycl_visible_devices_existed = false; + struct sycl_device_info { - int cc; // compute capability + int cc; // compute capability // int nsm; // number of streaming multiprocessors // size_t smpb; // max. 
shared memory per block - bool vmm; // virtual memory support - size_t total_vram; + bool vmm; // virtual memory support + size_t total_vram; }; sycl_device_info devices[GGML_SYCL_MAX_DEVICES] = {}; std::array default_tensor_split = {}; - sycl_device_mgr *local_sycl_device_mgr = NULL; - - void print_gpu_device_list() { - GGML_ASSERT(local_sycl_device_mgr); - - char *hint = NULL; - if (use_gpu_mode == SYCL_MUL_GPU_MODE) { - hint = "detect %d SYCL GPUs: [%s] with top Max compute units:%d\n"; - fprintf(stderr, hint, local_sycl_device_mgr->get_device_count(), - local_sycl_device_mgr->device_list.c_str(), - local_sycl_device_mgr->max_compute_units[main_gpu_id]); - } else { - hint = "use main device [%d] with Max compute units:%d\n"; - fprintf(stderr, hint, main_gpu_id, - local_sycl_device_mgr->max_compute_units[main_gpu_id]); - } - } - - int work_group_size(int device_id) { - GGML_ASSERT(local_sycl_device_mgr); - return local_sycl_device_mgr->work_group_sizes[device_id]; - } - - void refresh_device(ggml_sycl_backend_gpu_mode gpu_model, - int p_main_gpu_id = 0) { - main_gpu_id = p_main_gpu_id; - use_gpu_mode = gpu_model; - if (!local_sycl_device_mgr) - delete local_sycl_device_mgr; - - if (use_gpu_mode == SYCL_MUL_GPU_MODE) { - local_sycl_device_mgr = - new sycl_device_mgr(SYCL_DEVICE_FILTER_TOP_LEVEL_ZERO); - } else { - GGML_ASSERT(main_gpu_id >= 0); - local_sycl_device_mgr = new sycl_device_mgr(SYCL_DEVICE_FILTER_ALL); - } - - device_count = local_sycl_device_mgr->get_device_count(); - - int64_t total_vram = 0; - - for (int i = 0; i < device_count; ++i) { - devices[i].vmm = 0; - dpct::device_info prop; - SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info( - prop, dpct::dev_mgr::instance().get_device(i)))); - - default_tensor_split[i] = total_vram; - total_vram += prop.get_global_mem_size(); - - devices[i].cc = - 100 * prop.get_major_version() + 10 * prop.get_minor_version(); - } - - for (int id = 0; id < device_count; ++id) { - default_tensor_split[id] /= total_vram; - } 
- - g_work_group_size = work_group_size(main_gpu_id); - print_gpu_device_list(); - } + sycl_device_mgr *device_mgr = NULL; + void print_gpu_device_list(); + int work_group_size(int device_id); + void refresh_device(); + bool is_allowed_device(int device_id); + const char* devices_list(); + int get_device_id(int device_index); }; struct ggml_sycl_pool { @@ -460,15 +303,17 @@ struct ggml_backend_sycl_context { queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } }; - explicit ggml_backend_sycl_context(struct ggml_sycl_device_info &sycl_device_info, int device) : - device(device), + explicit ggml_backend_sycl_context(struct ggml_sycl_device_info &sycl_device_info, int device_id) : + device(device_id), name(GGML_SYCL_NAME + std::to_string(device)) { - qptrs[device][0] = sycl_device_info.local_sycl_device_mgr->queues[device]; + for (int i=0;icreate_queue_for_device_id(device_id); + } } queue_ptr stream(int device, int stream) { - assert(qptrs[device][0] != nullptr); - return qptrs[device][0]; + assert(qptrs[device][stream] != nullptr); + return qptrs[device][stream]; } queue_ptr stream() { @@ -492,6 +337,20 @@ struct ggml_backend_sycl_context { } }; +static inline void exit_with_stack_print() { + SYCL_CHECK(SYCL_RETURN_ERROR); +} + + +static inline int get_sycl_env(const char *env_name, int default_val); +static inline bool env_existed(const char *env_name); +void* ggml_sycl_host_malloc(size_t size); +void ggml_sycl_host_free(void* ptr); +static std::vector get_sycl_visible_devices(); +void ggml_backend_sycl_print_sycl_devices(); +static ggml_sycl_device_info ggml_sycl_init(); +ggml_sycl_device_info &ggml_sycl_info(); + // common host functions static inline int get_work_group_size(const sycl::device& device) { diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp index 1ff297218c685..a7c020dcafd22 100644 --- a/ggml/src/ggml-sycl/dpct/helper.hpp +++ b/ggml/src/ggml-sycl/dpct/helper.hpp @@ -588,7 +588,7 @@ namespace 
dpct out = prop; } - /// dpct device extension + /// dpct device extension class device_ext : public sycl::device { typedef std::mutex mutex_type; @@ -687,119 +687,131 @@ namespace dpct init_queues(); } - sycl::queue &in_order_queue() { return _q_in_order; } + sycl::queue &in_order_queue() { return *_q_in_order; } - sycl::queue &out_of_order_queue() { return _q_out_of_order; } + sycl::queue &out_of_order_queue() { return *_q_out_of_order; } sycl::queue &default_queue() { return in_order_queue(); } void queues_wait_and_throw() { std::unique_lock lock(m_mutex); + std::vector> current_queues( + _queues); lock.unlock(); - for (auto &q : _queues) { - q.wait_and_throw(); + for (const auto &q : current_queues) + { + q->wait_and_throw(); } // Guard the destruct of current_queues to make sure the ref count is // safe. lock.lock(); } - sycl::queue create_queue(bool enable_exception_handler = false) { + sycl::queue *create_queue(bool enable_exception_handler = false) { return create_in_order_queue(enable_exception_handler); } - sycl::queue create_queue(sycl::device device, + sycl::queue *create_queue(sycl::device device, bool enable_exception_handler = false) { return create_in_order_queue(device, enable_exception_handler); } - sycl::queue create_in_order_queue(bool enable_exception_handler = false) { + sycl::queue *create_in_order_queue(bool enable_exception_handler = false) { std::lock_guard lock(m_mutex); return create_queue_impl(enable_exception_handler, sycl::property::queue::in_order()); } - sycl::queue create_in_order_queue(sycl::device device, + sycl::queue *create_in_order_queue(sycl::device device, bool enable_exception_handler = false) { std::lock_guard lock(m_mutex); return create_queue_impl(device, enable_exception_handler, sycl::property::queue::in_order()); } - sycl::queue create_out_of_order_queue( + sycl::queue *create_out_of_order_queue( bool enable_exception_handler = false) { std::lock_guard lock(m_mutex); return 
create_queue_impl(enable_exception_handler); } - void destroy_queue(sycl::queue queue) { + void destroy_queue(sycl::queue *&queue) { std::lock_guard lock(m_mutex); - _queues.clear(); + _queues.erase(std::remove_if(_queues.begin(), _queues.end(), + [=](const std::shared_ptr &q) -> bool + { + return q.get() == queue; + }), + _queues.end()); + queue = nullptr; } - void set_saved_queue(sycl::queue q) { + void set_saved_queue(sycl::queue *q) { std::lock_guard lock(m_mutex); _saved_queue = q; } - sycl::queue get_saved_queue() const { + sycl::queue *get_saved_queue() const { std::lock_guard lock(m_mutex); return _saved_queue; } private: - void clear_queues() { _queues.clear(); } + void clear_queues() { + _queues.clear(); + _q_in_order = _q_out_of_order = _saved_queue = nullptr; + } void init_queues() { _q_in_order = create_queue_impl(true, sycl::property::queue::in_order()); _q_out_of_order = create_queue_impl(true); - _saved_queue = default_queue(); + _saved_queue = &default_queue(); } /// Caller should acquire resource \p m_mutex before calling this /// function. template - sycl::queue create_queue_impl(bool enable_exception_handler, + sycl::queue *create_queue_impl(bool enable_exception_handler, Properties... properties) { sycl::async_handler eh = {}; if (enable_exception_handler) { eh = exception_handler; } - auto q = sycl::queue(*this, eh, - sycl::property_list( + _queues.push_back(std::make_shared( + *this, eh, + sycl::property_list( #ifdef DPCT_PROFILING_ENABLED - sycl::property::queue::enable_profiling(), + sycl::property::queue::enable_profiling(), #endif - properties...)); - _queues.push_back(q); + properties...))); - return _queues.back(); + return _queues.back().get(); } template - sycl::queue create_queue_impl(sycl::device device, + sycl::queue *create_queue_impl(sycl::device device, bool enable_exception_handler, Properties... 
properties) { sycl::async_handler eh = {}; if (enable_exception_handler) { eh = exception_handler; } - _queues.push_back( - sycl::queue(device, eh, + _queues.push_back(std::make_shared( + device, eh, sycl::property_list( #ifdef DPCT_PROFILING_ENABLED sycl::property::queue::enable_profiling(), #endif properties...))); - return _queues.back(); + return _queues.back().get(); } void get_version(int &major, int &minor) const { detail::get_version(*this, major, minor); } - sycl::queue _q_in_order, _q_out_of_order; - sycl::queue _saved_queue; - std::vector _queues; + sycl::queue *_q_in_order, *_q_out_of_order; + sycl::queue *_saved_queue; + std::vector> _queues; mutable mutex_type m_mutex; }; @@ -855,15 +867,15 @@ namespace dpct unsigned int get_device_id(const sycl::device &dev) { unsigned int id = 0; - for (auto dev_item : _devs) + for (auto &dev_item : _devs) { if (*dev_item == dev) { - break; + return id; } id++; } - return id; + return -1; } template diff --git a/src/llama.cpp b/src/llama.cpp index 98a386e72c38d..1a7d24ccb3ece 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2704,7 +2704,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_ #elif defined(GGML_USE_VULKAN) buft = ggml_backend_vk_buffer_type(gpu); #elif defined(GGML_USE_SYCL) - buft = ggml_backend_sycl_buffer_type(gpu); + int gpu_id = ggml_backend_sycl_get_device_id(gpu); + buft = ggml_backend_sycl_buffer_type(gpu_id); #elif defined(GGML_USE_KOMPUTE) buft = ggml_backend_kompute_buffer_type(gpu); if (buft == nullptr) { @@ -17619,7 +17620,6 @@ struct llama_context * llama_new_context_with_model( #elif defined(GGML_USE_SYCL) // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) { - ggml_sycl_set_single_device(model->main_gpu); ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu); if (backend == nullptr) { 
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu); @@ -17630,11 +17630,10 @@ struct llama_context * llama_new_context_with_model( } else { // LLAMA_SPLIT_LAYER requires a backend for each GPU for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) { - ggml_backend_t backend = ggml_backend_sycl_init(i); + int id = ggml_backend_sycl_get_device_id(i); + ggml_backend_t backend = ggml_backend_sycl_init(id); if (backend == nullptr) { - int id_list[GGML_SYCL_MAX_DEVICES]; - ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES); - LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i); + LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d for No.%d backend\n", __func__, id, i); llama_free(ctx); return nullptr; }