Text2ImagePipeline heterogenous compile #1768

Merged
Changes from 14 commits
Commits
29 commits
0c34559
Text2ImagePipeline: Add a heterogenous variant of compile() API
RyanMetcalfeInt8 Feb 18, 2025
1455f2d
Text2ImagePipeline: Move config update within StableDiffusionPipeline…
RyanMetcalfeInt8 Feb 18, 2025
551748d
StableDiffusionXLPipeline: Support the explicit construct, reshape, c…
RyanMetcalfeInt8 Feb 19, 2025
dee9ef2
Merge branch 'master' into text_gen_hetero_compile
RyanMetcalfeInt8 Feb 19, 2025
ec77fbe
Text2Image: Add python bindings for hetero compile API, update hetero…
RyanMetcalfeInt8 Feb 19, 2025
abbb114
StableDiffusionXLPipeline: Simplify device-centric constructor
RyanMetcalfeInt8 Feb 20, 2025
7d1a25c
heterogenous_stable_diffusion sample: Add comment about usage of devi…
RyanMetcalfeInt8 Feb 24, 2025
ac6e061
text2image_pipeline: Add templated overload of hetero compile API
RyanMetcalfeInt8 Feb 24, 2025
0a638b0
Revert "StableDiffusionXLPipeline: Simplify device-centric constructor"
RyanMetcalfeInt8 Feb 24, 2025
a74581b
stable_diffusion_xl_pipeline.hpp: Fix indentation
RyanMetcalfeInt8 Feb 24, 2025
93212de
stable_diffusion_xl_pipeline: Force m_force_zeros_for_empty_prompt to…
RyanMetcalfeInt8 Feb 24, 2025
165f038
diffusion_pipeline: Move default implementation of compile() to diffu…
RyanMetcalfeInt8 Feb 24, 2025
03efd76
Merge branch 'master' into text_gen_hetero_compile
RyanMetcalfeInt8 Feb 24, 2025
ff61cb6
Merge branch 'master' into text_gen_hetero_compile
RyanMetcalfeInt8 Feb 24, 2025
c7e68ff
Update src/cpp/src/image_generation/stable_diffusion_xl_pipeline.hpp
RyanMetcalfeInt8 Feb 25, 2025
6454039
Update src/python/py_image_generation_pipelines.cpp
RyanMetcalfeInt8 Feb 25, 2025
29b690d
Update src/cpp/include/openvino/genai/image_generation/text2image_pip…
RyanMetcalfeInt8 Feb 25, 2025
ce601fd
py_image_generation_pipelines: vae_decode_device -> vae_device
RyanMetcalfeInt8 Feb 25, 2025
625f5e3
py_image_generation_pipelines: vae_decode_device -> vae_device
RyanMetcalfeInt8 Feb 25, 2025
622ed7a
Merge branch 'master' into text_gen_hetero_compile
RyanMetcalfeInt8 Feb 25, 2025
97d986c
stable_diffusion_xl / clip: Properly handle mismatch in batch size be…
RyanMetcalfeInt8 Feb 25, 2025
195a7d8
Merge branch 'master' into text_gen_hetero_compile
RyanMetcalfeInt8 Feb 25, 2025
961dcfe
Merge branch 'master' into text_gen_hetero_compile
RyanMetcalfeInt8 Feb 25, 2025
6980f45
image generation: vae_decode_device -> vae_device
RyanMetcalfeInt8 Feb 25, 2025
cb0dc1f
heterogeneous_stable_diffusion.py: remove ;'s
RyanMetcalfeInt8 Feb 26, 2025
daa101e
clip_text_model: Get partial shape from compiled_model, not runtime_m…
RyanMetcalfeInt8 Feb 26, 2025
facb56e
clip_text_model: Add detail to asserts
RyanMetcalfeInt8 Feb 26, 2025
4d8c19c
Merge branch 'master' into text_gen_hetero_compile
RyanMetcalfeInt8 Feb 26, 2025
7128ec1
update py_openvino_genai.pyi
RyanMetcalfeInt8 Feb 26, 2025
72 changes: 17 additions & 55 deletions samples/cpp/image_generation/heterogeneous_stable_diffusion.cpp
@@ -18,7 +18,6 @@ int32_t main(int32_t argc, char* argv[]) try {

const int width = 512;
const int height = 512;
const float guidance_scale = 7.5f;
const int number_of_images_to_generate = 1;
const int number_of_inference_steps_per_image = 20;

@@ -37,73 +36,36 @@ int32_t main(int32_t argc, char* argv[]) try {
std::string ov_cache_dir = "./cache";

//
// Step 1: Prepare each Text2Image subcomponent (scheduler, text encoder, unet, vae) separately.
// Step 1: Create the initial Text2ImagePipeline, given the model path
//
ov::genai::Text2ImagePipeline pipe(models_path);

// Create the scheduler from the details listed in the json.
auto scheduler = ov::genai::Scheduler::from_config(root_dir / "scheduler/scheduler_config.json");

// Note that we could have created the scheduler by specifying specific type (for example EULER_DISCRETE), like
// this: auto scheduler = ov::genai::Scheduler::from_config(root_dir / "scheduler/scheduler_config.json",
// ov::genai::Scheduler::Type::EULER_DISCRETE);
// This can be useful when a particular type of Scheduler is not yet supported natively by OpenVINO GenAI.
// (even though we are actively working to support most commonly used ones)

// Create unet object
auto unet = ov::genai::UNet2DConditionModel(root_dir / "unet");

// Set batch size based on classifier free guidance condition.
int unet_batch_size = unet.do_classifier_free_guidance(guidance_scale) ? 2 : 1;

// Create the text encoder.
auto text_encoder = ov::genai::CLIPTextModel(root_dir / "text_encoder");

// In case of NPU, we need to reshape the model to have static shapes
if (text_encoder_device == "NPU") {
text_encoder.reshape(unet_batch_size);
}

// Compile text encoder for the specified device
text_encoder.compile(text_encoder_device, ov::cache_dir(ov_cache_dir));

// In case of NPU, we need to reshape the model to have static shapes
if (unet_device == "NPU") {
// The max_position_embeddings config from text encoder will be used as a parameter to unet reshape.
int max_position_embeddings = text_encoder.get_config().max_position_embeddings;

unet.reshape(unet_batch_size, height, width, max_position_embeddings);
}

// Compile unet for specified device
unet.compile(unet_device, ov::cache_dir(ov_cache_dir));
//
// Step 2: Reshape the pipeline given the number of images, width, height, and guidance scale.
//
pipe.reshape(1, width, height, pipe.get_generation_config().guidance_scale);

// Create the vae decoder.
auto vae = ov::genai::AutoencoderKL(root_dir / "vae_decoder");
//
// Step 3: Compile the pipeline with the specified devices and properties (such as the cache directory)
//
ov::AnyMap properties = {ov::cache_dir(ov_cache_dir)};

// In case of NPU, we need to reshape the model to have static shapes
if (vae_decoder_device == "NPU") {
// We set batch-size to '1' here, as we're configuring our pipeline to return 1 image per 'generate' call.
vae.reshape(1, height, width);
}
// Note that if there are device-specific properties that are needed, they can
// be added using ov::device::properties groups, like this:
//ov::AnyMap properties = {ov::device::properties("CPU", ov::cache_dir("cpu_cache")),
// ov::device::properties("GPU", ov::cache_dir("gpu_cache")),
// ov::device::properties("NPU", ov::cache_dir("npu_cache"))};

// Compile vae decoder for the specified device
vae.compile(vae_decoder_device, ov::cache_dir(ov_cache_dir));
pipe.compile(text_encoder_device, unet_device, vae_decoder_device, properties);

//
// Step 2: Create a Text2ImagePipeline from the individual subcomponents
//
auto pipe = ov::genai::Text2ImagePipeline::stable_diffusion(scheduler, text_encoder, unet, vae);

//
// Step 3: Use the Text2ImagePipeline to generate 'number_of_images_to_generate' images.
// Step 4: Use the Text2ImagePipeline to generate 'number_of_images_to_generate' images.
//
for (int imagei = 0; imagei < number_of_images_to_generate; imagei++) {
std::cout << "Generating image " << imagei << std::endl;

ov::Tensor image = pipe.generate(prompt,
ov::genai::width(width),
ov::genai::height(height),
ov::genai::guidance_scale(guidance_scale),
ov::genai::num_inference_steps(number_of_inference_steps_per_image),
ov::genai::callback(progress_bar));

74 changes: 21 additions & 53 deletions samples/python/image_generation/heterogeneous_stable_diffusion.py
@@ -23,7 +23,6 @@ def main():

width = 512
height = 512
guidance_scale = 7.5
number_of_images_to_generate = 1
number_of_inference_steps_per_image = 20

@@ -36,72 +35,41 @@ def main():
ov_cache_dir = "./cache"

#
# Step 1: Prepare each Text2Image subcomponent (scheduler, text encoder, unet, vae) separately.
# Step 1: Create the initial Text2ImagePipeline, given the model path
#
pipe = openvino_genai.Text2ImagePipeline(args.model_dir)

# Create the scheduler from the details listed in the json.
scheduler = openvino_genai.Scheduler.from_config(args.model_dir + "/scheduler/scheduler_config.json")

# Note that we can also create the scheduler by specifying specific type (for example EULER_DISCRETE), like this:
# scheduler = openvino_genai.Scheduler.from_config(args.model_dir + "/scheduler/scheduler_config.json",
# openvino_genai.Scheduler.Type.EULER_DISCRETE)
# This can be useful when a particular type of Scheduler is not yet supported natively by OpenVINO GenAI.
# (even though we are actively working to support most commonly used ones)

# Create unet object
unet = openvino_genai.UNet2DConditionModel(args.model_dir + "/unet")

# Set batch size based on classifier free guidance condition.
unet_batch_size = 2 if unet.do_classifier_free_guidance(guidance_scale) else 1

# Create the text encoder
text_encoder = openvino_genai.CLIPTextModel(args.model_dir + "/text_encoder")

# In case of NPU, we need to reshape the model to have static shapes
if args.text_encoder_device == "NPU":
text_encoder.reshape(unet_batch_size)

# Compile text encoder for the specified device
text_encoder.compile(args.text_encoder_device, CACHE_DIR=ov_cache_dir)

# In case of NPU, we need to reshape the unet model to have static shapes
if args.unet_device == "NPU":
# The max_position_embeddings config from text encoder will be used as a parameter to unet reshape.
max_position_embeddings = text_encoder.get_config().max_position_embeddings

unet.reshape(unet_batch_size, height, width, max_position_embeddings)

# Compile unet for specified device
unet.compile(args.unet_device, CACHE_DIR=ov_cache_dir)

# Create the decoder
vae = openvino_genai.AutoencoderKL(args.model_dir + "/vae_decoder")

# In case of NPU, we need to reshape the vae model to have static shapes
if args.vae_decoder_device == "NPU":
vae.reshape(1, height, width)

# Compile vae decoder for the specified device
vae.compile(args.vae_decoder_device, CACHE_DIR=ov_cache_dir)
#
# Step 2: Reshape the pipeline given the number of images, width, height, and guidance scale.
#
pipe.reshape(1, width, height, pipe.get_generation_config().guidance_scale)

#
# Step 2: Create a Text2ImagePipeline from the individual subcomponents
# Step 3: Compile the pipeline with the specified devices and properties (such as the cache directory)
#
properties = {"CACHE_DIR": ov_cache_dir}

# Note that if there are device-specific properties that are needed, they can
# be added using a "DEVICE_PROPERTIES" entry, like this:
#properties = {
# "DEVICE_PROPERTIES":
# {
# "CPU": {"CACHE_DIR": "cpu_cache"},
# "GPU": {"CACHE_DIR": "gpu_cache"},
# "NPU": {"CACHE_DIR": "npu_cache"}
# }
#}

pipe = openvino_genai.Text2ImagePipeline.stable_diffusion(scheduler, text_encoder, unet, vae)
pipe.compile(args.text_encoder_device, args.unet_device, args.vae_decoder_device, config=properties)

#
# Step 3: Use the Text2ImagePipeline to generate 'number_of_images_to_generate' images.
# Step 4: Use the Text2ImagePipeline to generate 'number_of_images_to_generate' images.
#

for imagei in range(0, number_of_images_to_generate):
image_tensor = pipe.generate(
args.prompt,
width=width,
height=height,
guidance_scale=guidance_scale,
num_inference_steps=number_of_inference_steps_per_image,
num_images_per_prompt=1
)

image = Image.fromarray(image_tensor.data[0])
@@ -192,13 +192,36 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline {
*/
void compile(const std::string& device, const ov::AnyMap& properties = {});

/**
* Compiles the image generation pipeline with the given devices for text encoding, denoising, and VAE decoding.
* @param text_encode_device A device to compile the text encoder(s) with
* @param denoise_device A device to compile the denoiser (e.g. UNet, SD3 Transformer, etc.) with
* @param vae_decode_device A device to compile the VAE decoder(s) with
* @param properties A map of properties which affect models compilation
* @note If the pipeline was compiled before, an exception is thrown.
*/
void compile(const std::string& text_encode_device,
const std::string& denoise_device,
const std::string& vae_decode_device,
const ov::AnyMap& properties = {});

template <typename... Properties>
ov::util::EnableIfAllStringAny<void, Properties...> compile(
const std::string& device,
Properties&&... properties) {
return compile(device, ov::AnyMap{std::forward<Properties>(properties)...});
}

template <typename... Properties>
ov::util::EnableIfAllStringAny<void, Properties...> compile(const std::string& text_encode_device,
const std::string& denoise_device,
const std::string& vae_decode_device,
Properties&&... properties) {
return compile(text_encode_device,
               denoise_device,
               vae_decode_device,
               ov::AnyMap{std::forward<Properties>(properties)...});
}

/**
* Generates image(s) based on prompt and other image generation parameters
* @param positive_prompt Prompt to generate image(s) from
10 changes: 9 additions & 1 deletion src/cpp/src/image_generation/diffusion_pipeline.hpp
@@ -100,7 +100,15 @@ class DiffusionPipeline {

virtual void reshape(const int num_images_per_prompt, const int height, const int width, const float guidance_scale) = 0;

virtual void compile(const std::string& device, const ov::AnyMap& properties) = 0;
virtual void compile(const std::string& device, const ov::AnyMap& properties) {
    compile(device, device, device, properties);
}

virtual void compile(const std::string& text_encode_device,
const std::string& denoise_device,
const std::string& vae_decode_device,
const ov::AnyMap& properties) = 0;

virtual std::tuple<ov::Tensor, ov::Tensor, ov::Tensor, ov::Tensor> prepare_latents(ov::Tensor initial_image, const ImageGenerationConfig& generation_config) = 0;

7 changes: 7 additions & 0 deletions src/cpp/src/image_generation/flux_pipeline.hpp
@@ -256,6 +256,13 @@ class FluxPipeline : public DiffusionPipeline {
m_transformer->compile(device, *updated_properties);
}

void compile(const std::string& text_encode_device,
const std::string& denoise_device,
const std::string& vae_decode_device,
const ov::AnyMap& properties) override {
OPENVINO_THROW("Heterogeneous compile is not supported for FluxPipeline yet.");
}

void compute_hidden_states(const std::string& positive_prompt, const ImageGenerationConfig& generation_config) override {
// encode_prompt
std::string prompt_2_str = generation_config.prompt_2 != std::nullopt ? *generation_config.prompt_2 : positive_prompt;
7 changes: 7 additions & 0 deletions src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp
@@ -255,6 +255,13 @@ class StableDiffusion3Pipeline : public DiffusionPipeline {
m_vae->compile(device, properties);
}

void compile(const std::string& text_encode_device,
const std::string& denoise_device,
const std::string& vae_decode_device,
const ov::AnyMap& properties) override {
OPENVINO_THROW("Heterogeneous compile is not supported for StableDiffusion3Pipeline yet.");
}

void compute_hidden_states(const std::string& positive_prompt, const ImageGenerationConfig& generation_config) override {
const auto& transformer_config = m_transformer->get_config();
const size_t batch_size_multiplier = do_classifier_free_guidance(generation_config.guidance_scale) ? 2 : 1; // Transformer accepts 2x batch in case of CFG
11 changes: 7 additions & 4 deletions src/cpp/src/image_generation/stable_diffusion_pipeline.hpp
@@ -151,13 +151,16 @@ class StableDiffusionPipeline : public DiffusionPipeline {
m_vae->reshape(num_images_per_prompt, height, width);
}

void compile(const std::string& device, const ov::AnyMap& properties) override {
void compile(const std::string& text_encode_device,
const std::string& denoise_device,
const std::string& vae_decode_device,
const ov::AnyMap& properties) override {
update_adapters_from_properties(properties, m_generation_config.adapters);
auto updated_properties = update_adapters_in_properties(properties, &DiffusionPipeline::derived_adapters);

m_clip_text_encoder->compile(device, *updated_properties);
m_unet->compile(device, *updated_properties);
m_vae->compile(device, *updated_properties);
m_clip_text_encoder->compile(text_encode_device, *updated_properties);
m_unet->compile(denoise_device, *updated_properties);
m_vae->compile(vae_decode_device, *updated_properties);
}

void compute_hidden_states(const std::string& positive_prompt, const ImageGenerationConfig& generation_config) override {