Handle latencies of more than two frames (#9)

deltacasttv · Dec 22, 2023 · 2d4ae23 · 2d4ae23
1 parent 6b9e6db
commit 2d4ae23
Show file tree

Hide file tree

Showing 8 changed files with 51 additions and 12 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,9 @@
+# Unreleased
+
+## Added
+
+- `--maximum-latency` parameter to set the maximum desired input-to-output latency, in frames (default is 2)
+
 # 1.0.1
 
 ## Fixed

diff --git a/README.md b/README.md
@@ -72,6 +72,7 @@ Activating the rendering of the live content on the screen and the handling of t
 The application is designed to be easily customizable in terms of processing and memory allocation of the buffers.
 
 `processing.cpp` contains code for overlay and non-overlay processing which can be modified to implement any kind of processing.
+Take extra care that the processing time stays below the time between two frames; otherwise, the application will not be able to keep up with the incoming frames and will constantly drop content.
 
 `allocation.cpp` contains code for buffer allocation which can be modified to implement any kind of buffer allocation, be it on the GPU or the host memory.
 

diff --git a/src/main.cpp b/src/main.cpp
@@ -73,13 +73,15 @@ int main(int argc, char** argv)
     int tx_stream_id = 0;
     bool overlay_enabled = false;
     bool renderer_enabled = false;
+    shared_resources.maximum_latency = 2;
 
     CLI::App app{"Generates some content from input and sends it to output"};
     app.add_option("-d,--device", device_id, "ID of the device to use");
     app.add_option("-i,--input", rx_stream_id, "ID of the input connector to use");
     app.add_option("-o,--output", tx_stream_id, "ID of the output connector to use");
     app.add_flag("--overlay,!--no-overlay", overlay_enabled, "Activates overlay on the output stream");
     app.add_flag("--renderer,!--no-renderer", renderer_enabled, "Activates rendering of the live input stream");
+    app.add_option("-l,--maximum-latency", shared_resources.maximum_latency, "Maximum desired latency in frames between input and output");
     CLI11_PARSE(app, argc, argv);
 
     signal(SIGINT, on_close);

diff --git a/src/rx_stream.cpp b/src/rx_stream.cpp
@@ -56,7 +56,7 @@ bool Deltacast::RxStream::configure(SignalInformation signal_info, bool /*overla
         || !(api_success = VHD_SetStreamProperty(*handle(), VHD_SDI_SP_VIDEO_STANDARD, signal_info.video_standard))
         || !(api_success = VHD_SetStreamProperty(*handle(), VHD_SDI_SP_INTERFACE, signal_info.interface))
         || !(api_success = VHD_SetStreamProperty(*handle(), VHD_CORE_SP_BUFFER_PACKING, VHD_BUFPACK_VIDEO_RGB_24))
-        || !(api_success = VHD_SetStreamProperty(*handle(), VHD_CORE_SP_BUFFERQUEUE_DEPTH, 2)))
+        || !(api_success = VHD_SetStreamProperty(*handle(), VHD_CORE_SP_BUFFERQUEUE_DEPTH, _buffer_queue_depth)))
     {
         std::cout << "ERROR for " << _name << ": Cannot configure stream (" << api_success << ")" << std::endl;
         return false;
@@ -82,15 +82,15 @@ bool Deltacast::RxStream::loop_iteration(SharedResources& shared_resources)
     do
     {
         push_slot(slot_handle);
-        VHD_GetStreamProperty(*handle(), VHD_CORE_SP_BUFFERQUEUE_FILLING, &filling);
 
         auto [ _handle, api_success ] = pop_slot();
         if (!api_success && api_success.error_code() == VHDERR_TIMEOUT)
             return true;
         else if (!api_success)
             return false;
-
         slot_handle = _handle;
+
+        VHD_GetStreamProperty(*handle(), VHD_CORE_SP_BUFFERQUEUE_FILLING, &filling);
     } while (filling > 0);
 
     // Wraps the slot handle so that it is pushed back to the queue when it goes out of scope

diff --git a/src/shared_resources.hpp b/src/shared_resources.hpp
@@ -55,6 +55,8 @@ namespace Deltacast
 
         SignalInformation signal_info;
 
+        unsigned int maximum_latency;
+
         void reset();
     };
 }
diff --git a/src/stream.hpp b/src/stream.hpp
@@ -46,7 +46,7 @@ namespace Deltacast
         int _channel_index;
         std::string _name;
 
-        static const uint32_t _buffer_queue_depth = 2;
+        static const uint32_t _buffer_queue_depth = 16;
         static const uint32_t _slot_timeout_ms = 100;
 
         using ApiPopSlot = std::function<ULONG(HANDLE stream_handle, HANDLE* slot_handle, ULONG timeout_ms)>;

diff --git a/src/tx_stream.cpp b/src/tx_stream.cpp
@@ -57,7 +57,7 @@ bool Deltacast::TxStream::configure(SignalInformation signal_info, bool overlay_
         || !(api_success = VHD_SetStreamProperty(*handle(), VHD_SDI_SP_INTERFACE, signal_info.interface))
         || !(api_success = VHD_SetStreamProperty(*handle(), VHD_SDI_SP_TX_GENLOCK, TRUE))
         || !(api_success = VHD_SetStreamProperty(*handle(), VHD_CORE_SP_BUFFER_PACKING, (overlay_enabled ? VHD_BUFPACK_VIDEO_RGBA_32 : VHD_BUFPACK_VIDEO_RGB_24)))
-        || !(api_success = VHD_SetStreamProperty(*handle(), VHD_CORE_SP_BUFFERQUEUE_DEPTH, 2))
+        || !(api_success = VHD_SetStreamProperty(*handle(), VHD_CORE_SP_BUFFERQUEUE_DEPTH, _buffer_queue_depth))
         || !(api_success = VHD_SetStreamProperty(*handle(), VHD_CORE_SP_BUFFERQUEUE_PRELOAD, 0)))
     {
         std::cout << "ERROR for " << _name << ": Cannot configure stream (" << api_success << ")" << std::endl;
@@ -89,17 +89,21 @@ bool Deltacast::TxStream::loop_iteration(SharedResources& shared_resources)
     if (_should_stop)
         return false;
 
-    ULONG on_board_filling = 0;
-    if (!(api_success = VHD_GetStreamProperty(*handle(), VHD_CORE_SP_ONBOARDBUFFER_FILLING, &on_board_filling)))
+    ULONG on_board_filling = 0, buffer_queue_filling = 0;
+    if (!(api_success = VHD_GetStreamProperty(*handle(), VHD_CORE_SP_ONBOARDBUFFER_FILLING, &on_board_filling))
+        || !(api_success = VHD_GetStreamProperty(*handle(), VHD_CORE_SP_BUFFERQUEUE_FILLING, &buffer_queue_filling)))
     {
         std::cout << "ERROR for " << _name << ": Cannot get stream property (" << api_success << ")" << std::endl;
         return false;
     }
 
-    for (auto i = 1; i < on_board_filling; ++i)
+    if (buffer_queue_filling > (shared_resources.maximum_latency - 2))
     {
-        shared_resources.synchronization.notify_processing_finished();
-        while (!_should_stop && !shared_resources.synchronization.wait_until_ready_to_process()) {}
+        for (auto i = 0; i < buffer_queue_filling - (shared_resources.maximum_latency - 2); ++i)
+        {
+            shared_resources.synchronization.notify_processing_finished();
+            while (!_should_stop && !shared_resources.synchronization.wait_until_ready_to_process()) {}
+        }
     }
 
     UBYTE* buffer = nullptr;

diff --git a/technical_details.md b/technical_details.md
@@ -44,7 +44,7 @@ The application performs the following actions:
 ## TX
 
 - `Preload`: 0
-- `Buffer queue depth`: 2
+- `Buffer queue depth`: 16
 - RGBA 8b `buffer packing` if overlay, RGB 8b if not
 - `Genlocked`
 
@@ -75,6 +75,8 @@ That way, we can guarantee that the buffer that will be communicated to the TX t
 
 ## TX
 
+When the device is able to achieve the minimal latency of 2 frames, we can guarantee the following.
+
 Due to the nature of the communication between the RX and TX thread, the TX buffer queue will never be filled with more than one buffer at a time.
 Indeed, since the RX thread is the one that drives the TX thread, we can never end up in a situation where the RX thread gives buffers to the TX thread faster than the rate at which they are consumed.
 
@@ -83,4 +85,26 @@ When the TX thread awakens due to some buffer being ready to be processed, it ch
 If it is greater than 1 (0 is impossible since the device is always processing a buffer), then the buffer needs to be skipped and we need to wait for the next frame, so that the device can consume the on-board buffer and go back to a queue of 1.
 Waiting for that next video frame is achieved thanks to the cadencing of the RX thread.
 The TX thread simply notifies that the processing has finished (although it actually wants to skip it) and then immediately waits again for a new buffer.
-This has the effect that the RX thread will go through one full cycle again, thus waiting for the next frame and achieving the desired cadencing.
+This has the effect that the RX thread will go through one full cycle again, thus waiting for the next frame and achieving the desired cadencing.
+
+However, when the achievable latency is greater than 2, the device will not be able to consume the on-board buffers fast enough and the buffer queue will fill up.
+It will reach a stable state where the size of the buffer queue plus the two on-board buffers is equal to the latency.
+In that case, we still need a way to ensure that the latency stays at this minimal achievable value instead of growing further.
+This is achieved by the `--maximum-latency` parameter.
+When the latency is greater than 2, the `--maximum-latency` parameter is used to determine the number of buffers that need to be skipped; the skipping itself is performed in the same way as in the case where the latency is 2.
+
+We recommend keeping the `--maximum-latency` parameter at 2 and only fine-tuning it if the device is not capable of achieving the desired latency.
+Another way to determine the proper value for this parameter is to know in advance the exact processing time, the time to transfer the data from the device to the host memory and the time to transfer the data from the host memory to the device.
+The latency that can be achieved is of the form `1 + ((processing_time + transfer_time_host_to_device + transfer_time_device_to_host) / period_of_the_signal)`, rounded up.
+
+For instance,
+```
+processing_time = 9 ms
+transfer_time_host_to_device = 12 ms
+transfer_time_device_to_host = 14 ms
+period_of_the_signal = 16.67 ms
+```
+leads to a latency of 4 frames, since `1 + ((9 + 12 + 14) / 16.67) = 3.1`, which rounded up gives `4`.
+
+Due to the nature of the synchronisation between the RX and TX threads, a latency of more than 4 frames is impossible to achieve.
+We would need to parallelize the processing of the buffers which is something currently not supported.