From 67c6cf2fc05435186a8c96601fae078b237655b4 Mon Sep 17 00:00:00 2001
From: yero
Date: Wed, 30 Oct 2024 02:11:39 -0400
Subject: [PATCH] Changes the use pattern of `AudioFrameGenerator`

- No need to call `AddSamples()` with empty frames to signal the end of
  substreams -- calling `Finalize()` does it.
- No need to call `Finalize()` multiple times to flush remaining frames in
  the encoders -- remaining frames are automatically handled by
  `OutputFrames()`.
- Calling `AddSamples()` after `Finalize()`, or calling `Finalize()` multiple
  times, is OK -- such calls simply become no-ops.

PiperOrigin-RevId: 691295143
---
 iamf/cli/iamf_encoder.cc                      |   8 +
 iamf/cli/iamf_encoder.h                       |   4 +
 .../cli/proto_to_obu/audio_frame_generator.cc | 144 +++++++++++++-----
 iamf/cli/proto_to_obu/audio_frame_generator.h |  51 ++++---
 .../tests/audio_frame_generator_test.cc       |  78 +++++++---
 5 files changed, 211 insertions(+), 74 deletions(-)

diff --git a/iamf/cli/iamf_encoder.cc b/iamf/cli/iamf_encoder.cc
index 114a6b9..bad6d2f 100644
--- a/iamf/cli/iamf_encoder.cc
+++ b/iamf/cli/iamf_encoder.cc
@@ -174,6 +174,13 @@ absl::Status IamfEncoder::GetInputTimestamp(int32_t& input_timestamp) {
 void IamfEncoder::AddSamples(const DecodedUleb128 audio_element_id,
                              ChannelLabel::Label label,
                              const std::vector& samples) {
+  if (add_samples_finalized_) {
+    LOG_FIRST_N(WARNING, 3)
+        << "Calling `AddSamples()` after `FinalizeAddSamples()` has no effect; "
+        << samples.size() << " input samples discarded.";
+    return;
+  }
+
   id_to_labeled_samples_[audio_element_id][label] = samples;
 }
 
@@ -213,6 +220,7 @@ absl::Status IamfEncoder::OutputTemporalUnit(
         audio_frame_generator_->AddSamples(audio_element_id, label, samples));
     }
   }
+
   if (add_samples_finalized_) {
     RETURN_IF_NOT_OK(audio_frame_generator_->Finalize());
   }
diff --git a/iamf/cli/iamf_encoder.h b/iamf/cli/iamf_encoder.h
index acc51ca..ff84127 100644
--- a/iamf/cli/iamf_encoder.h
+++ b/iamf/cli/iamf_encoder.h
@@ -120,6 +120,10 @@ class IamfEncoder {
   absl::Status GetInputTimestamp(int32_t& input_timestamp);
 
   /*!\brief Adds audio samples belonging to the same temporal unit.
+   *
+   * It is best not to call this function after `FinalizeAddSamples()`.
+   * Doing so is OK, but the added samples will be ignored and not
+   * encoded.
    *
    * \param audio_element_id ID of the audio element to add samples to.
    * \param label Channel label to add samples to.
diff --git a/iamf/cli/proto_to_obu/audio_frame_generator.cc b/iamf/cli/proto_to_obu/audio_frame_generator.cc
index 3c53aa4..bcf8690 100644
--- a/iamf/cli/proto_to_obu/audio_frame_generator.cc
+++ b/iamf/cli/proto_to_obu/audio_frame_generator.cc
@@ -53,6 +53,7 @@
 #include "iamf/obu/codec_config.h"
 #include "iamf/obu/demixing_info_parameter_data.h"
 #include "iamf/obu/types.h"
+#include "src/google/protobuf/repeated_ptr_field.h"
 
 namespace iamf_tools {
 
@@ -233,23 +234,22 @@ absl::Status InitializeSubstreamData(
 // An audio element may contain many channels, denoted by their labels;
 // this function returns whether all labels have their (same amount of)
 // samples ready.
-bool SamplesReadyForAudioElement(
-    const LabelSamplesMap& label_to_samples,
-    const absl::flat_hash_set<ChannelLabel::Label>& channel_labels) {
-  size_t common_num_samples = 0;
-  for (const auto& label : channel_labels) {
+bool SamplesReadyForAudioElement(const LabelSamplesMap& label_to_samples,
+                                 const absl::flat_hash_set<ChannelLabel::Label>&
+                                     channel_labels_for_audio_element) {
+  std::optional<size_t> common_num_samples;
+  for (const auto& label : channel_labels_for_audio_element) {
     const auto label_to_samples_iter = label_to_samples.find(label);
     if (label_to_samples_iter == label_to_samples.end()) {
       return false;
     }
 
     const auto num_samples = label_to_samples_iter->second.size();
-    if (common_num_samples == 0 && num_samples != 0) {
+    if (!common_num_samples.has_value()) {
       common_num_samples = num_samples;
-      continue;
     }
 
-    if (num_samples != common_num_samples) {
+    if (num_samples != *common_num_samples) {
       return false;
     }
   }
@@ -369,10 +369,14 @@ std::pair GetNumSamplesToTrimForFrame(
                           frame_samples_to_trim_at_end);
 }
 
-absl::Status EncodeFramesForAudioElement(
+// Encode frames for an audio element if samples are ready.
+absl::Status MaybeEncodeFramesForAudioElement(
     const DecodedUleb128 audio_element_id,
     const AudioElementWithData& audio_element_with_data,
-    const DemixingModule& demixing_module, LabelSamplesMap& label_to_samples,
+    const DemixingModule& demixing_module,
+    const absl::flat_hash_set<ChannelLabel::Label>&
+        channel_labels_for_audio_element,
+    LabelSamplesMap& label_to_samples,
     absl::flat_hash_map& substream_id_to_trimming_state,
     ParametersManager& parameters_manager,
@@ -381,6 +385,13 @@
     absl::flat_hash_map& substream_id_to_substream_data,
     GlobalTimingModule& global_timing_module) {
+  if (!SamplesReadyForAudioElement(label_to_samples,
+                                   channel_labels_for_audio_element)) {
+    // Waiting for more samples belonging to the same audio element; return
+    // for now.
+    return absl::OkStatus();
+  }
+
   const CodecConfigObu& codec_config = *audio_element_with_data.codec_config;
 
   // Get some common information about this stream.
@@ -626,6 +637,34 @@ absl::Status ValidateAndApplyUserTrimming(
 
 }  // namespace
 
+AudioFrameGenerator::AudioFrameGenerator(
+    const ::google::protobuf::RepeatedPtrField<
+        iamf_tools_cli_proto::AudioFrameObuMetadata>& audio_frame_metadata,
+    const ::google::protobuf::RepeatedPtrField<
+        iamf_tools_cli_proto::CodecConfigObuMetadata>& codec_config_metadata,
+    const absl::flat_hash_map<DecodedUleb128, AudioElementWithData>&
+        audio_elements,
+    const DemixingModule& demixing_module,
+    ParametersManager& parameters_manager,
+    GlobalTimingModule& global_timing_module)
+    : audio_elements_(audio_elements),
+      demixing_module_(demixing_module),
+      parameters_manager_(parameters_manager),
+      global_timing_module_(global_timing_module),
+      // Set to a state NOT taking samples at first; may be changed to
+      // `kTakingSamples` once `Initialize()` is called.
+      state_(kFlushingRemaining) {
+  for (const auto& audio_frame_obu_metadata : audio_frame_metadata) {
+    audio_frame_metadata_[audio_frame_obu_metadata.audio_element_id()] =
+        audio_frame_obu_metadata;
+  }
+
+  for (const auto& codec_config_obu_metadata : codec_config_metadata) {
+    codec_config_metadata_[codec_config_obu_metadata.codec_config_id()] =
+        codec_config_obu_metadata.codec_config();
+  }
+}
+
 absl::StatusOr AudioFrameGenerator::GetNumberOfSamplesToDelayAtStart(
     const iamf_tools_cli_proto::CodecConfig& codec_config_metadata,
     const CodecConfigObu& codec_config) {
@@ -728,19 +767,32 @@ absl::Status AudioFrameGenerator::Initialize() {
     }
   }
 
+  // If `substream_id_to_substream_data_` is not empty, this generator is
+  // expecting audio substreams and is ready to take audio samples.
+  if (!substream_id_to_substream_data_.empty()) {
+    state_ = kTakingSamples;
+  }
+
   return absl::OkStatus();
 }
 
 bool AudioFrameGenerator::TakingSamples() const {
-  return !substream_id_to_substream_data_.empty();
+  return (state_ == kTakingSamples);
 }
 
 absl::Status AudioFrameGenerator::AddSamples(
     const DecodedUleb128 audio_element_id, ChannelLabel::Label label,
     absl::Span samples) {
-  const auto& audio_element_labels =
+  absl::MutexLock lock(&mutex_);
+  if (state_ != kTakingSamples) {
+    LOG_FIRST_N(WARNING, 3)
+        << "Calling `AddSamples()` after `Finalize()` has no effect.";
+    return absl::OkStatus();
+  }
+
+  const auto& audio_element_labels_iter =
       audio_element_id_to_labels_.find(audio_element_id);
-  if (audio_element_labels == audio_element_id_to_labels_.end()) {
+  if (audio_element_labels_iter == audio_element_id_to_labels_.end()) {
     return absl::InvalidArgumentError(
         absl::StrCat("No audio frame metadata found for Audio Element ID= ",
                      audio_element_id));
@@ -757,36 +809,20 @@ absl::Status AudioFrameGenerator::AddSamples(
   }
   const auto& audio_element_with_data = audio_element_iter->second;
 
-  if (SamplesReadyForAudioElement(labeled_samples,
-                                  audio_element_labels->second)) {
-    absl::MutexLock lock(&mutex_);
-    RETURN_IF_NOT_OK(EncodeFramesForAudioElement(
-        audio_element_id, audio_element_with_data, demixing_module_,
-        labeled_samples, substream_id_to_trimming_state_, parameters_manager_,
-        substream_id_to_encoder_, substream_id_to_substream_data_,
-        global_timing_module_));
-
-    labeled_samples.clear();
-  }
+  RETURN_IF_NOT_OK(MaybeEncodeFramesForAudioElement(
+      audio_element_id, audio_element_with_data, demixing_module_,
+      audio_element_labels_iter->second, labeled_samples,
+      substream_id_to_trimming_state_, parameters_manager_,
+      substream_id_to_encoder_, substream_id_to_substream_data_,
+      global_timing_module_));
 
   return absl::OkStatus();
 }
 
 absl::Status AudioFrameGenerator::Finalize() {
   absl::MutexLock lock(&mutex_);
-  for (auto& [substream_id, encoder] : substream_id_to_encoder_) {
-    auto substream_data_iter =
-        substream_id_to_substream_data_.find(substream_id);
-    if (substream_data_iter == substream_id_to_substream_data_.end()) {
-      continue;
-    }
-
-    // Remove the substream data when there is no more sample to come, and the
-    // encoder can be finalized.
-    if (substream_data_iter->second.samples_obu.empty()) {
-      RETURN_IF_NOT_OK(encoder->Finalize());
-      substream_id_to_substream_data_.erase(substream_data_iter);
-    }
+  if (state_ == kTakingSamples) {
+    state_ = kFinalizedCalled;
   }
 
   return absl::OkStatus();
 }
 
@@ -801,9 +837,43 @@ absl::Status AudioFrameGenerator::OutputFrames(
     std::list& audio_frames) {
   absl::MutexLock lock(&mutex_);
 
+  if (state_ == kFlushingRemaining) {
+    // In this state, there might be some remaining samples queued in the
+    // encoders waiting to be encoded; continue to encode them one frame at a
+    // time.
+    for (const auto& [audio_element_id, audio_element_with_data] :
+         audio_elements_) {
+      RETURN_IF_NOT_OK(MaybeEncodeFramesForAudioElement(
+          audio_element_id, audio_element_with_data, demixing_module_,
+          audio_element_id_to_labels_.at(audio_element_id),
+          id_to_labeled_samples_[audio_element_id],
+          substream_id_to_trimming_state_, parameters_manager_,
+          substream_id_to_encoder_, substream_id_to_substream_data_,
+          global_timing_module_));
+    }
+  } else if (state_ == kFinalizedCalled) {
+    // `Finalize()` has just been called; advance the state so that the
+    // remaining samples will be encoded in the next iteration.
+    state_ = kFlushingRemaining;
+  }
+
+  // Pop encoded audio frames from encoders.
   for (auto substream_id_to_encoder_iter = substream_id_to_encoder_.begin();
        substream_id_to_encoder_iter != substream_id_to_encoder_.end();) {
     auto& [substream_id, encoder] = *substream_id_to_encoder_iter;
+
+    // Remove the substream data when the generator is in the
+    // `kFlushingRemaining` state and the encoder can be finalized.
+    if (state_ == kFlushingRemaining) {
+      auto substream_data_iter =
+          substream_id_to_substream_data_.find(substream_id);
+      if (substream_data_iter != substream_id_to_substream_data_.end() &&
+          substream_data_iter->second.samples_obu.empty()) {
+        RETURN_IF_NOT_OK(encoder->Finalize());
+        substream_id_to_substream_data_.erase(substream_data_iter);
+      }
+    }
+
     if (encoder->FramesAvailable()) {
       RETURN_IF_NOT_OK(encoder->Pop(audio_frames));
       RETURN_IF_NOT_OK(ValidateAndApplyUserTrimming(
diff --git a/iamf/cli/proto_to_obu/audio_frame_generator.h b/iamf/cli/proto_to_obu/audio_frame_generator.h
index fe201fa..2c9ffe0 100644
--- a/iamf/cli/proto_to_obu/audio_frame_generator.h
+++ b/iamf/cli/proto_to_obu/audio_frame_generator.h
@@ -44,20 +44,34 @@ namespace iamf_tools {
  * The generation of audio frames can be done asynchronously, where
  * samples are added on one thread and completed frames are consumed on another.
  *
+ * Under the hood, the generator can be in three states:
+ *   1. `kTakingSamples`: The generator is expecting audio substreams and
+ *      taking samples.
+ *   2. `kFinalizedCalled`: `Finalize()` has been called; no more "real
+ *      samples" are coming, and the generator will soon (starting in the
+ *      next iteration) be flushing the remaining samples.
+ *   3. `kFlushingRemaining`: The generator is flushing the remaining samples
+ *      that are still in the underlying encoders.
+ *
  * The use pattern of this class is:
 *
 * - Initialize (`Initialize()`).
+ *   - (This puts the generator in the `kTakingSamples` state.)
 *
 * Thread 1:
 * - Repeat until no new sample to add (by checking `TakingSamples()`):
 *   - Add samples for each audio element (`AddSamples()`).
 * - Finalize the sample-adding process (`Finalize()`).
+ *   - (This puts the generator in the `kFinalizedCalled` state.)
 *
 * Thread 2:
 * - Repeat until no frame to generate (by checking `GeneratingFrames()`):
 *   - Output generated frames (`OutputFrames()`).
+ *     - If the generator is in the `kFlushingRemaining` state, the frames
+ *       might come from remaining samples in the underlying encoders.
 *     - If the output is empty, wait.
 *     - Otherwise, add the output of this round to the final result.
+ *
 */
 class AudioFrameGenerator {
  public:
@@ -88,21 +102,7 @@ class AudioFrameGenerator {
           audio_elements,
       const DemixingModule& demixing_module,
       ParametersManager& parameters_manager,
-      GlobalTimingModule& global_timing_module)
-      : audio_elements_(audio_elements),
-        demixing_module_(demixing_module),
-        parameters_manager_(parameters_manager),
-        global_timing_module_(global_timing_module) {
-    for (const auto& audio_frame_obu_metadata : audio_frame_metadata) {
-      audio_frame_metadata_[audio_frame_obu_metadata.audio_element_id()] =
-          audio_frame_obu_metadata;
-    }
-
-    for (const auto& codec_config_obu_metadata : codec_config_metadata) {
-      codec_config_metadata_[codec_config_obu_metadata.codec_config_id()] =
-          codec_config_obu_metadata.codec_config();
-    }
-  }
+      GlobalTimingModule& global_timing_module);
 
   /*!\brief Returns the number of samples to delay based on the codec config.
    *
@@ -129,9 +129,7 @@ class AudioFrameGenerator {
 
   /*!\brief Adds samples for an Audio Element and a channel label.
    *
-   * Calling this function with empty input `samples` will signal the
-   * underlying encoder that the a substream has ended. Eventually when all
-   * substreams are ended, `TakingSamples()` will return false.
+   * No effect if the generator is not in the `kTakingSamples` state.
    *
    * \param audio_element_id Audio Element ID that the added samples belong to.
    * \param label Channel label of the added samples.
@@ -144,8 +142,8 @@ class AudioFrameGenerator {
 
   /*!\brief Finalizes the sample-adding process.
    *
-   * This will signal all underlying encoders that there are no more samples
-   * to come.
+   * This puts the generator in the `kFinalizedCalled` state if it is in the
+   * `kTakingSamples` state. No effect if the generator is in other states.
    *
    * \return `absl::OkStatus()` on success. A specific status on failure.
    */
@@ -163,12 +161,24 @@ class AudioFrameGenerator {
    * The output frames all belong to the same temporal unit, sharing the same
    * start and end timestamps.
    *
+   * After `Finalize()` is called, all underlying encoders will be signalled
+   * to encode the remaining samples. Eventually when all substreams are
+   * ended, encoders will be deleted and `GeneratingFrames()` will return
+   * false.
+   *
   * \param audio_frames Output list of audio frames.
   * \return `absl::OkStatus()` on success. A specific status on failure.
   */
  absl::Status OutputFrames(std::list& audio_frames);
 
 private:
+  // State of an audio frame generator.
+  enum GeneratorState {
+    kTakingSamples,
+    kFinalizedCalled,
+    kFlushingRemaining,
+  };
+
  // Mapping from Audio Element ID to audio frame metadata.
  absl::flat_hash_map
      audio_frame_metadata_;
@@ -204,6 +214,7 @@ class AudioFrameGenerator {
  const DemixingModule& demixing_module_;
  ParametersManager& parameters_manager_;
  GlobalTimingModule& global_timing_module_;
+  GeneratorState state_;
 
  // Mutex to protect data accessed in different threads.
  mutable absl::Mutex mutex_;
diff --git a/iamf/cli/proto_to_obu/tests/audio_frame_generator_test.cc b/iamf/cli/proto_to_obu/tests/audio_frame_generator_test.cc
index 9f91b6b..f351464 100644
--- a/iamf/cli/proto_to_obu/tests/audio_frame_generator_test.cc
+++ b/iamf/cli/proto_to_obu/tests/audio_frame_generator_test.cc
@@ -337,26 +337,22 @@ void AddAllSamplesAndFinalizesExpectOk(
   }
 
   // Push in the user data.
-  for (int frame_count = 0; frame_count < common_num_frames; ++frame_count) {
-    EXPECT_TRUE((audio_frame_generator.TakingSamples()));
+  for (int i = 0; i < common_num_frames; ++i) {
+    // Before `Finalize()` is called, the generator is expected to be taking
+    // samples.
+    EXPECT_TRUE(audio_frame_generator.TakingSamples());
     for (const auto& [label, frames] : label_to_frames) {
-      EXPECT_THAT(audio_frame_generator.AddSamples(audio_element_id, label,
-                                                   frames[frame_count]),
-                  IsOk());
+      EXPECT_THAT(
+          audio_frame_generator.AddSamples(audio_element_id, label, frames[i]),
+          IsOk());
     }
   }
 
-  // Flush out the remaining frames. Several flushes could be required if the
-  // codec delay is longer than a frame duration.
-  while (audio_frame_generator.TakingSamples()) {
-    for (const auto& [label, frames] : label_to_frames) {
-      EXPECT_THAT(audio_frame_generator.AddSamples(audio_element_id, label,
-                                                   kEmptyFrame),
-                  IsOk());
-    }
+  // Call `Finalize()` after the last frame.
+  EXPECT_THAT(audio_frame_generator.Finalize(), IsOk());
 
-    EXPECT_THAT(audio_frame_generator.Finalize(), IsOk());
-  }
+  // Now expect the generator to NOT be taking samples.
+  EXPECT_FALSE(audio_frame_generator.TakingSamples());
 }
 
 // Safe to run simultaneously with `AddAllSamplesAndFinalizesExpectOk`.
@@ -389,8 +385,7 @@ void GenerateAudioFrameWithEightSamplesExpectOk(
       codec_config_obus, audio_elements, demixing_module, global_timing_module,
       parameters_manager, audio_frame_generator);
 
-  // Add only one "real" frame and an empty frame to signal the end of the
-  // stream.
+  // Add one "real" frame.
   const absl::flat_hash_map>>
       label_to_frames = {{ChannelLabel::kL2, {kFrame0L2EightSamples}},
@@ -517,6 +512,55 @@ TEST(AudioFrameGenerator, OneStereoSubstreamOneFrame) {
   ValidateAudioFrames(audio_frames, expected_audio_frames);
 }
 
+TEST(AudioFrameGenerator, AddSamplesAfterFinalizeHasNoEffect) {
+  iamf_tools_cli_proto::UserMetadata user_metadata = {};
+  ConfigureOneStereoSubstreamLittleEndian(user_metadata);
+
+  const absl::flat_hash_map
+      param_definitions = {};
+  absl::flat_hash_map codec_config_obus = {};
+  absl::flat_hash_map audio_elements = {};
+  DemixingModule demixing_module;
+  GlobalTimingModule global_timing_module;
+  // For delayed initialization.
+  std::optional parameters_manager;
+  std::optional audio_frame_generator;
+  InitializeAudioFrameGenerator(user_metadata, param_definitions,
+                                codec_config_obus, audio_elements,
+                                demixing_module, global_timing_module,
+                                parameters_manager, audio_frame_generator);
+
+  // First add one frame of samples and call `Finalize()`.
+  const absl::flat_hash_map>>
+      label_to_frames = {{ChannelLabel::kL2, {kFrame0L2EightSamples}},
+                         {ChannelLabel::kR2, {kFrame0R2EightSamples}}};
+  ASSERT_FALSE(audio_elements.empty());
+  const DecodedUleb128 audio_element_id = audio_elements.begin()->first;
+  AddAllSamplesAndFinalizesExpectOk(audio_element_id, label_to_frames,
+                                    *audio_frame_generator);
+
+  // Expect that one frame is generated.
+  std::list first_round_audio_frames;
+  FlushAudioFrameGeneratorExpectOk(*audio_frame_generator,
+                                   first_round_audio_frames);
+  EXPECT_EQ(first_round_audio_frames.size(), 1);
+
+  // Add another frame, but since `Finalize()` has been called in the last
+  // round, this has no effect. Therefore we expect that no frame is generated.
+  std::list second_round_audio_frames;
+  for (const auto& [label, frames] : label_to_frames) {
+    EXPECT_THAT(audio_frame_generator->AddSamples(audio_element_id, label,
+                                                  frames.front()),
+                IsOk());
+  }
+  FlushAudioFrameGeneratorExpectOk(*audio_frame_generator,
+                                   second_round_audio_frames);
+  EXPECT_TRUE(second_round_audio_frames.empty());
+}
+
 TEST(AudioFrameGenerator, AllowsOutputToHaveHigherBitDepthThanInput) {
   iamf_tools_cli_proto::UserMetadata user_metadata = {};
   ConfigureOneStereoSubstreamLittleEndian(user_metadata);
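For reference, the new call pattern described in the commit message looks roughly like the sketch below. This is illustrative only and not part of the patch: the two phases are shown sequentially rather than on separate threads, the helpers `MoreInputAvailable()`, `NextLabeledSamples()`, and `ConsumeFrames()` are placeholders for the caller's own input/output handling, and the output list element type (`AudioFrameWithData`) is assumed from the surrounding code.

// Sketch of the new `AudioFrameGenerator` use pattern. Placeholder helpers
// stand in for the caller's real input/output plumbing.
absl::Status EncodeAllFrames(AudioFrameGenerator& audio_frame_generator) {
  RETURN_IF_NOT_OK(audio_frame_generator.Initialize());

  // Sample-adding phase ("Thread 1" in the class comment): push samples for
  // each temporal unit, then call `Finalize()` exactly once. No empty frames
  // are needed to mark the end of the substreams.
  while (MoreInputAvailable()) {
    for (const auto& [audio_element_id, label, samples] : NextLabeledSamples()) {
      RETURN_IF_NOT_OK(
          audio_frame_generator.AddSamples(audio_element_id, label, samples));
    }
  }
  RETURN_IF_NOT_OK(audio_frame_generator.Finalize());  // Repeated calls are no-ops.

  // Frame-consuming phase ("Thread 2" in the class comment): `OutputFrames()`
  // keeps producing frames, including those flushed from the encoders after
  // `Finalize()`, until `GeneratingFrames()` reports that everything is done.
  while (audio_frame_generator.GeneratingFrames()) {
    std::list<AudioFrameWithData> audio_frames;
    RETURN_IF_NOT_OK(audio_frame_generator.OutputFrames(audio_frames));
    if (!audio_frames.empty()) {
      ConsumeFrames(audio_frames);
    }
    // A real consumer running on its own thread would wait here when the
    // output is empty instead of spinning.
  }
  return absl::OkStatus();
}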