Add a flag to separate trimming the initial frames, from inserting co…

…dec delay. - Add `samples_to_trim_at_start_includes_codec_delay` as an analog of `samples_to_trim_at_end_includes_padding`. - When `true` we get the old behavior. In this mode the user effectively specifies the cumulative `num_samples_to_trim_at_start` from the initial Audio Frame OBUs. - The old mode conflates the "codec delay" and trimming the input audio into one field. - When `false` the codec delay is automatically added; the user only specifies the number of samples to trim from the input audio. - Update templates to use the new flag. - Naturally most end-users will want to preserve their input audio. - The new templates always preserve the user input audio, even when the user changes the codec config, or the codec libraries are updated and result in a different required codec delay. PiperOrigin-RevId: 686143666
AOMediaCodec · Oct 16, 2024 · f4e6706 · f4e6706
1 parent 42e395e
commit f4e6706
Show file tree

Hide file tree

Showing 19 changed files with 153 additions and 23 deletions.
diff --git a/iamf/cli/proto/audio_frame.proto b/iamf/cli/proto/audio_frame.proto
@@ -110,10 +110,13 @@ message AudioFrameObuMetadata {
   //
   // When `true` `samples_to_trim_at_end` matches `num_samples_to_trim_at_end`
   // in the OBU header. If the actual value is too small to account for padding,
-  // then encoding will fail.
+  // then encoding will fail. This is useful to closely represent the value in
+  // the OBU (rarely needed).
   //
   // When `false` `num_samples_to_trim_at_end` will be incremented by the
-  // amount of required padding and may not match the OBU header value.
+  // amount of required padding and may not match the OBU header value. This is
+  // useful to closely represent the trimming applied to the original audio
+  // content (typically recommended).
   //
   // Typically the recommended settings are:
   //   - samples_to_trim_at_end_includes_padding: false
@@ -123,6 +126,28 @@ message AudioFrameObuMetadata {
   optional bool samples_to_trim_at_end_includes_padding = 9 [default = true];
   optional uint32 samples_to_trim_at_end = 4;
 
+  // Controls whether the number of samples required by codec delay for the
+  // initial frames is automatically added to the
+  // `samples_to_trim_at_start` value.
+  //
+  // When `true` `samples_to_trim_at_start` matches the cumulative
+  // `num_samples_to_trim_at_start` in the initial OBU headers. If the actual
+  // value is too small to account for codec delay, then encoding will fail.
+  // This is useful to closely represent the values in the initial OBUs (rarely
+  // needed).
+  //
+  // When `false` `samples_to_trim_at_start` will be incremented by the
+  // amount required for the codec delay and may not match the OBU header
+  // value. This is useful to closely represent the trimming applied to the
+  // original audio content (typically recommended).
+  //
+  // Typically the recommended settings are:
+  //   - samples_to_trim_at_start_includes_codec_delay: false
+  //   - samples_to_trim_at_start: 0
+  // These settings will automatically insert the correct codec delay to create
+  // valid initial frames and avoid trimming the original audio content.
+  optional bool samples_to_trim_at_start_includes_codec_delay = 10
+      [default = true];
   optional uint32 samples_to_trim_at_start = 5;
 
   // ID of the audio element whose substreams will be provided/mixed
@@ -139,5 +164,5 @@ message AudioFrameObuMetadata {
   // e.g. "A0", "A13". Must be of the same length as `channel_ids`.
   repeated string channel_labels = 8;
 
-  // Next ID: 10
+  // Next ID: 11
 }
diff --git a/iamf/cli/proto_to_obu/audio_frame_generator.cc b/iamf/cli/proto_to_obu/audio_frame_generator.cc
@@ -125,7 +125,7 @@ absl::Status GetEncodingDataAndInitializeEncoders(
 
 // Validates that the user requested number of samples to trim at start is
 // enough to cover the delay that the encoder needs.
-absl::Status ValidateUserStartTrim(
+absl::Status ValidateUserStartTrimIncludesCodecDelay(
     uint32_t user_samples_to_trim_at_start,
     uint32_t& encoder_required_samples_to_delay) {
   // Return an error. But obey the user when `-DNO_CHECK_ERROR` is set.
@@ -189,6 +189,7 @@ absl::Status InitializeSubstreamData(
     const SubstreamIdLabelsMap& substream_id_to_labels,
     const absl::flat_hash_map<uint32_t, std::unique_ptr<EncoderBase>>&
         substream_id_to_encoder,
+    bool user_samples_to_trim_at_start_includes_codec_delay,
     const uint32_t user_samples_to_trim_at_start,
     absl::flat_hash_map<uint32_t, SubstreamData>&
         substream_id_to_substream_data) {
@@ -206,8 +207,10 @@ absl::Status InitializeSubstreamData(
 
     uint32_t encoder_required_samples_to_delay =
         encoder_iter->second->GetNumberOfSamplesToDelayAtStart();
-    RETURN_IF_NOT_OK(ValidateUserStartTrim(user_samples_to_trim_at_start,
-                                           encoder_required_samples_to_delay));
+    if (user_samples_to_trim_at_start_includes_codec_delay) {
+      RETURN_IF_NOT_OK(ValidateUserStartTrimIncludesCodecDelay(
+          user_samples_to_trim_at_start, encoder_required_samples_to_delay));
+    }
 
     // Initialize a `SubstreamData` with virtual samples for any delay
     // introduced by the encoder.
@@ -529,14 +532,17 @@ absl::Status EncodeFramesForAudioElement(
 absl::Status ValidateSubstreamsShareTrimming(
     const iamf_tools_cli_proto::AudioFrameObuMetadata& audio_frame_metadata,
     bool common_samples_to_trim_at_end_includes_padding,
+    bool common_samples_to_trim_at_start_includes_codec_delay,
     int64_t common_samples_to_trim_at_start,
     int64_t common_samples_to_trim_at_end) {
   if (audio_frame_metadata.samples_to_trim_at_end() !=
           common_samples_to_trim_at_end ||
       audio_frame_metadata.samples_to_trim_at_start() !=
           common_samples_to_trim_at_start ||
       audio_frame_metadata.samples_to_trim_at_end_includes_padding() !=
-          common_samples_to_trim_at_end_includes_padding) {
+          common_samples_to_trim_at_end_includes_padding ||
+      audio_frame_metadata.samples_to_trim_at_start_includes_codec_delay() !=
+          common_samples_to_trim_at_start_includes_codec_delay) {
     return absl::InvalidArgumentError(
         "Expected all substreams to have the same trimming information");
   }
@@ -644,6 +650,9 @@ absl::Status AudioFrameGenerator::Initialize() {
       static_cast<int64_t>(first_audio_frame_metadata.samples_to_trim_at_end());
   const bool common_samples_to_trim_at_end_includes_padding =
       first_audio_frame_metadata.samples_to_trim_at_end_includes_padding();
+  const bool common_samples_to_trim_at_start_includes_codec_delay =
+      first_audio_frame_metadata
+          .samples_to_trim_at_start_includes_codec_delay();
 
   for (const auto& [audio_element_id, audio_frame_metadata] :
        audio_frame_metadata_) {
@@ -669,11 +678,12 @@ absl::Status AudioFrameGenerator::Initialize() {
         substream_id_to_encoder_));
 
     // Intermediate data for all substreams belonging to an Audio Element.
-    RETURN_IF_NOT_OK(
-        InitializeSubstreamData(audio_element_with_data.substream_id_to_labels,
-                                substream_id_to_encoder_,
-                                audio_frame_metadata.samples_to_trim_at_start(),
-                                substream_id_to_substream_data_));
+    RETURN_IF_NOT_OK(InitializeSubstreamData(
+        audio_element_with_data.substream_id_to_labels,
+        substream_id_to_encoder_,
+        audio_frame_metadata.samples_to_trim_at_start_includes_codec_delay(),
+        audio_frame_metadata.samples_to_trim_at_start(),
+        substream_id_to_substream_data_));
 
     // Validate that a `DemixingParamDefinition` is available if down-mixing
     // is needed.
@@ -691,16 +701,25 @@ absl::Status AudioFrameGenerator::Initialize() {
     // Validate the assumption that trimming is the same for all substreams.
     RETURN_IF_NOT_OK(ValidateSubstreamsShareTrimming(
         audio_frame_metadata, common_samples_to_trim_at_end_includes_padding,
+        common_samples_to_trim_at_start_includes_codec_delay,
         common_samples_to_trim_at_start, common_samples_to_trim_at_end));
 
     // Populate the map of trimming states with all substream ID.
     for (const auto& [substream_id, labels] :
          audio_element_with_data.substream_id_to_labels) {
+      // Add in the codec delay when it was not included in the user input.
+      const int64_t additional_samples_to_trim_at_start =
+          common_samples_to_trim_at_start_includes_codec_delay
+              ? 0
+              : substream_id_to_encoder_[substream_id]
+                    ->GetNumberOfSamplesToDelayAtStart();
       substream_id_to_trimming_state_[substream_id] = {
           .increment_samples_to_trim_at_end_by_padding =
               !audio_frame_metadata.samples_to_trim_at_end_includes_padding(),
           .user_samples_left_to_trim_at_end = common_samples_to_trim_at_end,
-          .user_samples_left_to_trim_at_start = common_samples_to_trim_at_start,
+          .user_samples_left_to_trim_at_start =
+              common_samples_to_trim_at_start +
+              additional_samples_to_trim_at_start,
       };
     }
   }

diff --git a/iamf/cli/proto_to_obu/tests/audio_frame_generator_test.cc b/iamf/cli/proto_to_obu/tests/audio_frame_generator_test.cc
@@ -59,6 +59,9 @@ constexpr uint32_t kSampleRate = 48000;
 constexpr uint32_t kAacNumSamplesPerFrame = 1024;
 constexpr uint32_t kAacNumSamplesToTrimAtStart = 2048;
 
+constexpr bool kSamplesToTrimAtStartIncludesCodecDelay = true;
+constexpr bool kSamplesToTrimAtStartExcludesCodecDelay = false;
+
 constexpr auto kFrame0L2EightSamples = std::to_array<InternalSampleType>(
     {1 << 16, 2 << 16, 3 << 16, 4 << 16, 5 << 16, 6 << 16, 7 << 16, 8 << 16});
 constexpr auto kFrame0R2EightSamples = std::to_array<InternalSampleType>(
@@ -580,6 +583,7 @@ TEST(AudioFrameGenerator, AllAudioElementsHaveMatchingTrimmingInformation) {
   const uint32_t kCommonNumSamplesToTrimAtStart = 2;
   const uint32_t kCommonNumSamplesToTrimAtEnd = 1;
   const bool kCommonSamplesToTrimAtEndIncludesPadding = true;
+  const bool kCommonSamplesToTrimAtStartIncludesCodecDelay = true;
   user_metadata.mutable_audio_frame_metadata(0)->set_samples_to_trim_at_start(
       kCommonNumSamplesToTrimAtStart);
   user_metadata.mutable_audio_frame_metadata(1)->set_samples_to_trim_at_start(
@@ -594,6 +598,12 @@ TEST(AudioFrameGenerator, AllAudioElementsHaveMatchingTrimmingInformation) {
   user_metadata.mutable_audio_frame_metadata(1)
       ->set_samples_to_trim_at_end_includes_padding(
           kCommonSamplesToTrimAtEndIncludesPadding);
+  user_metadata.mutable_audio_frame_metadata(0)
+      ->set_samples_to_trim_at_start_includes_codec_delay(
+          kCommonSamplesToTrimAtStartIncludesCodecDelay);
+  user_metadata.mutable_audio_frame_metadata(1)
+      ->set_samples_to_trim_at_start_includes_codec_delay(
+          kCommonSamplesToTrimAtStartIncludesCodecDelay);
 
   std::list<AudioFrameWithData> audio_frames;
   GenerateAudioFrameWithEightSamplesExpectOk(user_metadata, audio_frames);
@@ -652,6 +662,22 @@ TEST(AudioFrameGenerator,
   ExpectAudioFrameGeneratorInitializeIsNotOk(user_metadata);
 }
 
+TEST(AudioFrameGenerator,
+     ErrorAudioElementsMustHaveSameSamplesToTrimAtStartIncludesCodecDelay) {
+  iamf_tools_cli_proto::UserMetadata user_metadata = {};
+  ConfigureOneStereoSubstreamLittleEndian(user_metadata);
+  AddStereoAudioElementAndAudioFrameMetadata(
+      user_metadata, kSecondAudioElementId, kSecondSubstreamId);
+  // All audio elements must agree on whether the start trim includes the
+  // codec delay; here the two audio elements deliberately disagree.
+  user_metadata.mutable_audio_frame_metadata(0)
+      ->set_samples_to_trim_at_start_includes_codec_delay(false);
+  user_metadata.mutable_audio_frame_metadata(1)
+      ->set_samples_to_trim_at_start_includes_codec_delay(true);
+
+  ExpectAudioFrameGeneratorInitializeIsNotOk(user_metadata);
+}
+
 TEST(AudioFrameGenerator, NumSamplesToTrimAtEndWithPaddedFrames) {
   iamf_tools_cli_proto::UserMetadata user_metadata = {};
   ConfigureOneStereoSubstreamLittleEndian(user_metadata);
@@ -842,6 +868,9 @@ TEST(AudioFrameGenerator, EncodingSucceedsWithFullFramesTrimmedAtStart) {
       *user_metadata.mutable_codec_config_metadata()->Add());
   AddStereoAudioElementAndAudioFrameMetadata(
       user_metadata, kFirstAudioElementId, kFirstSubstreamId);
+  user_metadata.mutable_audio_frame_metadata(0)
+      ->set_samples_to_trim_at_start_includes_codec_delay(
+          kSamplesToTrimAtStartIncludesCodecDelay);
   user_metadata.mutable_audio_frame_metadata(0)->set_samples_to_trim_at_start(
       kAacNumSamplesToTrimAtStart);
   user_metadata.mutable_audio_frame_metadata(0)
@@ -874,6 +903,9 @@ TEST(AudioFrameGenerator, TrimsAdditionalSamplesAtStart) {
       *user_metadata.mutable_codec_config_metadata()->Add());
   AddStereoAudioElementAndAudioFrameMetadata(
       user_metadata, kFirstAudioElementId, kFirstSubstreamId);
+  user_metadata.mutable_audio_frame_metadata(0)
+      ->set_samples_to_trim_at_start_includes_codec_delay(
+          kSamplesToTrimAtStartIncludesCodecDelay);
   user_metadata.mutable_audio_frame_metadata(0)->set_samples_to_trim_at_start(
       kNumSamplesToTrimAtStart);
   user_metadata.mutable_audio_frame_metadata(0)
@@ -895,6 +927,40 @@ TEST(AudioFrameGenerator, TrimsAdditionalSamplesAtStart) {
             kNumSamplesToTrimAtStart);
 }
 
+TEST(AudioFrameGenerator, AddsCodecDelayToSamplesToTrimAtStartWhenRequested) {
+  iamf_tools_cli_proto::UserMetadata user_metadata = {};
+  ConfigureAacCodecConfigMetadata(
+      *user_metadata.mutable_codec_config_metadata()->Add());
+  AddStereoAudioElementAndAudioFrameMetadata(
+      user_metadata, kFirstAudioElementId, kFirstSubstreamId);
+  // Request one sample to be trimmed, in addition to the codec delay.
+  constexpr uint32_t kNumSamplesToTrimAtStart = 1;
+  user_metadata.mutable_audio_frame_metadata(0)
+      ->set_samples_to_trim_at_start_includes_codec_delay(
+          kSamplesToTrimAtStartExcludesCodecDelay);
+  user_metadata.mutable_audio_frame_metadata(0)->set_samples_to_trim_at_start(
+      kNumSamplesToTrimAtStart);
+  user_metadata.mutable_audio_frame_metadata(0)
+      ->set_samples_to_trim_at_end_includes_padding(false);
+
+  std::list<AudioFrameWithData> audio_frames;
+  GenerateAudioFrameWithEightSamplesExpectOk(user_metadata, audio_frames);
+
+  uint32_t observed_cumulative_samples_to_trim_at_start = 0;
+  uint32_t unused_common_samples_to_trim_at_end = 0;
+  ASSERT_THAT(
+      ValidateAndGetCommonTrim(kAacNumSamplesPerFrame, audio_frames,
+                               unused_common_samples_to_trim_at_end,
+                               observed_cumulative_samples_to_trim_at_start),
+      IsOk());
+  // The cumulative trim observed across the generated frames is expected to
+  // be the AAC codec delay plus the user requested trim.
+  constexpr uint32_t kExpectedNumSamplesToTrimAtStart =
+      kAacNumSamplesToTrimAtStart + kNumSamplesToTrimAtStart;
+  EXPECT_EQ(observed_cumulative_samples_to_trim_at_start,
+            kExpectedNumSamplesToTrimAtStart);
+}
+
 TEST(AudioFrameGenerator, InitFailsWithTooFewSamplesToTrimAtStart) {
   const uint32_t kInvalidNumSamplesToTrimAtStart =
       kAacNumSamplesToTrimAtStart - 1;

diff --git a/iamf/cli/textproto_templates/1OA_and_stereo_opus.textproto b/iamf/cli/textproto_templates/1OA_and_stereo_opus.textproto
@@ -153,17 +153,19 @@ audio_frame_metadata: [
   {
     wav_filename: "TEMPLATE_FOA.wav"
     samples_to_trim_at_end_includes_padding: false
+    samples_to_trim_at_start_includes_codec_delay: false
     samples_to_trim_at_end: 0
-    samples_to_trim_at_start: 312
+    samples_to_trim_at_start: 0
     audio_element_id: 300
     channel_ids: [0, 1, 2, 3]
     channel_labels: ["A0", "A1", "A2", "A3"]
   },
   {
     wav_filename: "TEMPLATE_stereo.wav"
     samples_to_trim_at_end_includes_padding: false
+    samples_to_trim_at_start_includes_codec_delay: false
     samples_to_trim_at_end: 0
-    samples_to_trim_at_start: 312
+    samples_to_trim_at_start: 0
     audio_element_id: 301
     channel_ids: [0, 1]
     channel_labels: ["L2", "R2"]

diff --git a/iamf/cli/textproto_templates/1OA_and_stereo_pcm24bit.textproto b/iamf/cli/textproto_templates/1OA_and_stereo_pcm24bit.textproto
@@ -149,6 +149,7 @@ audio_frame_metadata: [
   {
     wav_filename: "TEMPLATE_FOA.wav"
     samples_to_trim_at_end_includes_padding: false
+    samples_to_trim_at_start_includes_codec_delay: false
     samples_to_trim_at_end: 0
     samples_to_trim_at_start: 0
     audio_element_id: 300
@@ -158,6 +159,7 @@ audio_frame_metadata: [
   {
     wav_filename: "TEMPLATE_stereo.wav"
     samples_to_trim_at_end_includes_padding: false
+    samples_to_trim_at_start_includes_codec_delay: false
     samples_to_trim_at_end: 0
     samples_to_trim_at_start: 0
     audio_element_id: 301

diff --git a/iamf/cli/textproto_templates/1OA_opus.textproto b/iamf/cli/textproto_templates/1OA_opus.textproto
@@ -109,8 +109,9 @@ mix_presentation_metadata {
 audio_frame_metadata {
   wav_filename: "TEMPLATE_FOA.wav"
   samples_to_trim_at_end_includes_padding: false
+  samples_to_trim_at_start_includes_codec_delay: false
   samples_to_trim_at_end: 0
-  samples_to_trim_at_start: 312
+  samples_to_trim_at_start: 0
   audio_element_id: 300
   channel_ids: [0, 1, 2, 3]
   channel_labels: ["A0", "A1", "A2", "A3"]

diff --git a/iamf/cli/textproto_templates/1OA_pcm24bit.textproto b/iamf/cli/textproto_templates/1OA_pcm24bit.textproto
@@ -105,6 +105,7 @@ mix_presentation_metadata {
 audio_frame_metadata {
   wav_filename: "TEMPLATE_FOA.wav"
   samples_to_trim_at_end_includes_padding: false
+  samples_to_trim_at_start_includes_codec_delay: false
   samples_to_trim_at_end: 0
   samples_to_trim_at_start: 0
   audio_element_id: 300

diff --git a/iamf/cli/textproto_templates/3OA_and_stereo_opus.textproto b/iamf/cli/textproto_templates/3OA_and_stereo_opus.textproto
@@ -177,17 +177,19 @@ audio_frame_metadata: [
   {
     wav_filename: "TEMPLATE_TOA.wav"
     samples_to_trim_at_end_includes_padding: false
+    samples_to_trim_at_start_includes_codec_delay: false
     samples_to_trim_at_end: 0
-    samples_to_trim_at_start: 312
+    samples_to_trim_at_start: 0
     audio_element_id: 300
     channel_ids: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
     channel_labels: ["A0", "A1", "A2", "A3", "A4", "A5", "A6", "A7", "A8", "A9", "A10", "A11", "A12", "A13", "A14", "A15"]
   },
   {
     wav_filename: "TEMPLATE_stereo.wav"
     samples_to_trim_at_end_includes_padding: false
+    samples_to_trim_at_start_includes_codec_delay: false
     samples_to_trim_at_end: 0
-    samples_to_trim_at_start: 312
+    samples_to_trim_at_start: 0
     audio_element_id: 301
     channel_ids: [0, 1]
     channel_labels: ["L2", "R2"]

diff --git a/iamf/cli/textproto_templates/3OA_and_stereo_pcm24bit.textproto b/iamf/cli/textproto_templates/3OA_and_stereo_pcm24bit.textproto
@@ -173,6 +173,7 @@ audio_frame_metadata: [
   {
     wav_filename: "TEMPLATE_TOA.wav"
     samples_to_trim_at_end_includes_padding: false
+    samples_to_trim_at_start_includes_codec_delay: false
     samples_to_trim_at_end: 0
     samples_to_trim_at_start: 0
     audio_element_id: 300
@@ -182,6 +183,7 @@ audio_frame_metadata: [
   {
     wav_filename: "TEMPLATE_stereo.wav"
     samples_to_trim_at_end_includes_padding: false
+    samples_to_trim_at_start_includes_codec_delay: false
     samples_to_trim_at_end: 0
     samples_to_trim_at_start: 0
     audio_element_id: 301

diff --git a/iamf/cli/textproto_templates/3OA_opus.textproto b/iamf/cli/textproto_templates/3OA_opus.textproto
@@ -133,8 +133,9 @@ mix_presentation_metadata {
 audio_frame_metadata {
   wav_filename: "TEMPLATE_TOA.wav"
   samples_to_trim_at_end_includes_padding: false
+  samples_to_trim_at_start_includes_codec_delay: false
   samples_to_trim_at_end: 0
-  samples_to_trim_at_start: 312
+  samples_to_trim_at_start: 0
   audio_element_id: 300
   channel_ids: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
   channel_labels: ["A0", "A1", "A2", "A3", "A4", "A5", "A6", "A7", "A8", "A9", "A10", "A11", "A12", "A13", "A14", "A15"]

diff --git a/iamf/cli/textproto_templates/3OA_pcm24bit.textproto b/iamf/cli/textproto_templates/3OA_pcm24bit.textproto
@@ -129,6 +129,7 @@ mix_presentation_metadata {
 audio_frame_metadata {
   wav_filename: "TEMPLATE_TOA.wav"
   samples_to_trim_at_end_includes_padding: false
+  samples_to_trim_at_start_includes_codec_delay: false
   samples_to_trim_at_end: 0
   samples_to_trim_at_start: 0
   audio_element_id: 300

diff --git a/iamf/cli/textproto_templates/5dot1_opus.textproto b/iamf/cli/textproto_templates/5dot1_opus.textproto
@@ -127,8 +127,9 @@ mix_presentation_metadata {
 audio_frame_metadata {
   wav_filename: "TEMPLATE_5dot1.wav"
   samples_to_trim_at_end_includes_padding: false
+  samples_to_trim_at_start_includes_codec_delay: false
   samples_to_trim_at_end: 0
-  samples_to_trim_at_start: 312
+  samples_to_trim_at_start: 0
   audio_element_id: 300
   channel_ids: [0, 1, 2, 3, 4, 5]
   channel_labels: ["L5", "R5", "C", "LFE", "Ls5", "Rs5"]