Skip to content

Commit

Permalink
Add a flag to separate trimming the initial frames, from inserting co…
Browse files Browse the repository at this point in the history
…dec delay.

  - Add `samples_to_trim_at_start_includes_codec_delay` as an analog of `samples_to_trim_at_end_includes_padding`.
    - When `true`, we get the old behavior. In this mode, the user effectively specifies the cumulative `num_samples_to_trim_at_start` of the initial Audio Frame OBUs.
    - The old mode conflates the "codec delay" and trimming the input audio into one field.
    - When `false` the codec delay is automatically added; the user only specifies the number of samples to trim from the input audio.
  - Update templates to use the new flag.
    - Naturally most end-users will want to preserve their input audio.
    - The new templates always preserve the user's input audio, even when the user changes the codec config or the codec libraries are updated and require a different codec delay.

PiperOrigin-RevId: 686143666
  • Loading branch information
jwcullen authored and felicialim committed Oct 16, 2024
1 parent 42e395e commit f4e6706
Show file tree
Hide file tree
Showing 19 changed files with 153 additions and 23 deletions.
31 changes: 28 additions & 3 deletions iamf/cli/proto/audio_frame.proto
Original file line number Diff line number Diff line change
Expand Up @@ -110,10 +110,13 @@ message AudioFrameObuMetadata {
//
// When `true` `samples_to_trim_at_end` matches `num_samples_to_trim_at_end`
// in the OBU header. If the actual value is too small to account for padding,
// then encoding will fail.
// then encoding will fail. This is useful to closely represent the value in
// the OBU (rarely needed).
//
// When `false` `num_samples_to_trim_at_end` will be incremented by the
// amount of required padding and may not match the OBU header value.
// amount of required padding and may not match the OBU header value. This is
// useful to closely represent the trimming applied to the original audio
// content (typically recommended).
//
// Typically the recommended settings are:
// - samples_to_trim_at_end_includes_padding: false
Expand All @@ -123,6 +126,28 @@ message AudioFrameObuMetadata {
optional bool samples_to_trim_at_end_includes_padding = 9 [default = true];
optional uint32 samples_to_trim_at_end = 4;

// Controls whether the number of samples required by the codec delay in the
// initial frames is automatically added to the `samples_to_trim_at_start`
// value.
//
// When `true` `samples_to_trim_at_start` matches the cumulative
// `num_samples_to_trim_at_start` in the initial OBU headers. If the actual
// value is too small to account for codec delay, then encoding will fail.
// This is useful to closely represent the values in the initial OBUs (rarely
// needed).
//
// When `false` `samples_to_trim_at_start` will be incremented by the
// amount required for the codec delay and may not match the OBU header
// value. This is useful to closely represent the trimming applied to the
// original audio content (typically recommended).
//
// Typically the recommended settings are:
// - samples_to_trim_at_start_includes_codec_delay: false
// - samples_to_trim_at_start: 0
// These settings will automatically insert the correct codec delay to create
// valid initial frames and avoid trimming the original audio content.
optional bool samples_to_trim_at_start_includes_codec_delay = 10
[default = true];
optional uint32 samples_to_trim_at_start = 5;

// ID of the audio element whose substreams will be provided/mixed
Expand All @@ -139,5 +164,5 @@ message AudioFrameObuMetadata {
// e.g. "A0", "A13". Must be of the same length as `channel_ids`.
repeated string channel_labels = 8;

// Next ID: 10
// Next ID: 11
}
39 changes: 29 additions & 10 deletions iamf/cli/proto_to_obu/audio_frame_generator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ absl::Status GetEncodingDataAndInitializeEncoders(

// Validates that the user requested number of samples to trim at start is
// enough to cover the delay that the encoder needs.
absl::Status ValidateUserStartTrim(
absl::Status ValidateUserStartTrimIncludesCodecDelay(
uint32_t user_samples_to_trim_at_start,
uint32_t& encoder_required_samples_to_delay) {
// Return an error. But obey the user when `-DNO_CHECK_ERROR` is set.
Expand Down Expand Up @@ -189,6 +189,7 @@ absl::Status InitializeSubstreamData(
const SubstreamIdLabelsMap& substream_id_to_labels,
const absl::flat_hash_map<uint32_t, std::unique_ptr<EncoderBase>>&
substream_id_to_encoder,
bool user_samples_to_trim_at_start_includes_codec_delay,
const uint32_t user_samples_to_trim_at_start,
absl::flat_hash_map<uint32_t, SubstreamData>&
substream_id_to_substream_data) {
Expand All @@ -206,8 +207,10 @@ absl::Status InitializeSubstreamData(

uint32_t encoder_required_samples_to_delay =
encoder_iter->second->GetNumberOfSamplesToDelayAtStart();
RETURN_IF_NOT_OK(ValidateUserStartTrim(user_samples_to_trim_at_start,
encoder_required_samples_to_delay));
if (user_samples_to_trim_at_start_includes_codec_delay) {
RETURN_IF_NOT_OK(ValidateUserStartTrimIncludesCodecDelay(
user_samples_to_trim_at_start, encoder_required_samples_to_delay));
}

// Initialize a `SubstreamData` with virtual samples for any delay
// introduced by the encoder.
Expand Down Expand Up @@ -529,14 +532,17 @@ absl::Status EncodeFramesForAudioElement(
absl::Status ValidateSubstreamsShareTrimming(
const iamf_tools_cli_proto::AudioFrameObuMetadata& audio_frame_metadata,
bool common_samples_to_trim_at_end_includes_padding,
bool common_samples_to_trim_at_start_includes_codec_delay,
int64_t common_samples_to_trim_at_start,
int64_t common_samples_to_trim_at_end) {
if (audio_frame_metadata.samples_to_trim_at_end() !=
common_samples_to_trim_at_end ||
audio_frame_metadata.samples_to_trim_at_start() !=
common_samples_to_trim_at_start ||
audio_frame_metadata.samples_to_trim_at_end_includes_padding() !=
common_samples_to_trim_at_end_includes_padding) {
common_samples_to_trim_at_end_includes_padding ||
audio_frame_metadata.samples_to_trim_at_start_includes_codec_delay() !=
common_samples_to_trim_at_start_includes_codec_delay) {
return absl::InvalidArgumentError(
"Expected all substreams to have the same trimming information");
}
Expand Down Expand Up @@ -644,6 +650,9 @@ absl::Status AudioFrameGenerator::Initialize() {
static_cast<int64_t>(first_audio_frame_metadata.samples_to_trim_at_end());
const bool common_samples_to_trim_at_end_includes_padding =
first_audio_frame_metadata.samples_to_trim_at_end_includes_padding();
const bool common_samples_to_trim_at_start_includes_codec_delay =
first_audio_frame_metadata
.samples_to_trim_at_start_includes_codec_delay();

for (const auto& [audio_element_id, audio_frame_metadata] :
audio_frame_metadata_) {
Expand All @@ -669,11 +678,12 @@ absl::Status AudioFrameGenerator::Initialize() {
substream_id_to_encoder_));

// Intermediate data for all substreams belonging to an Audio Element.
RETURN_IF_NOT_OK(
InitializeSubstreamData(audio_element_with_data.substream_id_to_labels,
substream_id_to_encoder_,
audio_frame_metadata.samples_to_trim_at_start(),
substream_id_to_substream_data_));
RETURN_IF_NOT_OK(InitializeSubstreamData(
audio_element_with_data.substream_id_to_labels,
substream_id_to_encoder_,
audio_frame_metadata.samples_to_trim_at_start_includes_codec_delay(),
audio_frame_metadata.samples_to_trim_at_start(),
substream_id_to_substream_data_));

// Validate that a `DemixingParamDefinition` is available if down-mixing
// is needed.
Expand All @@ -691,16 +701,25 @@ absl::Status AudioFrameGenerator::Initialize() {
// Validate the assumption that trimming is the same for all substreams.
RETURN_IF_NOT_OK(ValidateSubstreamsShareTrimming(
audio_frame_metadata, common_samples_to_trim_at_end_includes_padding,
common_samples_to_trim_at_start_includes_codec_delay,
common_samples_to_trim_at_start, common_samples_to_trim_at_end));

// Populate the map of trimming states with all substream ID.
for (const auto& [substream_id, labels] :
audio_element_with_data.substream_id_to_labels) {
// Add in the codec delay when it was not included in the user input.
const int64_t additional_samples_to_trim_at_start =
common_samples_to_trim_at_start_includes_codec_delay
? 0
: substream_id_to_encoder_[substream_id]
->GetNumberOfSamplesToDelayAtStart();
substream_id_to_trimming_state_[substream_id] = {
.increment_samples_to_trim_at_end_by_padding =
!audio_frame_metadata.samples_to_trim_at_end_includes_padding(),
.user_samples_left_to_trim_at_end = common_samples_to_trim_at_end,
.user_samples_left_to_trim_at_start = common_samples_to_trim_at_start,
.user_samples_left_to_trim_at_start =
common_samples_to_trim_at_start +
additional_samples_to_trim_at_start,
};
}
}
Expand Down
66 changes: 66 additions & 0 deletions iamf/cli/proto_to_obu/tests/audio_frame_generator_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ constexpr uint32_t kSampleRate = 48000;
constexpr uint32_t kAacNumSamplesPerFrame = 1024;
constexpr uint32_t kAacNumSamplesToTrimAtStart = 2048;

constexpr bool kSamplesToTrimAtStartIncludesCodecDelay = true;
constexpr bool kSamplesToTrimAtStartExcludesCodecDelay = false;

constexpr auto kFrame0L2EightSamples = std::to_array<InternalSampleType>(
{1 << 16, 2 << 16, 3 << 16, 4 << 16, 5 << 16, 6 << 16, 7 << 16, 8 << 16});
constexpr auto kFrame0R2EightSamples = std::to_array<InternalSampleType>(
Expand Down Expand Up @@ -580,6 +583,7 @@ TEST(AudioFrameGenerator, AllAudioElementsHaveMatchingTrimmingInformation) {
const uint32_t kCommonNumSamplesToTrimAtStart = 2;
const uint32_t kCommonNumSamplesToTrimAtEnd = 1;
const bool kCommonSamplesToTrimAtEndIncludesPadding = true;
const bool kCommonSamplesToTrimAtStartIncludesCodecDelay = true;
user_metadata.mutable_audio_frame_metadata(0)->set_samples_to_trim_at_start(
kCommonNumSamplesToTrimAtStart);
user_metadata.mutable_audio_frame_metadata(1)->set_samples_to_trim_at_start(
Expand All @@ -594,6 +598,12 @@ TEST(AudioFrameGenerator, AllAudioElementsHaveMatchingTrimmingInformation) {
user_metadata.mutable_audio_frame_metadata(1)
->set_samples_to_trim_at_end_includes_padding(
kCommonSamplesToTrimAtEndIncludesPadding);
// Both audio elements must agree on whether `samples_to_trim_at_start`
// already includes the codec delay, so set the same flag on each of them.
user_metadata.mutable_audio_frame_metadata(0)
    ->set_samples_to_trim_at_start_includes_codec_delay(
        kCommonSamplesToTrimAtStartIncludesCodecDelay);
user_metadata.mutable_audio_frame_metadata(1)
    ->set_samples_to_trim_at_start_includes_codec_delay(
        kCommonSamplesToTrimAtStartIncludesCodecDelay);

std::list<AudioFrameWithData> audio_frames;
GenerateAudioFrameWithEightSamplesExpectOk(user_metadata, audio_frames);
Expand Down Expand Up @@ -652,6 +662,22 @@ TEST(AudioFrameGenerator,
ExpectAudioFrameGeneratorInitializeIsNotOk(user_metadata);
}

TEST(AudioFrameGenerator,
     ErrorAudioElementsMustHaveSameSamplesToTrimAtStartIncludesCodecDelay) {
  // Start from the one-stereo-substream configuration and add a second stereo
  // audio element, so that two audio frame metadata entries are present.
  iamf_tools_cli_proto::UserMetadata user_metadata = {};
  ConfigureOneStereoSubstreamLittleEndian(user_metadata);
  AddStereoAudioElementAndAudioFrameMetadata(
      user_metadata, kSecondAudioElementId, kSecondSubstreamId);

  // Make the two entries disagree on whether `samples_to_trim_at_start`
  // already includes the codec delay.
  auto* first_metadata = user_metadata.mutable_audio_frame_metadata(0);
  first_metadata->set_samples_to_trim_at_start_includes_codec_delay(false);
  auto* second_metadata = user_metadata.mutable_audio_frame_metadata(1);
  second_metadata->set_samples_to_trim_at_start_includes_codec_delay(true);

  // The mismatch is expected to be rejected when the generator initializes.
  ExpectAudioFrameGeneratorInitializeIsNotOk(user_metadata);
}

TEST(AudioFrameGenerator, NumSamplesToTrimAtEndWithPaddedFrames) {
iamf_tools_cli_proto::UserMetadata user_metadata = {};
ConfigureOneStereoSubstreamLittleEndian(user_metadata);
Expand Down Expand Up @@ -842,6 +868,9 @@ TEST(AudioFrameGenerator, EncodingSucceedsWithFullFramesTrimmedAtStart) {
*user_metadata.mutable_codec_config_metadata()->Add());
AddStereoAudioElementAndAudioFrameMetadata(
user_metadata, kFirstAudioElementId, kFirstSubstreamId);
user_metadata.mutable_audio_frame_metadata(0)
->set_samples_to_trim_at_start_includes_codec_delay(
kSamplesToTrimAtStartIncludesCodecDelay);
user_metadata.mutable_audio_frame_metadata(0)->set_samples_to_trim_at_start(
kAacNumSamplesToTrimAtStart);
user_metadata.mutable_audio_frame_metadata(0)
Expand Down Expand Up @@ -874,6 +903,9 @@ TEST(AudioFrameGenerator, TrimsAdditionalSamplesAtStart) {
*user_metadata.mutable_codec_config_metadata()->Add());
AddStereoAudioElementAndAudioFrameMetadata(
user_metadata, kFirstAudioElementId, kFirstSubstreamId);
user_metadata.mutable_audio_frame_metadata(0)
->set_samples_to_trim_at_start_includes_codec_delay(
kSamplesToTrimAtStartIncludesCodecDelay);
user_metadata.mutable_audio_frame_metadata(0)->set_samples_to_trim_at_start(
kNumSamplesToTrimAtStart);
user_metadata.mutable_audio_frame_metadata(0)
Expand All @@ -895,6 +927,40 @@ TEST(AudioFrameGenerator, TrimsAdditionalSamplesAtStart) {
kNumSamplesToTrimAtStart);
}

TEST(AudioFrameGenerator, AddsCodecDelayToSamplesToTrimAtStartWhenRequested) {
  // The user asks for a single sample to be trimmed, on top of the codec
  // delay.
  constexpr uint32_t kUserRequestedTrimAtStart = 1;

  iamf_tools_cli_proto::UserMetadata user_metadata = {};
  ConfigureAacCodecConfigMetadata(
      *user_metadata.mutable_codec_config_metadata()->Add());
  AddStereoAudioElementAndAudioFrameMetadata(
      user_metadata, kFirstAudioElementId, kFirstSubstreamId);

  // Mark the start trim as excluding the codec delay, so the generator is
  // expected to add the delay itself.
  auto* frame_metadata = user_metadata.mutable_audio_frame_metadata(0);
  frame_metadata->set_samples_to_trim_at_start_includes_codec_delay(
      kSamplesToTrimAtStartExcludesCodecDelay);
  frame_metadata->set_samples_to_trim_at_start(kUserRequestedTrimAtStart);
  frame_metadata->set_samples_to_trim_at_end_includes_padding(false);

  std::list<AudioFrameWithData> audio_frames;
  GenerateAudioFrameWithEightSamplesExpectOk(user_metadata, audio_frames);

  uint32_t ignored_trim_at_end = 0;
  uint32_t cumulative_trim_at_start = 0;
  ASSERT_THAT(ValidateAndGetCommonTrim(kAacNumSamplesPerFrame, audio_frames,
                                       ignored_trim_at_end,
                                       cumulative_trim_at_start),
              IsOk());

  // The cumulative trim observed in the generated frames should be the codec
  // delay plus the trim the user asked for.
  EXPECT_EQ(cumulative_trim_at_start,
            kAacNumSamplesToTrimAtStart + kUserRequestedTrimAtStart);
}

TEST(AudioFrameGenerator, InitFailsWithTooFewSamplesToTrimAtStart) {
const uint32_t kInvalidNumSamplesToTrimAtStart =
kAacNumSamplesToTrimAtStart - 1;
Expand Down
6 changes: 4 additions & 2 deletions iamf/cli/textproto_templates/1OA_and_stereo_opus.textproto
Original file line number Diff line number Diff line change
Expand Up @@ -153,17 +153,19 @@ audio_frame_metadata: [
{
wav_filename: "TEMPLATE_FOA.wav"
samples_to_trim_at_end_includes_padding: false
samples_to_trim_at_start_includes_codec_delay: false
samples_to_trim_at_end: 0
samples_to_trim_at_start: 312
samples_to_trim_at_start: 0
audio_element_id: 300
channel_ids: [0, 1, 2, 3]
channel_labels: ["A0", "A1", "A2", "A3"]
},
{
wav_filename: "TEMPLATE_stereo.wav"
samples_to_trim_at_end_includes_padding: false
samples_to_trim_at_start_includes_codec_delay: false
samples_to_trim_at_end: 0
samples_to_trim_at_start: 312
samples_to_trim_at_start: 0
audio_element_id: 301
channel_ids: [0, 1]
channel_labels: ["L2", "R2"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ audio_frame_metadata: [
{
wav_filename: "TEMPLATE_FOA.wav"
samples_to_trim_at_end_includes_padding: false
samples_to_trim_at_start_includes_codec_delay: false
samples_to_trim_at_end: 0
samples_to_trim_at_start: 0
audio_element_id: 300
Expand All @@ -158,6 +159,7 @@ audio_frame_metadata: [
{
wav_filename: "TEMPLATE_stereo.wav"
samples_to_trim_at_end_includes_padding: false
samples_to_trim_at_start_includes_codec_delay: false
samples_to_trim_at_end: 0
samples_to_trim_at_start: 0
audio_element_id: 301
Expand Down
3 changes: 2 additions & 1 deletion iamf/cli/textproto_templates/1OA_opus.textproto
Original file line number Diff line number Diff line change
Expand Up @@ -109,8 +109,9 @@ mix_presentation_metadata {
audio_frame_metadata {
wav_filename: "TEMPLATE_FOA.wav"
samples_to_trim_at_end_includes_padding: false
samples_to_trim_at_start_includes_codec_delay: false
samples_to_trim_at_end: 0
samples_to_trim_at_start: 312
samples_to_trim_at_start: 0
audio_element_id: 300
channel_ids: [0, 1, 2, 3]
channel_labels: ["A0", "A1", "A2", "A3"]
Expand Down
1 change: 1 addition & 0 deletions iamf/cli/textproto_templates/1OA_pcm24bit.textproto
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ mix_presentation_metadata {
audio_frame_metadata {
wav_filename: "TEMPLATE_FOA.wav"
samples_to_trim_at_end_includes_padding: false
samples_to_trim_at_start_includes_codec_delay: false
samples_to_trim_at_end: 0
samples_to_trim_at_start: 0
audio_element_id: 300
Expand Down
6 changes: 4 additions & 2 deletions iamf/cli/textproto_templates/3OA_and_stereo_opus.textproto
Original file line number Diff line number Diff line change
Expand Up @@ -177,17 +177,19 @@ audio_frame_metadata: [
{
wav_filename: "TEMPLATE_TOA.wav"
samples_to_trim_at_end_includes_padding: false
samples_to_trim_at_start_includes_codec_delay: false
samples_to_trim_at_end: 0
samples_to_trim_at_start: 312
samples_to_trim_at_start: 0
audio_element_id: 300
channel_ids: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
channel_labels: ["A0", "A1", "A2", "A3", "A4", "A5", "A6", "A7", "A8", "A9", "A10", "A11", "A12", "A13", "A14", "A15"]
},
{
wav_filename: "TEMPLATE_stereo.wav"
samples_to_trim_at_end_includes_padding: false
samples_to_trim_at_start_includes_codec_delay: false
samples_to_trim_at_end: 0
samples_to_trim_at_start: 312
samples_to_trim_at_start: 0
audio_element_id: 301
channel_ids: [0, 1]
channel_labels: ["L2", "R2"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ audio_frame_metadata: [
{
wav_filename: "TEMPLATE_TOA.wav"
samples_to_trim_at_end_includes_padding: false
samples_to_trim_at_start_includes_codec_delay: false
samples_to_trim_at_end: 0
samples_to_trim_at_start: 0
audio_element_id: 300
Expand All @@ -182,6 +183,7 @@ audio_frame_metadata: [
{
wav_filename: "TEMPLATE_stereo.wav"
samples_to_trim_at_end_includes_padding: false
samples_to_trim_at_start_includes_codec_delay: false
samples_to_trim_at_end: 0
samples_to_trim_at_start: 0
audio_element_id: 301
Expand Down
3 changes: 2 additions & 1 deletion iamf/cli/textproto_templates/3OA_opus.textproto
Original file line number Diff line number Diff line change
Expand Up @@ -133,8 +133,9 @@ mix_presentation_metadata {
audio_frame_metadata {
wav_filename: "TEMPLATE_TOA.wav"
samples_to_trim_at_end_includes_padding: false
samples_to_trim_at_start_includes_codec_delay: false
samples_to_trim_at_end: 0
samples_to_trim_at_start: 312
samples_to_trim_at_start: 0
audio_element_id: 300
channel_ids: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
channel_labels: ["A0", "A1", "A2", "A3", "A4", "A5", "A6", "A7", "A8", "A9", "A10", "A11", "A12", "A13", "A14", "A15"]
Expand Down
1 change: 1 addition & 0 deletions iamf/cli/textproto_templates/3OA_pcm24bit.textproto
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ mix_presentation_metadata {
audio_frame_metadata {
wav_filename: "TEMPLATE_TOA.wav"
samples_to_trim_at_end_includes_padding: false
samples_to_trim_at_start_includes_codec_delay: false
samples_to_trim_at_end: 0
samples_to_trim_at_start: 0
audio_element_id: 300
Expand Down
3 changes: 2 additions & 1 deletion iamf/cli/textproto_templates/5dot1_opus.textproto
Original file line number Diff line number Diff line change
Expand Up @@ -127,8 +127,9 @@ mix_presentation_metadata {
audio_frame_metadata {
wav_filename: "TEMPLATE_5dot1.wav"
samples_to_trim_at_end_includes_padding: false
samples_to_trim_at_start_includes_codec_delay: false
samples_to_trim_at_end: 0
samples_to_trim_at_start: 312
samples_to_trim_at_start: 0
audio_element_id: 300
channel_ids: [0, 1, 2, 3, 4, 5]
channel_labels: ["L5", "R5", "C", "LFE", "Ls5", "Rs5"]
Expand Down
Loading

0 comments on commit f4e6706

Please sign in to comment.