Use ffmpeg parser for H.264 (#431)

Fixes a number of things, including an LPMS crash, choppy video quality, green screens during rotation, inconsistent frame counts vs. software decoding, etc. We also apparently gained GPU support for MPEG-2 decoding. This is a massive change: we can no longer add outputs up front due to the FFmpeg hwaccel API, so we have to wait until we receive a decoded video frame in order to add outputs. This also means properly queuing up audio and draining everything in the same order.
livepeer · Jan 18, 2025 · 79e6dcf · 79e6dcf
1 parent 25cbb36
commit 79e6dcf
Show file tree

Hide file tree

Showing 12 changed files with 256 additions and 149 deletions.
diff --git a/data/bad-cuvid.ts b/data/bad-cuvid.ts
diff --git a/data/broken-h264-parser.ts b/data/broken-h264-parser.ts
diff --git a/ffmpeg/decoder.c b/ffmpeg/decoder.c
@@ -188,38 +188,17 @@ enum AVPixelFormat hw2pixfmt(AVCodecContext *ctx)
   return AV_PIX_FMT_NONE;
 }
 
-/**
- * Callback to negotiate the pixel format for AVCodecContext.
- */
-static enum AVPixelFormat get_hw_pixfmt(AVCodecContext *vc, const enum AVPixelFormat *pix_fmts)
+static enum AVPixelFormat get_hw_format(AVCodecContext *ctx,
+                                        const enum AVPixelFormat *pix_fmts)
 {
-  AVHWFramesContext *frames;
-  int ret = 0;
+  const enum AVPixelFormat *p;
+  const enum AVPixelFormat hw_pix_fmt = hw2pixfmt(ctx);
+
+  for (p = pix_fmts; *p != -1; p++) {
+    if (*p == hw_pix_fmt) return *p;
+  }
 
-  // XXX Ideally this would be auto initialized by the HW device ctx
-  //     However the initialization doesn't occur in time to set up filters
-  //     So we do it here. Also see avcodec_get_hw_frames_parameters
-  av_buffer_unref(&vc->hw_frames_ctx);
-  vc->hw_frames_ctx = av_hwframe_ctx_alloc(vc->hw_device_ctx);
-  if (!vc->hw_frames_ctx) LPMS_ERR(pixfmt_cleanup, "Unable to allocate hwframe context for decoding");
-
-  frames = (AVHWFramesContext*)vc->hw_frames_ctx->data;
-  frames->format = hw2pixfmt(vc);
-  frames->sw_format = vc->sw_pix_fmt;
-  frames->width = vc->width;
-  frames->height = vc->height;
-
-  // May want to allocate extra HW frames if we encounter samples where
-  // the defaults are insufficient. Raising this increases GPU memory usage
-  // For now, the defaults seems OK.
-  //vc->extra_hw_frames = 16 + 1; // H.264 max refs
-
-  ret = av_hwframe_ctx_init(vc->hw_frames_ctx);
-  if (AVERROR(ENOSYS) == ret) ret = lpms_ERR_INPUT_PIXFMT; // most likely
-  if (ret < 0) LPMS_ERR(pixfmt_cleanup, "Unable to initialize a hardware frame pool");
-  return frames->format;
-
-pixfmt_cleanup:
+  fprintf(stderr, "Failed to get HW surface format.\n");
   return AV_PIX_FMT_NONE;
 }
 
@@ -253,38 +232,6 @@ int open_audio_decoder(input_params *params, struct input_ctx *ctx)
   return ret;
 }
 
-char* get_hw_decoder(int ff_codec_id, int hw_type)
-{
-    switch (hw_type) {
-        case AV_HWDEVICE_TYPE_CUDA:
-            switch (ff_codec_id) {
-                case AV_CODEC_ID_H264:
-                    return "h264_cuvid";
-                case AV_CODEC_ID_HEVC:
-                    return "hevc_cuvid";
-                case AV_CODEC_ID_VP8:
-                    return "vp8_cuvid";
-                case AV_CODEC_ID_VP9:
-                    return "vp9_cuvid";
-                default:
-                    return "";
-            }
-        case AV_HWDEVICE_TYPE_MEDIACODEC:
-            switch (ff_codec_id) {
-                case AV_CODEC_ID_H264:
-                    return "h264_ni_dec";
-                case AV_CODEC_ID_HEVC:
-                    return "h265_ni_dec";
-                case AV_CODEC_ID_VP8:
-                    return "";
-                case AV_CODEC_ID_VP9:
-                    return "";
-                default:
-                    return "";
-            }
-    }
-}
-
 int open_video_decoder(input_params *params, struct input_ctx *ctx)
 {
   int ret = 0;
@@ -298,14 +245,6 @@ int open_video_decoder(input_params *params, struct input_ctx *ctx)
     LPMS_WARN("No video stream found in input");
   } else {
     if (params->hw_type > AV_HWDEVICE_TYPE_NONE) {
-      char* decoder_name = get_hw_decoder(codec->id, params->hw_type);
-      if (!*decoder_name) {
-        ret = lpms_ERR_INPUT_CODEC;
-        LPMS_ERR(open_decoder_err, "Input codec does not support hardware acceleration");
-      }
-      const AVCodec *c = avcodec_find_decoder_by_name(decoder_name);
-      if (c) codec = c;
-      else LPMS_WARN("Nvidia decoder not found; defaulting to software");
       if (AV_PIX_FMT_YUV420P != ic->streams[ctx->vi]->codecpar->format &&
           AV_PIX_FMT_YUVJ420P != ic->streams[ctx->vi]->codecpar->format) {
         // TODO check whether the color range is truncated if yuvj420p is used
@@ -330,13 +269,19 @@ int open_video_decoder(input_params *params, struct input_ctx *ctx)
       ret = av_hwdevice_ctx_create(&ctx->hw_device_ctx, params->hw_type, params->device, NULL, 0);
       if (ret < 0) LPMS_ERR(open_decoder_err, "Unable to open hardware context for decoding")
       vc->hw_device_ctx = av_buffer_ref(ctx->hw_device_ctx);
-      vc->get_format = get_hw_pixfmt;
+      vc->get_format = get_hw_format;
     }
     ctx->hw_type = params->hw_type;
     vc->pkt_timebase = ic->streams[ctx->vi]->time_base;
     av_opt_set(vc->priv_data, "xcoder-params", ctx->xcoderParams, 0);
     ret = avcodec_open2(vc, codec, opts);
     if (ret < 0) LPMS_ERR(open_decoder_err, "Unable to open video decoder");
+    if (params->hw_type > AV_HWDEVICE_TYPE_NONE) {
+      if (AV_PIX_FMT_NONE == hw2pixfmt(vc)) {
+        ret = lpms_ERR_INPUT_CODEC;
+        LPMS_ERR(open_decoder_err, "Input codec does not support hardware acceleration");
+      }
+    }
   }
 
   return 0;

diff --git a/ffmpeg/decoder.h b/ffmpeg/decoder.h
@@ -66,7 +66,6 @@ enum AVPixelFormat hw2pixfmt(AVCodecContext *ctx);
 int open_input(input_params *params, struct input_ctx *ctx);
 int open_video_decoder(input_params *params, struct input_ctx *ctx);
 int open_audio_decoder(input_params *params, struct input_ctx *ctx);
-char* get_hw_decoder(int ff_codec_id, int hw_type);
 void free_input(struct input_ctx *inctx);
 
 // Utility functions

diff --git a/ffmpeg/encoder.c b/ffmpeg/encoder.c
@@ -224,7 +224,7 @@ int open_output(struct output_ctx *octx, struct input_ctx *ictx)
 
   // add video encoder if a decoder exists and this output requires one
   if (ictx->vc && needs_decoder(octx->video->name)) {
-    ret = init_video_filters(ictx, octx);
+    ret = init_video_filters(ictx, octx, NULL);
     if (ret < 0) LPMS_ERR(open_output_err, "Unable to open video filter");
 
     codec = avcodec_find_encoder_by_name(octx->video->name);
@@ -296,6 +296,8 @@ int open_output(struct output_ctx *octx, struct input_ctx *ictx)
     if (ret < 0) LPMS_ERR(open_output_err, "Unable to open signature filter");
   }
 
+  octx->initialized = 1;
+
   return 0;
 
 open_output_err:
@@ -521,7 +523,7 @@ int mux(AVPacket *pkt, AVRational tb, struct output_ctx *octx, AVStream *ost)
 static int calc_signature(AVFrame *inf, struct output_ctx *octx)
 {
   int ret = 0;
-  if (inf->hw_frames_ctx && octx->sf.hwframes && inf->hw_frames_ctx->data != octx->sf.hwframes) {
+  if (inf->hw_frames_ctx && octx->sf.hw_frames_ctx && inf->hw_frames_ctx->data != octx->sf.hw_frames_ctx->data) {
       free_filter(&octx->sf);
       ret = init_signature_filters(octx, inf);
       if (ret < 0) return lpms_ERR_FILTERS;

diff --git a/ffmpeg/ffmpeg_test.go b/ffmpeg/ffmpeg_test.go
@@ -2218,23 +2218,7 @@ func runRotationTests(t *testing.T, accel Acceleration) {
 		`
 	}
 
-	// TODO figure out why cpu/gpu are different
-	if accel == Nvidia {
-		cmd = cmd + `
-			cat <<-EOF1 > expected.dims
-				115 256,144
-				120 146,260
-				125 256,144
-			EOF1
-
-			cat <<-EOF2 > expected-30fps.dims
-				58 256,144
-				60 146,260
-				63 256,144
-			EOF2
-		`
-	} else {
-		cmd = cmd + `
+	cmd = cmd + `
 			cat <<-EOF1 > expected.dims
 				120 256,144
 				120 146,260
@@ -2246,10 +2230,7 @@ func runRotationTests(t *testing.T, accel Acceleration) {
 				60 146,260
 				61 256,144
 			EOF2
-		`
-	}
 
-	cmd = cmd + `
 		diff -u expected.dims out.dims
 		diff -u expected-30fps.dims out-30fps.dims
 	`
@@ -2299,9 +2280,7 @@ func runRotationTests(t *testing.T, accel Acceleration) {
 	}})
 	require.NoError(t, err)
 
-	// TODO figure out why nvidia is different; green screen?
-	if accel == Software {
-		cmd = `
+	cmd = `
 		cat out-test-0.ts  out-transposed.ts out-test-2.ts > out-test-concat.ts
 		ffprobe -show_entries frame=pts,pkt_dts,duration,pict_type,width,height -of csv out-test-concat.ts > out-test-concat.framedata
 
@@ -2317,8 +2296,7 @@ func runRotationTests(t *testing.T, accel Acceleration) {
 		# this does not line up
 		#diff -u out-test-concat-30fps.framedata out-double-rotated-30fps.framedata
 	`
-		run(cmd)
-	}
+	run(cmd)
 
 	// check single rotations
 	res, err = Transcode3(
@@ -2344,21 +2322,7 @@ func runRotationTests(t *testing.T, accel Acceleration) {
 		ffprobe -show_entries frame=height,width -of csv=p=0 out-single-rotated-30fps.ts | sed 's/,$//g' | uniq -c | sed 's/^ *//g' > single-out-30fps.dims
 	`
 
-	// TODO figure out why cpu/gpu are different
-	if accel == Nvidia {
-		cmd = cmd + `
-			cat <<-EOF1 > single-expected.dims
-				115 256,144
-				125 146,260
-			EOF1
-
-			cat <<-EOF2 > single-expected-30fps.dims
-				58 256,144
-				63 146,260
-			EOF2
-		`
-	} else {
-		cmd = cmd + `
+	cmd = cmd + `
 			cat <<-EOF1 > single-expected.dims
 				120 256,144
 				120 146,260
@@ -2368,10 +2332,7 @@ func runRotationTests(t *testing.T, accel Acceleration) {
 				60 256,144
 				61 146,260
 			EOF2
-		`
-	}
 
-	cmd = cmd + `
 		diff -u single-expected.dims single-out.dims
 		diff -u single-expected-30fps.dims single-out-30fps.dims
 	`

diff --git a/ffmpeg/filter.c b/ffmpeg/filter.c
@@ -47,7 +47,7 @@ int filtergraph_parser(struct filter_ctx *fctx, char* filters_descr, AVFilterInO
   return ret;
 }
 
-int init_video_filters(struct input_ctx *ictx, struct output_ctx *octx)
+int init_video_filters(struct input_ctx *ictx, struct output_ctx *octx, AVFrame *inf)
 {
     char args[512];
     int ret = 0;
@@ -92,8 +92,9 @@ int init_video_filters(struct input_ctx *ictx, struct output_ctx *octx)
     if (ictx->vc && ictx->vc->hw_frames_ctx) {
       // XXX a bit problematic in that it's set before decoder is fully ready
       AVBufferSrcParameters *srcpar = av_buffersrc_parameters_alloc();
-      srcpar->hw_frames_ctx = ictx->vc->hw_frames_ctx;
-      vf->hwframes = ictx->vc->hw_frames_ctx->data;
+      AVBufferRef *hw_frames_ctx = inf && inf->hw_frames_ctx ? inf->hw_frames_ctx : ictx->vc->hw_frames_ctx;
+      srcpar->hw_frames_ctx = hw_frames_ctx;
+      av_buffer_replace(&vf->hw_frames_ctx, hw_frames_ctx);
       av_buffersrc_parameters_set(vf->src_ctx, srcpar);
       av_freep(&srcpar);
     }
@@ -243,13 +244,13 @@ int init_signature_filters(struct output_ctx *octx, AVFrame *inf)
     if (octx->vc && inf && inf->hw_frames_ctx) {
       AVBufferSrcParameters *srcpar = av_buffersrc_parameters_alloc();      
       srcpar->hw_frames_ctx = inf->hw_frames_ctx;
-      sf->hwframes = inf->hw_frames_ctx->data;
+      av_buffer_replace(&sf->hw_frames_ctx, inf->hw_frames_ctx);
       av_buffersrc_parameters_set(sf->src_ctx, srcpar);
       av_freep(&srcpar);
     } else if (octx->vc && octx->vc->hw_frames_ctx) {
       AVBufferSrcParameters *srcpar = av_buffersrc_parameters_alloc();
       srcpar->hw_frames_ctx = octx->vc->hw_frames_ctx;
-      sf->hwframes = octx->vc->hw_frames_ctx->data;
+      av_buffer_replace(&sf->hw_frames_ctx, octx->vc->hw_frames_ctx);
       av_buffersrc_parameters_set(sf->src_ctx, srcpar);
       av_freep(&srcpar);
     }
@@ -288,8 +289,8 @@ int filtergraph_write(AVFrame *inf, struct input_ctx *ictx, struct output_ctx *o
   // before the decoder is fully ready, and the decoder may change HW params
   // XXX: Unclear if this path is hit on all devices
   if (is_video && inf && (
-      (inf->hw_frames_ctx && filter->hwframes &&
-        inf->hw_frames_ctx->data != filter->hwframes) ||
+      (inf->hw_frames_ctx && filter->hw_frames_ctx &&
+        inf->hw_frames_ctx->data != filter->hw_frames_ctx->data) ||
       (filter->src_ctx->nb_outputs > 0 &&
         filter->src_ctx->outputs[0]->w != inf->width &&
         filter->src_ctx->outputs[0]->h != inf->height))) {
@@ -326,7 +327,7 @@ int filtergraph_write(AVFrame *inf, struct input_ctx *ictx, struct output_ctx *o
     ret = 0;
 
     free_filter(&octx->vf);
-    ret = init_video_filters(ictx, octx);
+    ret = init_video_filters(ictx, octx, inf);
     if (ret < 0) return lpms_ERR_FILTERS;
   }
 
@@ -411,5 +412,6 @@ void free_filter(struct filter_ctx *filter)
 {
   if (filter->frame) av_frame_free(&filter->frame);
   if (filter->graph) avfilter_graph_free(&filter->graph);
+  if (filter->hw_frames_ctx) av_buffer_unref(&filter->hw_frames_ctx);
   memset(filter, 0, sizeof(struct filter_ctx));
 }
diff --git a/ffmpeg/filter.h b/ffmpeg/filter.h
@@ -11,7 +11,7 @@ struct filter_ctx {
   AVFilterContext *sink_ctx;
   AVFilterContext *src_ctx;
 
-  uint8_t *hwframes; // GPU frame pool data
+  AVBufferRef *hw_frames_ctx; // GPU frame pool data
 
   // Input timebase for this filter
   AVRational time_base;
@@ -46,6 +46,7 @@ struct filter_ctx {
 };
 
 struct output_ctx {
+  int initialized;     // whether this output is ready
   char *fname;         // required output file name
   char *vfilters;      // required output video filters
   char *sfilters;      // required output signature filters
@@ -82,7 +83,7 @@ struct output_ctx {
   char *xcoderParams;
 };
 
-int init_video_filters(struct input_ctx *ictx, struct output_ctx *octx);
+int init_video_filters(struct input_ctx *ictx, struct output_ctx *octx, AVFrame *inf);
 int init_audio_filters(struct input_ctx *ictx, struct output_ctx *octx);
 int init_signature_filters(struct output_ctx *octx, AVFrame *inf);
 int filtergraph_write(AVFrame *inf, struct input_ctx *ictx, struct output_ctx *octx, struct filter_ctx *filter, int is_video);