Fix accuracy and latency issues with `FFmpegFrameGrabber.setVideoFrameNumber()` (pull bytedeco#1734)
anotherche authored Jan 13, 2022
1 parent 97aab27 commit 4269832
Showing 4 changed files with 62 additions and 30 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -1,4 +1,5 @@

+ * Fix accuracy and latency issues with `FFmpegFrameGrabber.setVideoFrameNumber()` ([pull #1734](https://github.com/bytedeco/javacv/pull/1734))
* Add new `Frame.pictType` field set to `I`, `P`, `B`, etc by `FFmpegFrameGrabber` ([pull #1730](https://github.com/bytedeco/javacv/pull/1730))
* Set metadata for `AVFrame.opaque` in `FFmpegFrameGrabber` with call to `av_frame_copy_props()` ([issue #1729](https://github.com/bytedeco/javacv/issues/1729))
* Add `charset` property to `FrameGrabber` and `FrameRecorder` to use for metadata from FFmpeg ([pull #1720](https://github.com/bytedeco/javacv/pull/1720))
78 changes: 51 additions & 27 deletions src/main/java/org/bytedeco/javacv/FFmpegFrameGrabber.java
@@ -385,6 +385,7 @@ static class SeekCallback extends Seek_Pointer_long_int {
private int samples_channels, samples_format, samples_rate;
private boolean frameGrabbed;
private Frame frame;
+ private int[] streams;

private volatile boolean started = false;

@@ -605,23 +606,23 @@ public double getVideoFrameRate() {
/** default override of super.setFrameNumber implies setting
* of a frame close to a video frame having that number */
@Override public void setFrameNumber(int frameNumber) throws Exception {
- if (hasVideo()) setTimestamp(Math.round(1000000L * frameNumber / getFrameRate()));
+ if (hasVideo()) setTimestamp((long)Math.floor(1000000L * frameNumber / getFrameRate()));
else super.frameNumber = frameNumber;
}

/** if there is video stream tries to seek to video frame with corresponding timestamp
* otherwise sets super.frameNumber only because frameRate==0 if there is no video stream */
public void setVideoFrameNumber(int frameNumber) throws Exception {
// best guess, AVSEEK_FLAG_FRAME has not been implemented in FFmpeg...
- if (hasVideo()) setVideoTimestamp(Math.round(1000000L * frameNumber / getFrameRate()));
+ if (hasVideo()) setVideoTimestamp((long)Math.floor(1000000L * frameNumber / getFrameRate()));
else super.frameNumber = frameNumber;
}

/** if there is audio stream tries to seek to audio frame with corresponding timestamp
* ignoring otherwise */
public void setAudioFrameNumber(int frameNumber) throws Exception {
// best guess, AVSEEK_FLAG_FRAME has not been implemented in FFmpeg...
- if (hasAudio()) setAudioTimestamp(Math.round(1000000L * frameNumber / getAudioFrameRate()));
+ if (hasAudio()) setAudioTimestamp((long)Math.floor(1000000L * frameNumber / getAudioFrameRate()));

}
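For context, each setter above converts a frame number to a microsecond timestamp before delegating to a timestamp-based seek; replacing `Math.round()` with `Math.floor()` keeps that timestamp at or before the exact frame boundary instead of occasionally rounding past it. A minimal usage sketch of the fixed path (the file name and frame number are assumptions, and the input is assumed to have more than 250 video frames):

```java
import org.bytedeco.javacv.FFmpegFrameGrabber;
import org.bytedeco.javacv.Frame;

public class SeekByFrameNumber {
    public static void main(String[] args) throws Exception {
        FFmpegFrameGrabber grabber = new FFmpegFrameGrabber("input.mp4");
        grabber.start();
        // Internally: setVideoTimestamp((long) Math.floor(1000000L * 250 / getFrameRate()))
        grabber.setVideoFrameNumber(250);
        Frame frame = grabber.grabImage(); // the video frame the seek landed on
        System.out.println("frame " + grabber.getFrameNumber()
                + " at " + grabber.getTimestamp() + " us, pictType=" + frame.pictType);
        grabber.stop();
        grabber.release();
    }
}
```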

@@ -755,9 +756,14 @@ else if (frameTypesToSeek.contains(Frame.Type.AUDIO)) {
else if (seekFrame.samples != null && samples_frame != null && getSampleRate() > 0) {
frameDuration = AV_TIME_BASE * samples_frame.nb_samples() / (double)getSampleRate();
}
+ // if(frameDuration>0.0) {
+ //     maxSeekSteps = (long)(10*(timestamp - initialSeekPosition - frameDuration)/frameDuration);
+ //     if (maxSeekSteps<0) maxSeekSteps = 0;
+ // }
if(frameDuration>0.0) {
-     maxSeekSteps = (long)(10*(timestamp - initialSeekPosition - frameDuration)/frameDuration);
-     if (maxSeekSteps<0) maxSeekSteps = 0;
+     maxSeekSteps = 0; //no more grab if the distance to the requested timestamp is smaller than frameDuration
+     if (timestamp - initialSeekPosition + 1 > frameDuration) //allow for a rounding error
+         maxSeekSteps = (long)(10*(timestamp - initialSeekPosition)/frameDuration);
}
else if (initialSeekPosition < timestamp) maxSeekSteps = 1000;
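The reworked bound above appears to be the latency-relevant change: when the container-level seek already landed within one frame of the requested timestamp (with a 1 us allowance for rounding), no further frames are grabbed at all, and otherwise the budget scales with the full remaining distance rather than the distance minus one frame. A small arithmetic sketch of the new bound, under an assumed 25 fps stream (frameDuration = 40000 us):

```java
public class SeekStepBound {
    static long maxSeekSteps(long timestamp, long initialSeekPosition, double frameDuration) {
        long steps = 0; // no more grabs if the target is less than one frame away
        if (timestamp - initialSeekPosition + 1 > frameDuration) // allow for a rounding error
            steps = (long) (10 * (timestamp - initialSeekPosition) / frameDuration);
        return steps;
    }

    public static void main(String[] args) {
        double frameDuration = 1_000_000.0 / 25.0; // 40000 us per frame at 25 fps
        // Seek landed 10 us short of the target: nothing more to grab.
        System.out.println(maxSeekSteps(1_000_000, 999_990, frameDuration)); // prints 0
        // Seek landed 200000 us (5 frames) early: allow up to 50 forward grabs.
        System.out.println(maxSeekSteps(1_000_000, 800_000, frameDuration)); // prints 50
    }
}
```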

@@ -768,7 +774,7 @@ else if (seekFrame.samples != null && samples_frame != null && getSampleRate() >
if (seekFrame == null) return; //is it better to throw NullPointerException?

count++;
- double ts=this.timestamp;
+ double ts=seekFrame.timestamp;
frameDuration = 0.0;
if (seekFrame.image != null && this.getFrameRate() > 0)
frameDuration = AV_TIME_BASE / (double)getFrameRate();
@@ -933,10 +939,12 @@ public synchronized void startUnsafe(boolean findStreamInfo) throws Exception {
video_st = audio_st = null;
AVCodecParameters video_par = null, audio_par = null;
int nb_streams = oc.nb_streams();
+ streams = new int[nb_streams];
for (int i = 0; i < nb_streams; i++) {
AVStream st = oc.streams(i);
// Get a pointer to the codec context for the video or audio stream
AVCodecParameters par = st.codecpar();
+ streams[i] = par.codec_type();
if (video_st == null && par.codec_type() == AVMEDIA_TYPE_VIDEO && (videoStream < 0 || videoStream == i)) {
video_st = st;
video_par = par;
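The new `streams` array snapshots each stream's codec type at startup so that `grabFrame()` can later classify packets from non-audio/video streams without consulting codec contexts. The same information can be listed with the FFmpeg presets the grabber uses internally; a sketch assuming an `input.ts` file with mixed streams:

```java
import org.bytedeco.ffmpeg.avformat.AVFormatContext;
import org.bytedeco.ffmpeg.avutil.AVDictionary;

import static org.bytedeco.ffmpeg.global.avformat.*;
import static org.bytedeco.ffmpeg.global.avutil.*;

public class ListStreamTypes {
    public static void main(String[] args) {
        AVFormatContext oc = new AVFormatContext(null);
        if (avformat_open_input(oc, "input.ts", null, null) < 0)
            throw new RuntimeException("Could not open input");
        avformat_find_stream_info(oc, (AVDictionary) null);

        int[] streams = new int[oc.nb_streams()];
        for (int i = 0; i < streams.length; i++) {
            streams[i] = oc.streams(i).codecpar().codec_type();
            String kind = streams[i] == AVMEDIA_TYPE_VIDEO ? "video"
                        : streams[i] == AVMEDIA_TYPE_AUDIO ? "audio"
                        : streams[i] == AVMEDIA_TYPE_SUBTITLE ? "subtitle"
                        : streams[i] == AVMEDIA_TYPE_ATTACHMENT ? "attachment" : "data/other";
            System.out.println("stream " + i + ": " + kind);
        }
        avformat_close_input(oc);
    }
}
```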
@@ -1294,7 +1302,7 @@ public synchronized Frame grabFrame(boolean doAudio, boolean doVideo, boolean do

if (oc == null || oc.isNull()) {
throw new Exception("Could not grab: No AVFormatContext. (Has start() been called?)");
- } else if ((!doVideo || video_st == null) && (!doAudio || audio_st == null)) {
+ } else if ((!doVideo || video_st == null) && (!doAudio || audio_st == null) && !doData) {
return null;
}
if (!started) {
@@ -1303,19 +1311,8 @@

boolean videoFrameGrabbed = frameGrabbed && frame.image != null;
boolean audioFrameGrabbed = frameGrabbed && frame.samples != null;
+ boolean dataFrameGrabbed = frameGrabbed && frame.data != null;
frameGrabbed = false;
- frame.keyFrame = false;
- frame.imageWidth = 0;
- frame.imageHeight = 0;
- frame.imageDepth = 0;
- frame.imageChannels = 0;
- frame.imageStride = 0;
- frame.image = null;
- frame.sampleRate = 0;
- frame.audioChannels = 0;
- frame.samples = null;
- frame.data = null;
- frame.opaque = null;
if (doVideo && videoFrameGrabbed) {
if (doProcessing) {
processImage();
@@ -1328,7 +1325,24 @@ public synchronized Frame grabFrame(boolean doAudio, boolean doVideo, boolean do
}
frame.keyFrame = samples_frame.key_frame() != 0;
return frame;
+ } else if (doData && dataFrameGrabbed) {
+     return frame;
+ }
+
+ frame.keyFrame = false;
+ frame.imageWidth = 0;
+ frame.imageHeight = 0;
+ frame.imageDepth = 0;
+ frame.imageChannels = 0;
+ frame.imageStride = 0;
+ frame.image = null;
+ frame.sampleRate = 0;
+ frame.audioChannels = 0;
+ frame.samples = null;
+ frame.data = null;
+ frame.opaque = null;
+ frame.type = null;

boolean done = false;
boolean readPacket = pkt.stream_index() == -1;
while (!done) {
@@ -1355,7 +1369,7 @@ public synchronized Frame grabFrame(boolean doAudio, boolean doVideo, boolean do
frame.streamIndex = pkt.stream_index();

// Is this a packet from the video stream?
- if (doVideo && video_st != null && pkt.stream_index() == video_st.index()
+ if (doVideo && video_st != null && frame.streamIndex == video_st.index()
&& (!keyFrames || pkt.flags() == AV_PKT_FLAG_KEY)) {
// Decode video frame
if (readPacket) {
@@ -1393,7 +1407,7 @@ public synchronized Frame grabFrame(boolean doAudio, boolean doVideo, boolean do
AVRational time_base = video_st.time_base();
timestamp = 1000000L * pts * time_base.num() / time_base.den();
// best guess, AVCodecContext.frame_number = number of decoded frames...
- frameNumber = (int)Math.round(timestamp * getFrameRate() / 1000000L);
+ frameNumber = (int)Math.floor(timestamp * getFrameRate() / 1000000L);
frame.image = image_buf;
if (doProcessing) {
processImage();
@@ -1404,9 +1418,10 @@ public synchronized Frame grabFrame(boolean doAudio, boolean doVideo, boolean do
frame.timestamp = timestamp;
frame.keyFrame = picture.key_frame() != 0;
frame.pictType = (char)av_get_picture_type_char(picture.pict_type());
+ frame.type = Frame.Type.VIDEO;
}
}
- } else if (doAudio && audio_st != null && pkt.stream_index() == audio_st.index()) {
+ } else if (doAudio && audio_st != null && frame.streamIndex == audio_st.index()) {
// Decode audio frame
if (readPacket) {
ret = avcodec_send_packet(audio_c, pkt);
@@ -1440,15 +1455,24 @@ public synchronized Frame grabFrame(boolean doAudio, boolean doVideo, boolean do
done = true;
frame.timestamp = timestamp;
frame.keyFrame = samples_frame.key_frame() != 0;
+ frame.type = Frame.Type.AUDIO;
}
- } else if (doData) {
-     if (!readPacket) {
-         readPacket = true;
-         continue;
-     }
+ } else if (readPacket && doData
+         && frame.streamIndex > -1 && frame.streamIndex < streams.length
+         && streams[frame.streamIndex] != AVMEDIA_TYPE_VIDEO && streams[frame.streamIndex] != AVMEDIA_TYPE_AUDIO) {
+     // Export the stream byte data for non audio / video frames
+     frame.data = pkt.data().position(0).capacity(pkt.size()).asByteBuffer();
+     frame.opaque = pkt;
+     done = true;
+     switch (streams[frame.streamIndex]) {
+         case AVMEDIA_TYPE_DATA: frame.type = Frame.Type.DATA; break;
+         case AVMEDIA_TYPE_SUBTITLE: frame.type = Frame.Type.SUBTITLE; break;
+         case AVMEDIA_TYPE_ATTACHMENT: frame.type = Frame.Type.ATTACHMENT; break;
+         default: frame.type = null;
+     }
} else {
// Current packet is not needed (different stream index required)
readPacket = true;
}
}
return frame;
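Together these changes let callers pull subtitle, data, and attachment packets through the normal grab loop and dispatch on the new `frame.type` field. A usage sketch (the file name is an assumption; the five-argument `grabFrame` overload is the one this diff modifies, with `doData` as the last parameter):

```java
import org.bytedeco.javacv.FFmpegFrameGrabber;
import org.bytedeco.javacv.Frame;

public class DispatchOnFrameType {
    public static void main(String[] args) throws Exception {
        FFmpegFrameGrabber grabber = new FFmpegFrameGrabber("input.mkv");
        grabber.start();
        Frame frame;
        // Arguments: doAudio, doVideo, doProcessing, keyFrames, doData
        while ((frame = grabber.grabFrame(true, true, true, false, true)) != null) {
            if (frame.type == Frame.Type.VIDEO) {
                System.out.println("video " + grabber.getFrameNumber() + " pict=" + frame.pictType);
            } else if (frame.type == Frame.Type.AUDIO) {
                System.out.println("audio @ " + frame.timestamp + " us");
            } else if (frame.type != null) { // DATA, SUBTITLE, or ATTACHMENT
                System.out.println(frame.type + ": " + frame.data.remaining()
                        + " bytes from stream " + frame.streamIndex);
            }
        }
        grabber.stop();
        grabber.release();
    }
}
```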
2 changes: 1 addition & 1 deletion src/main/java/org/bytedeco/javacv/FFmpegFrameRecorder.java
@@ -1302,7 +1302,7 @@ private boolean record(AVFrame frame) throws Exception {

private void writePacket(int mediaType, AVPacket avPacket) throws Exception {

- AVStream avStream = (mediaType == AVMEDIA_TYPE_VIDEO) ? audio_st : (mediaType == AVMEDIA_TYPE_AUDIO) ? video_st : null;
+ AVStream avStream = (mediaType == AVMEDIA_TYPE_VIDEO) ? video_st : (mediaType == AVMEDIA_TYPE_AUDIO) ? audio_st : null;
String mediaTypeStr = (mediaType == AVMEDIA_TYPE_VIDEO) ? "video" : (mediaType == AVMEDIA_TYPE_AUDIO) ? "audio" : "unsupported media stream type";

synchronized (oc) {
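The one-line fix above corrects a swapped ternary that routed video packets to the audio stream and vice versa, which broke packet-level stream copying. A remux sketch exercising that path (file names are assumptions; it assumes the stream-copy overload `start(AVFormatContext)` and the `grabPacket()`/`recordPacket()` pair, the latter of which lands in `writePacket()`):

```java
import org.bytedeco.ffmpeg.avcodec.AVPacket;
import org.bytedeco.javacv.FFmpegFrameGrabber;
import org.bytedeco.javacv.FFmpegFrameRecorder;

public class Remux {
    public static void main(String[] args) throws Exception {
        FFmpegFrameGrabber grabber = new FFmpegFrameGrabber("input.mp4");
        grabber.start();

        FFmpegFrameRecorder recorder = new FFmpegFrameRecorder("output.mp4",
                grabber.getImageWidth(), grabber.getImageHeight(), grabber.getAudioChannels());
        recorder.setFormat("mp4");
        recorder.setFrameRate(grabber.getFrameRate());
        recorder.setSampleRate(grabber.getSampleRate());
        recorder.start(grabber.getFormatContext()); // copy codec parameters, no transcoding

        AVPacket packet;
        while ((packet = grabber.grabPacket()) != null) {
            recorder.recordPacket(packet); // video packets now reach video_st, audio packets audio_st
        }

        recorder.stop();
        recorder.release();
        grabber.stop();
        grabber.release();
    }
}
```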
11 changes: 9 additions & 2 deletions src/main/java/org/bytedeco/javacv/Frame.java
@@ -75,11 +75,13 @@ public class Frame implements AutoCloseable, Indexable {
DEPTH_FLOAT = 32,
DEPTH_DOUBLE = 64;

- /** Constants defining data type in the frame*/
+ /** Constants defining data type in the frame. */
public static enum Type {
VIDEO,
AUDIO,
- DATA
+ DATA,
+ SUBTITLE,
+ ATTACHMENT
}

/** Information associated with the {@link #image} field. */
@@ -104,6 +106,9 @@ public static enum Type {
/** Stream number the audio|video|other data is associated with. */
public int streamIndex;

+ /** The type of the stream. */
+ public Type type;
+
/** The underlying data object, for example, Pointer, AVFrame, IplImage, or Mat. */
public Object opaque;

@@ -132,6 +137,7 @@ public Frame(int width, int height, int depth, int channels, int imageStride) {
this.image = new Buffer[1];
this.data = null;
this.streamIndex = -1;
+ this.type = null;

Pointer pointer = new BytePointer(imageHeight * imageStride * pixelSize(depth));
ByteBuffer buffer = pointer.asByteBuffer();
@@ -222,6 +228,7 @@ public Frame clone() {
newFrame.keyFrame = keyFrame;
newFrame.pictType = pictType;
newFrame.streamIndex = streamIndex;
+ newFrame.type = type;
newFrame.opaque = new Pointer[3];
if (image != null) {
newFrame.image = new Buffer[image.length];
