From bb65eb62991e5165b9fad2702a8954a1fe3c6f1f Mon Sep 17 00:00:00 2001
From: Ben Chang <benc@nvidia.com>
Date: Sat, 24 Jun 2017 12:10:10 +0000
Subject: [PATCH 1/3] nvenc: Add an explicit auto alias

---
 libavcodec/nvenc_h264.c | 1 +
 libavcodec/nvenc_hevc.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/libavcodec/nvenc_h264.c b/libavcodec/nvenc_h264.c
index f7e5cd05dc..1c9e07a776 100644
--- a/libavcodec/nvenc_h264.c
+++ b/libavcodec/nvenc_h264.c
@@ -47,6 +47,7 @@ static const AVOption options[] = {
     { "high_444", "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_H264_PROFILE_HIGH_444 },            0, 0, VE, "profile" },
     { "constrained_high", "",                             0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_H264_PROFILE_CONSTRAINED_HIGH },    0, 0, VE, "profile" },
     { "level",    "Set the encoding level restriction",   OFFSET(level),       AV_OPT_TYPE_INT,    { .i64 = NV_ENC_LEVEL_AUTOSELECT }, NV_ENC_LEVEL_AUTOSELECT, NV_ENC_LEVEL_H264_51, VE, "level" },
+    { "auto",     "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_AUTOSELECT }, 0, 0, VE, "level" },
     { "1.0",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_1 },  0, 0, VE,  "level" },
     { "1.b",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_1b }, 0, 0, VE,  "level" },
     { "1.1",      "",                                     0,                   AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_H264_11 }, 0, 0, VE,  "level" },
diff --git a/libavcodec/nvenc_hevc.c b/libavcodec/nvenc_hevc.c
index 9102d8a3b3..a13db98356 100644
--- a/libavcodec/nvenc_hevc.c
+++ b/libavcodec/nvenc_hevc.c
@@ -47,6 +47,7 @@ static const AVOption options[] = {
     { "rext",   "",                                      0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_HEVC_PROFILE_REXT }, 0, 0, VE, "profile" },
 #endif /* NVENCAPI_MAJOR_VERSION >= 7 */
     { "level",   "Set the encoding level restriction",   OFFSET(level),        AV_OPT_TYPE_INT,    { .i64 = NV_ENC_LEVEL_AUTOSELECT }, NV_ENC_LEVEL_AUTOSELECT, NV_ENC_LEVEL_HEVC_62, VE, "level" },
+    { "auto",    "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_AUTOSELECT }, 0, 0, VE, "level" },
     { "1.0",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_1 },  0, 0, VE,  "level" },
     { "2.0",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_2 },  0, 0, VE,  "level" },
     { "2.1",     "",                                     0,                    AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_LEVEL_HEVC_21 }, 0, 0, VE,  "level" },

From 2e8d88ad5281ab923e1d9772eb99fdfb483435c2 Mon Sep 17 00:00:00 2001
From: Ben Chang <benc@nvidia.com>
Date: Sat, 24 Jun 2017 12:14:22 +0000
Subject: [PATCH 2/3] nvenc: Use a fifo to manage the free surface pool

Previously, if a session allocated x surfaces, only x-1 of them were
used (due to the combination of output delay and lock-toggle logic).
---
 libavcodec/nvenc.c | 23 ++++++++++++++---------
 libavcodec/nvenc.h |  3 +--
 2 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/libavcodec/nvenc.c b/libavcodec/nvenc.c
index f16e509df3..7b30ad768e 100644
--- a/libavcodec/nvenc.c
+++ b/libavcodec/nvenc.c
@@ -986,6 +986,7 @@ static int nvenc_alloc_surface(AVCodecContext *avctx, int idx)
 {
     NVENCContext *ctx               = avctx->priv_data;
     NV_ENCODE_API_FUNCTION_LIST *nv = &ctx->nvel.nvenc_funcs;
+    NVENCFrame *tmp_surface         = &ctx->frames[idx];
     int ret;
     NV_ENC_CREATE_BITSTREAM_BUFFER out_buffer = { 0 };
 
@@ -1046,6 +1047,8 @@ static int nvenc_alloc_surface(AVCodecContext *avctx, int idx)
 
     ctx->frames[idx].out  = out_buffer.bitstreamBuffer;
 
+    av_fifo_generic_write(ctx->unused_surface_queue, &tmp_surface, sizeof(tmp_surface), NULL);
+
     return 0;
 }
 
@@ -1066,6 +1069,9 @@ static int nvenc_setup_surfaces(AVCodecContext *avctx)
     ctx->timestamps = av_fifo_alloc(ctx->nb_surfaces * sizeof(int64_t));
     if (!ctx->timestamps)
         return AVERROR(ENOMEM);
+    ctx->unused_surface_queue = av_fifo_alloc(ctx->nb_surfaces * sizeof(NVENCFrame*));
+    if (!ctx->unused_surface_queue)
+        return AVERROR(ENOMEM);
     ctx->pending = av_fifo_alloc(ctx->nb_surfaces * sizeof(*ctx->frames));
     if (!ctx->pending)
         return AVERROR(ENOMEM);
@@ -1123,6 +1129,7 @@ av_cold int ff_nvenc_encode_close(AVCodecContext *avctx)
     av_fifo_free(ctx->timestamps);
     av_fifo_free(ctx->pending);
     av_fifo_free(ctx->ready);
+    av_fifo_free(ctx->unused_surface_queue);
 
     if (ctx->frames) {
         for (i = 0; i < ctx->nb_surfaces; ++i) {
@@ -1201,16 +1208,14 @@ av_cold int ff_nvenc_encode_init(AVCodecContext *avctx)
 
 static NVENCFrame *get_free_frame(NVENCContext *ctx)
 {
-    int i;
+    NVENCFrame *tmp_surf;
 
-    for (i = 0; i < ctx->nb_surfaces; i++) {
-        if (!ctx->frames[i].locked) {
-            ctx->frames[i].locked = 1;
-            return &ctx->frames[i];
-        }
-    }
+    if (!(av_fifo_size(ctx->unused_surface_queue) > 0))
+        // queue empty: no unused surface is available, so the caller gets NULL
+        return NULL;
 
-    return NULL;
+    av_fifo_generic_read(ctx->unused_surface_queue, &tmp_surf, sizeof(tmp_surf), NULL); // take the next unused surface pointer off the fifo
+    return tmp_surf;
 }
 
 static int nvenc_copy_frame(NV_ENC_LOCK_INPUT_BUFFER *in, const AVFrame *frame)
@@ -1510,7 +1515,7 @@ static int nvenc_get_output(AVCodecContext *avctx, AVPacket *pkt)
         frame->in = NULL;
     }
 
-    frame->locked = 0;
+    av_fifo_generic_write(ctx->unused_surface_queue, &frame, sizeof(frame), NULL);
 
     ret = nvenc_set_timestamp(avctx, &params, pkt);
     if (ret < 0)
diff --git a/libavcodec/nvenc.h b/libavcodec/nvenc.h
index 3602f16e83..b42b930920 100644
--- a/libavcodec/nvenc.h
+++ b/libavcodec/nvenc.h
@@ -56,7 +56,6 @@ typedef struct NVENCFrame {
 
     NV_ENC_OUTPUT_PTR out;
     NV_ENC_BUFFER_FORMAT format;
-    int locked;
 } NVENCFrame;
 
 typedef CUresult(CUDAAPI *PCUINIT)(unsigned int Flags);
@@ -145,7 +144,7 @@ typedef struct NVENCContext {
     int nb_surfaces;
     NVENCFrame *frames;
     AVFifoBuffer *timestamps;
-    AVFifoBuffer *pending, *ready;
+    AVFifoBuffer *pending, *ready, *unused_surface_queue;
 
     struct {
         CUdeviceptr ptr;

From 7cb053e4ddf258e2dbf52ccc586548680742d758 Mon Sep 17 00:00:00 2001
From: Ben Chang <benc@nvidia.com>
Date: Sat, 24 Jun 2017 12:17:14 +0000
Subject: [PATCH 3/3] nvenc: Minimize the surface allocation

The previous default set the number of allocated surfaces to 32 unless
it was overridden by the user or the lookahead parameter was set.

Change the surface calculation for the default, B-frame and lookahead scenarios.
---
 libavcodec/nvenc.c      | 45 ++++++++++++++++++++++++++++++++++++-----
 libavcodec/nvenc_h264.c |  4 ++--
 libavcodec/nvenc_hevc.c |  4 ++--
 3 files changed, 44 insertions(+), 9 deletions(-)

diff --git a/libavcodec/nvenc.c b/libavcodec/nvenc.c
index 7b30ad768e..884b344b96 100644
--- a/libavcodec/nvenc.c
+++ b/libavcodec/nvenc.c
@@ -874,6 +874,44 @@ static int nvenc_setup_codec_config(AVCodecContext *avctx)
     return 0;
 }
 
+static int nvenc_recalc_surfaces(AVCodecContext *avctx)
+{
+    NVENCContext *ctx = avctx->priv_data;
+    // Recompute ctx->nb_surfaces from frameIntervalP and rc_lookahead; the baseline is at least 4 surfaces.
+    // frameIntervalP is multiplied by 2 for the number of NVENCs on the GPU (hardcoded to 2),
+    // and by 2 again to avoid blocking the next PBB group.
+    int nb_surfaces = FFMAX(4, ctx->config.frameIntervalP * 2 * 2);
+
+    // lookahead enabled: use the larger of the computed and the configured count
+    if (ctx->rc_lookahead > 0) {
+        // +1 accounts for the lkd_bound calculation done later
+        // +4 allows sufficient pipelining with lookahead
+        nb_surfaces = FFMAX(1, FFMAX(nb_surfaces, ctx->rc_lookahead + ctx->config.frameIntervalP + 1 + 4));
+        if (nb_surfaces > ctx->nb_surfaces && ctx->nb_surfaces > 0) {
+            av_log(avctx, AV_LOG_WARNING,
+                "Defined rc_lookahead requires more surfaces, "
+                "increasing used surfaces %d -> %d\n",
+                ctx->nb_surfaces, nb_surfaces);
+        }
+        ctx->nb_surfaces = FFMAX(nb_surfaces, ctx->nb_surfaces);
+    } else {
+        if (ctx->config.frameIntervalP > 1 &&
+            ctx->nb_surfaces < nb_surfaces && ctx->nb_surfaces > 0) {
+            av_log(avctx, AV_LOG_WARNING,
+                "Defined b-frame requires more surfaces, "
+                "increasing used surfaces %d -> %d\n",
+                ctx->nb_surfaces, nb_surfaces);
+            ctx->nb_surfaces = FFMAX(ctx->nb_surfaces, nb_surfaces);
+        } else if (ctx->nb_surfaces <= 0)
+            ctx->nb_surfaces = nb_surfaces;
+        // otherwise keep the user-specified value
+    }
+
+    ctx->nb_surfaces = FFMAX(1, FFMIN(MAX_REGISTERED_FRAMES, ctx->nb_surfaces)); // clamp to [1, MAX_REGISTERED_FRAMES]
+    ctx->async_depth = FFMIN(ctx->async_depth, ctx->nb_surfaces - 1); // output delay is capped at nb_surfaces - 1
+    return 0;
+}
+
 static int nvenc_setup_encoder(AVCodecContext *avctx)
 {
     NVENCContext *ctx               = avctx->priv_data;
@@ -956,6 +994,8 @@ static int nvenc_setup_encoder(AVCodecContext *avctx)
     ctx->initial_pts[0] = AV_NOPTS_VALUE;
     ctx->initial_pts[1] = AV_NOPTS_VALUE;
 
+    nvenc_recalc_surfaces(avctx);
+
     nvenc_setup_rate_control(avctx);
 
     if (avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT) {
@@ -1057,11 +1097,6 @@ static int nvenc_setup_surfaces(AVCodecContext *avctx)
     NVENCContext *ctx = avctx->priv_data;
     int i, ret;
 
-    ctx->nb_surfaces = FFMAX(4 + avctx->max_b_frames,
-                             ctx->nb_surfaces);
-    ctx->async_depth = FFMIN(ctx->async_depth, ctx->nb_surfaces - 1);
-
-
     ctx->frames = av_mallocz_array(ctx->nb_surfaces, sizeof(*ctx->frames));
     if (!ctx->frames)
         return AVERROR(ENOMEM);
diff --git a/libavcodec/nvenc_h264.c b/libavcodec/nvenc_h264.c
index 1c9e07a776..bf983265df 100644
--- a/libavcodec/nvenc_h264.c
+++ b/libavcodec/nvenc_h264.c
@@ -72,14 +72,14 @@ static const AVOption options[] = {
     { "ll_2pass_quality", "Multi-pass optimized for image quality (only for low-latency presets)",       0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_2_PASS_QUALITY },       0, 0, VE, "rc" },
     { "ll_2pass_size",    "Multi-pass optimized for constant frame size (only for low-latency presets)", 0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_2_PASS_FRAMESIZE_CAP }, 0, 0, VE, "rc" },
     { "vbr_2pass",        "Multi-pass variable bitrate mode",                                            0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_2_PASS_VBR },           0, 0, VE, "rc" },
-    { "surfaces", "Number of concurrent surfaces",        OFFSET(nb_surfaces), AV_OPT_TYPE_INT,    { .i64 = 32 },                   0, INT_MAX, VE },
+    { "surfaces", "Number of concurrent surfaces",        OFFSET(nb_surfaces), AV_OPT_TYPE_INT,    { .i64 = 0 },                    0, MAX_REGISTERED_FRAMES, VE },
     { "device",   "Select a specific NVENC device",       OFFSET(device),      AV_OPT_TYPE_INT,    { .i64 = -1 },                   -2, INT_MAX, VE, "device" },
     { "any",      "Pick the first device available",      0,                   AV_OPT_TYPE_CONST,  { .i64 = ANY_DEVICE },           0, 0, VE, "device" },
     { "list",     "List the available devices",           0,                   AV_OPT_TYPE_CONST,  { .i64 = LIST_DEVICES },         0, 0, VE, "device" },
     { "async_depth", "Delay frame output by the given amount of frames", OFFSET(async_depth), AV_OPT_TYPE_INT, { .i64 = INT_MAX }, 0, INT_MAX, VE },
     { "delay",       "Delay frame output by the given amount of frames", OFFSET(async_depth), AV_OPT_TYPE_INT, { .i64 = INT_MAX }, 0, INT_MAX, VE },
 #if NVENCAPI_MAJOR_VERSION >= 7
-    { "rc-lookahead", "Number of frames to look ahead for rate-control", OFFSET(rc_lookahead), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, INT_MAX, VE },
+    { "rc-lookahead", "Number of frames to look ahead for rate-control", OFFSET(rc_lookahead), AV_OPT_TYPE_INT, { .i64 = 0 }, -1, INT_MAX, VE },
     { "no-scenecut", "When lookahead is enabled, set this to 1 to disable adaptive I-frame insertion at scene cuts", OFFSET(no_scenecut), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
     { "b_adapt", "When lookahead is enabled, set this to 0 to disable adaptive B-frame decision", OFFSET(b_adapt), AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 1, VE },
     { "spatial-aq", "set to 1 to enable Spatial AQ", OFFSET(aq), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
diff --git a/libavcodec/nvenc_hevc.c b/libavcodec/nvenc_hevc.c
index a13db98356..caf7c4add9 100644
--- a/libavcodec/nvenc_hevc.c
+++ b/libavcodec/nvenc_hevc.c
@@ -72,14 +72,14 @@ static const AVOption options[] = {
     { "ll_2pass_quality", "Multi-pass optimized for image quality (only for low-latency presets)",       0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_2_PASS_QUALITY },       0, 0, VE, "rc" },
     { "ll_2pass_size",    "Multi-pass optimized for constant frame size (only for low-latency presets)", 0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_2_PASS_FRAMESIZE_CAP }, 0, 0, VE, "rc" },
     { "vbr_2pass",        "Multi-pass variable bitrate mode",                                            0, AV_OPT_TYPE_CONST,  { .i64 = NV_ENC_PARAMS_RC_2_PASS_VBR },           0, 0, VE, "rc" },
-    { "surfaces", "Number of concurrent surfaces",        OFFSET(nb_surfaces), AV_OPT_TYPE_INT,    { .i64 = 32 },                   0, INT_MAX, VE },
+    { "surfaces", "Number of concurrent surfaces",        OFFSET(nb_surfaces), AV_OPT_TYPE_INT,    { .i64 = 0 },                    0, MAX_REGISTERED_FRAMES, VE },
     { "device",   "Select a specific NVENC device",       OFFSET(device),      AV_OPT_TYPE_INT,    { .i64 = -1 },                   -2, INT_MAX, VE, "device" },
     { "any",      "Pick the first device available",      0,                   AV_OPT_TYPE_CONST,  { .i64 = ANY_DEVICE },           0, 0, VE, "device" },
     { "list",     "List the available devices",           0,                   AV_OPT_TYPE_CONST,  { .i64 = LIST_DEVICES },         0, 0, VE, "device" },
     { "async_depth", "Delay frame output by the given amount of frames", OFFSET(async_depth), AV_OPT_TYPE_INT, { .i64 = INT_MAX }, 0, INT_MAX, VE },
     { "delay",       "Delay frame output by the given amount of frames", OFFSET(async_depth), AV_OPT_TYPE_INT, { .i64 = INT_MAX }, 0, INT_MAX, VE },
 #if NVENCAPI_MAJOR_VERSION >= 7
-    { "rc-lookahead", "Number of frames to look ahead for rate-control", OFFSET(rc_lookahead), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, INT_MAX, VE },
+    { "rc-lookahead", "Number of frames to look ahead for rate-control", OFFSET(rc_lookahead), AV_OPT_TYPE_INT, { .i64 = 0 }, -1, INT_MAX, VE },
     { "no-scenecut", "When lookahead is enabled, set this to 1 to disable adaptive I-frame insertion at scene cuts", OFFSET(no_scenecut), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
     { "spatial_aq", "set to 1 to enable Spatial AQ", OFFSET(aq), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },
     { "zerolatency", "Set 1 to indicate zero latency operation (no reordering delay)", OFFSET(zerolatency), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, VE },