From 44ad28ebf4a17779e483971e80c260c74e972244 Mon Sep 17 00:00:00 2001
From: Cameron Gutman <aicommander@gmail.com>
Date: Tue, 10 Jan 2023 13:52:15 -0600
Subject: [PATCH] Fix a reference leak of hw_frames_ctx and prepare for QSV
 (#736)

---
 src/platform/common.h                   |  9 ++-
 src/platform/linux/cuda.cpp             | 17 ++---
 src/platform/linux/vaapi.cpp            | 11 ++--
 src/platform/macos/nv12_zero_device.cpp |  2 +-
 src/platform/macos/nv12_zero_device.h   |  2 +-
 src/platform/windows/display_vram.cpp   | 87 +++++++++++++++----------
 src/video.cpp                           | 33 +++++-----
 7 files changed, 92 insertions(+), 69 deletions(-)

diff --git a/src/platform/common.h b/src/platform/common.h
index fe074c5b..468df863 100644
--- a/src/platform/common.h
+++ b/src/platform/common.h
@@ -17,6 +17,8 @@
 
 struct sockaddr;
 struct AVFrame;
+struct AVBufferRef;
+struct AVHWFramesContext;
 
 // Forward declarations of boost classes to avoid having to include boost headers
 // here, which results in issues with Windows.h and WinSock2.h include order.
@@ -196,13 +198,18 @@ struct hwdevice_t {
   /**
    * implementations must take ownership of 'frame'
    */
-  virtual int set_frame(AVFrame *frame) {
+  virtual int set_frame(AVFrame *frame, AVBufferRef *hw_frames_ctx) {
     BOOST_LOG(error) << "Illegal call to hwdevice_t::set_frame(). Did you forget to override it?";
     return -1;
   };
 
   virtual void set_colorspace(std::uint32_t colorspace, std::uint32_t color_range) {};
 
+  /**
+   * Hook invoked on the hwframes context before it is initialized; implementations may adjust its parameters (default: no-op)
+   */
+  virtual void init_hwframes(AVHWFramesContext *frames) {};
+
   virtual ~hwdevice_t() = default;
 };
 
diff --git a/src/platform/linux/cuda.cpp b/src/platform/linux/cuda.cpp
index 963d7c35..9e3c95b4 100644
--- a/src/platform/linux/cuda.cpp
+++ b/src/platform/linux/cuda.cpp
@@ -94,20 +94,21 @@ public:
     return 0;
   }
 
-  int set_frame(AVFrame *frame) override {
+  int set_frame(AVFrame *frame, AVBufferRef *hw_frames_ctx) override {
     this->hwframe.reset(frame);
     this->frame = frame;
 
-    auto hwframe_ctx = (AVHWFramesContext *)frame->hw_frames_ctx->data;
+    auto hwframe_ctx = (AVHWFramesContext *)hw_frames_ctx->data;
     if(hwframe_ctx->sw_format != AV_PIX_FMT_NV12) {
       BOOST_LOG(error) << "cuda::cuda_t doesn't support any format other than AV_PIX_FMT_NV12"sv;
       return -1;
     }
 
-    if(av_hwframe_get_buffer(frame->hw_frames_ctx, frame, 0)) {
-      BOOST_LOG(error) << "Couldn't get hwframe for NVENC"sv;
-
-      return -1;
+    if(!frame->buf[0]) {
+      if(av_hwframe_get_buffer(hw_frames_ctx, frame, 0)) {
+        BOOST_LOG(error) << "Couldn't get hwframe for NVENC"sv;
+        return -1;
+      }
     }
 
     auto cuda_ctx = (AVCUDADeviceContext *)hwframe_ctx->device_ctx->hwctx;
@@ -180,8 +181,8 @@ public:
     return sws.load_ram(img, tex.array) || sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex_obj(tex), stream.get());
   }
 
-  int set_frame(AVFrame *frame) {
-    if(cuda_t::set_frame(frame)) {
+  int set_frame(AVFrame *frame, AVBufferRef *hw_frames_ctx) {
+    if(cuda_t::set_frame(frame, hw_frames_ctx)) {
       return -1;
     }
 
diff --git a/src/platform/linux/vaapi.cpp b/src/platform/linux/vaapi.cpp
index 9dcb2dfe..07f0d323 100644
--- a/src/platform/linux/vaapi.cpp
+++ b/src/platform/linux/vaapi.cpp
@@ -313,14 +313,15 @@ public:
     return 0;
   }
 
-  int set_frame(AVFrame *frame) override {
+  int set_frame(AVFrame *frame, AVBufferRef *hw_frames_ctx) override {
     this->hwframe.reset(frame);
     this->frame = frame;
 
-    if(av_hwframe_get_buffer(frame->hw_frames_ctx, frame, 0)) {
-      BOOST_LOG(error) << "Couldn't get hwframe for VAAPI"sv;
-
-      return -1;
+    if(!frame->buf[0]) {
+      if(av_hwframe_get_buffer(hw_frames_ctx, frame, 0)) {
+        BOOST_LOG(error) << "Couldn't get hwframe for VAAPI"sv;
+        return -1;
+      }
     }
 
     va::DRMPRIMESurfaceDescriptor prime;
diff --git a/src/platform/macos/nv12_zero_device.cpp b/src/platform/macos/nv12_zero_device.cpp
index 1af0e058..71e58307 100644
--- a/src/platform/macos/nv12_zero_device.cpp
+++ b/src/platform/macos/nv12_zero_device.cpp
@@ -53,7 +53,7 @@ int nv12_zero_device::convert(platf::img_t &img) {
   return result > 0 ? 0 : -1;
 }
 
-int nv12_zero_device::set_frame(AVFrame *frame) {
+int nv12_zero_device::set_frame(AVFrame *frame, AVBufferRef *hw_frames_ctx) {
   this->frame = frame;
 
   av_frame.reset(frame);
diff --git a/src/platform/macos/nv12_zero_device.h b/src/platform/macos/nv12_zero_device.h
index 3b74ebcc..1863fb0f 100644
--- a/src/platform/macos/nv12_zero_device.h
+++ b/src/platform/macos/nv12_zero_device.h
@@ -20,7 +20,7 @@ public:
   int init(void *display, resolution_fn_t resolution_fn, pixel_format_fn_t pixel_format_fn);
 
   int convert(img_t &img);
-  int set_frame(AVFrame *frame);
+  int set_frame(AVFrame *frame, AVBufferRef *hw_frames_ctx);
   void set_colorspace(std::uint32_t colorspace, std::uint32_t color_range);
 };
 
diff --git a/src/platform/windows/display_vram.cpp b/src/platform/windows/display_vram.cpp
index 4c0e8fa4..72ad6b7b 100644
--- a/src/platform/windows/display_vram.cpp
+++ b/src/platform/windows/display_vram.cpp
@@ -392,17 +392,63 @@ public:
     this->color_matrix = std::move(color_matrix);
   }
 
-  int set_frame(AVFrame *frame) {
+  void init_hwframes(AVHWFramesContext *frames) override {
+    // Adjust the frames context parameters; we may be called with a QSV or D3D11VA context
+    if(frames->device_ctx->type == AV_HWDEVICE_TYPE_D3D11VA) {
+      auto d3d11_frames = (AVD3D11VAFramesContext *)frames->hwctx;
+
+      // The encoder requires textures with D3D11_BIND_RENDER_TARGET set (no misc flags are requested)
+      d3d11_frames->BindFlags = D3D11_BIND_RENDER_TARGET;
+      d3d11_frames->MiscFlags = 0;
+    }
+
+    // We require a single texture, so request an initial pool size of 1 regardless of device type
+    frames->initial_pool_size = 1;
+  }
+
+  int set_frame(AVFrame *frame, AVBufferRef *hw_frames_ctx) override {
     this->hwframe.reset(frame);
     this->frame = frame;
 
+    // Populate this frame with a hardware buffer if one isn't there already
+    if(!frame->buf[0]) {
+      auto err = av_hwframe_get_buffer(hw_frames_ctx, frame, 0);
+      if(err) {
+        char err_str[AV_ERROR_MAX_STRING_SIZE] { 0 };
+        BOOST_LOG(error) << "Failed to get hwframe buffer: "sv << av_make_error_string(err_str, AV_ERROR_MAX_STRING_SIZE, err);
+        return -1;
+      }
+    }
+
+    // If this is a frame from a derived context, we'll need to map it to D3D11
+    ID3D11Texture2D *frame_texture;
+    if(frame->format != AV_PIX_FMT_D3D11) {
+      frame_t d3d11_frame { av_frame_alloc() };
+
+      d3d11_frame->format = AV_PIX_FMT_D3D11;
+
+      auto err = av_hwframe_map(d3d11_frame.get(), frame, AV_HWFRAME_MAP_WRITE | AV_HWFRAME_MAP_OVERWRITE);
+      if(err) {
+        char err_str[AV_ERROR_MAX_STRING_SIZE] { 0 };
+        BOOST_LOG(error) << "Failed to map D3D11 frame: "sv << av_make_error_string(err_str, AV_ERROR_MAX_STRING_SIZE, err);
+        return -1;
+      }
+
+      // Get the texture from the mapped frame
+      frame_texture = (ID3D11Texture2D *)d3d11_frame->data[0];
+    }
+    else {
+      // Otherwise, we can just use the texture inside the original frame
+      frame_texture = (ID3D11Texture2D *)frame->data[0];
+    }
+
     auto out_width  = frame->width;
     auto out_height = frame->height;
 
     float in_width  = img.display->width;
     float in_height = img.display->height;
 
-    // // Ensure aspect ratio is maintained
+    // Ensure aspect ratio is maintained
     auto scalar       = std::fminf(out_width / in_width, out_height / in_height);
     auto out_width_f  = in_width * scalar;
     auto out_height_f = in_height * scalar;
@@ -414,21 +460,9 @@ public:
     outY_view  = D3D11_VIEWPORT { offsetX, offsetY, out_width_f, out_height_f, 0.0f, 1.0f };
     outUV_view = D3D11_VIEWPORT { offsetX / 2, offsetY / 2, out_width_f / 2, out_height_f / 2, 0.0f, 1.0f };
 
-    D3D11_TEXTURE2D_DESC t {};
-    t.Width            = out_width;
-    t.Height           = out_height;
-    t.MipLevels        = 1;
-    t.ArraySize        = 1;
-    t.SampleDesc.Count = 1;
-    t.Usage            = D3D11_USAGE_DEFAULT;
-    t.Format           = format;
-    t.BindFlags        = D3D11_BIND_RENDER_TARGET;
-
-    auto status = device->CreateTexture2D(&t, nullptr, &img.encoder_texture);
-    if(FAILED(status)) {
-      BOOST_LOG(error) << "Failed to create render target texture [0x"sv << util::hex(status).to_string_view() << ']';
-      return -1;
-    }
+    // The underlying frame pool owns the texture, so we must reference it for ourselves
+    frame_texture->AddRef();
+    img.encoder_texture.reset(frame_texture);
 
     img.width       = out_width;
     img.height      = out_height;
@@ -449,7 +483,7 @@ public:
       D3D11_RTV_DIMENSION_TEXTURE2D
     };
 
-    status = device->CreateRenderTargetView(img.encoder_texture.get(), &nv12_rt_desc, &nv12_Y_rt);
+    auto status = device->CreateRenderTargetView(img.encoder_texture.get(), &nv12_rt_desc, &nv12_Y_rt);
     if(FAILED(status)) {
       BOOST_LOG(error) << "Failed to create render target view [0x"sv << util::hex(status).to_string_view() << ']';
       return -1;
@@ -463,23 +497,6 @@ public:
       return -1;
     }
 
-    // Need to have something refcounted
-    if(!frame->buf[0]) {
-      frame->buf[0] = av_buffer_allocz(sizeof(AVD3D11FrameDescriptor));
-    }
-
-    auto desc     = (AVD3D11FrameDescriptor *)frame->buf[0]->data;
-    desc->texture = (ID3D11Texture2D *)img.data;
-    desc->index   = 0;
-
-    frame->data[0] = img.data;
-    frame->data[1] = 0;
-
-    frame->linesize[0] = img.row_pitch;
-
-    frame->height = img.height;
-    frame->width  = img.width;
-
     return 0;
   }
 
diff --git a/src/video.cpp b/src/video.cpp
index 1fd50e7e..cd13b0d2 100644
--- a/src/video.cpp
+++ b/src/video.cpp
@@ -71,7 +71,7 @@ util::Either<buffer_t, int> dxgi_make_hwdevice_ctx(platf::hwdevice_t *hwdevice_c
 util::Either<buffer_t, int> vaapi_make_hwdevice_ctx(platf::hwdevice_t *hwdevice_ctx);
 util::Either<buffer_t, int> cuda_make_hwdevice_ctx(platf::hwdevice_t *hwdevice_ctx);
 
-int hwframe_ctx(ctx_t &ctx, buffer_t &hwdevice, AVPixelFormat format);
+int hwframe_ctx(ctx_t &ctx, platf::hwdevice_t *hwdevice, buffer_t &hwdevice_ctx, AVPixelFormat format);
 
 class swdevice_t : public platf::hwdevice_t {
 public:
@@ -116,17 +116,16 @@ public:
     return 0;
   }
 
-  int set_frame(AVFrame *frame) {
+  int set_frame(AVFrame *frame, AVBufferRef *hw_frames_ctx) {
     this->frame = frame;
 
     // If it's a hwframe, allocate buffers for hardware
-    if(frame->hw_frames_ctx) {
+    if(hw_frames_ctx) {
       hw_frame.reset(frame);
 
-      if(av_hwframe_get_buffer(frame->hw_frames_ctx, frame, 0)) return -1;
+      if(av_hwframe_get_buffer(hw_frames_ctx, frame, 0)) return -1;
     }
-
-    if(!frame->hw_frames_ctx) {
+    else {
       sw_frame.reset(frame);
     }
 
@@ -181,9 +180,9 @@ public:
     return 0;
   }
 
-  int init(int in_width, int in_height, AVFrame *frame, AVPixelFormat format) {
+  int init(int in_width, int in_height, AVFrame *frame, AVPixelFormat format, bool hardware) {
     // If the device used is hardware, yet the image resides on main memory
-    if(frame->hw_frames_ctx) {
+    if(hardware) {
       sw_frame.reset(av_frame_alloc());
 
       sw_frame->width  = frame->width;
@@ -981,7 +980,7 @@ std::optional<session_t> make_session(const encoder_t &encoder, const config_t &
     }
 
     hwdevice_ctx = std::move(buf_or_error.left());
-    if(hwframe_ctx(ctx, hwdevice_ctx, sw_fmt)) {
+    if(hwframe_ctx(ctx, hwdevice.get(), hwdevice_ctx, sw_fmt)) {
       return std::nullopt;
     }
 
@@ -1063,17 +1062,12 @@ std::optional<session_t> make_session(const encoder_t &encoder, const config_t &
   frame->width  = ctx->width;
   frame->height = ctx->height;
 
-
-  if(hardware) {
-    frame->hw_frames_ctx = av_buffer_ref(ctx->hw_frames_ctx);
-  }
-
   std::shared_ptr<platf::hwdevice_t> device;
 
   if(!hwdevice->data) {
     auto device_tmp = std::make_unique<swdevice_t>();
 
-    if(device_tmp->init(width, height, frame.get(), sw_fmt)) {
+    if(device_tmp->init(width, height, frame.get(), sw_fmt, hardware)) {
       return std::nullopt;
     }
 
@@ -1083,7 +1077,7 @@ std::optional<session_t> make_session(const encoder_t &encoder, const config_t &
     device = std::move(hwdevice);
   }
 
-  if(device->set_frame(frame.release())) {
+  if(device->set_frame(frame.release(), ctx->hw_frames_ctx)) {
     return std::nullopt;
   }
 
@@ -1812,8 +1806,8 @@ int init() {
   return 0;
 }
 
-int hwframe_ctx(ctx_t &ctx, buffer_t &hwdevice, AVPixelFormat format) {
-  buffer_t frame_ref { av_hwframe_ctx_alloc(hwdevice.get()) };
+int hwframe_ctx(ctx_t &ctx, platf::hwdevice_t *hwdevice, buffer_t &hwdevice_ctx, AVPixelFormat format) {
+  buffer_t frame_ref { av_hwframe_ctx_alloc(hwdevice_ctx.get()) };
 
   auto frame_ctx               = (AVHWFramesContext *)frame_ref->data;
   frame_ctx->format            = ctx->pix_fmt;
@@ -1822,6 +1816,9 @@ int hwframe_ctx(ctx_t &ctx, buffer_t &hwdevice, AVPixelFormat format) {
   frame_ctx->width             = ctx->width;
   frame_ctx->initial_pool_size = 0;
 
+  // Allow the hwdevice to modify hwframe context parameters
+  hwdevice->init_hwframes(frame_ctx);
+
   if(auto err = av_hwframe_ctx_init(frame_ref.get()); err < 0) {
     return err;
   }