Use an actual cuda kernel to convert RGB to NV12

2025-08-10 00:52:16 +00:00 · 2021-09-19 20:40:34 +02:00
parent e3f642ac25
commit fed329568c
7 changed files with 4007 additions and 39 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,15 +4,6 @@ project(Sunshine)

 set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)

-add_subdirectory(third-party/Simple-Web-Server)
-
-set(UPNPC_BUILD_SHARED OFF CACHE BOOL "no shared libraries")
-set(UPNPC_BUILD_TESTS OFF CACHE BOOL "Don't build tests for miniupnpc")
-set(UPNPC_BUILD_SAMPLE OFF CACHE BOOL "Don't build samples for miniupnpc")
-set(UPNPC_NO_INSTALL ON CACHE BOOL "Don't install any libraries build for miniupnpc")
-add_subdirectory(third-party/miniupnp/miniupnpc)
-include_directories(third-party/miniupnp)
-
 if(WIN32)
 	# Ugly hack to compile with #include <qos2.h>
 	add_compile_definitions(
@@ -21,9 +12,20 @@ if(WIN32)
 		QOS_NON_ADAPTIVE_FLOW=2)
 endif()
 add_subdirectory(third-party/moonlight-common-c/enet)
+add_subdirectory(third-party/Simple-Web-Server)
+add_subdirectory(third-party/cbs)
+
+set(UPNPC_BUILD_SHARED OFF CACHE BOOL "no shared libraries")
+set(UPNPC_BUILD_TESTS OFF CACHE BOOL "Don't build tests for miniupnpc")
+set(UPNPC_BUILD_SAMPLE OFF CACHE BOOL "Don't build samples for miniupnpc")
+set(UPNPC_NO_INSTALL ON CACHE BOOL "Don't install any libraries build for miniupnpc")
+add_subdirectory(third-party/miniupnp/miniupnpc)
+include_directories(third-party/miniupnp)

 find_package(Threads REQUIRED)
 find_package(OpenSSL REQUIRED)
+set(Boost_USE_STATIC_LIBS ON)
+find_package(Boost COMPONENTS log filesystem REQUIRED)

 list(APPEND SUNSHINE_COMPILE_OPTIONS -fPIC -Wall -Wno-missing-braces -Wno-maybe-uninitialized -Wno-sign-compare)

@@ -106,6 +108,11 @@ else()
 	option(SUNSHINE_ENABLE_X11 "Enable X11 grab if available" ON)
 	option(SUNSHINE_ENABLE_WAYLAND "Enable building wayland specific code" ON)

+	if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+  	set(CMAKE_CUDA_ARCHITECTURES 75)
+	endif()
+	enable_language(CUDA)
+
 	if(${SUNSHINE_ENABLE_X11})
 		find_package(X11)
 	else()
@@ -188,6 +195,7 @@ else()
 		sunshine/platform/linux/publish.cpp
 		sunshine/platform/linux/vaapi.h
 		sunshine/platform/linux/vaapi.cpp
+		sunshine/platform/linux/cuda.cu
 		sunshine/platform/linux/cuda.cpp
 		sunshine/platform/linux/cuda.h
 		sunshine/platform/linux/graphics.h
@@ -203,7 +211,8 @@ else()
 		third-party/glad/include/EGL/eglplatform.h
 		third-party/glad/include/KHR/khrplatform.h
 		third-party/glad/include/glad/gl.h
-		third-party/glad/include/glad/egl.h)
+		third-party/glad/include/glad/egl.h
+		third-party/nvfbc/NvFBC.h)
 		
 	list(APPEND PLATFORM_LIBRARIES
 		dl
@@ -215,7 +224,8 @@ else()
 	include_directories(
 		/usr/include/libevdev-1.0
 		third-party/nv-codec-headers/include
-		third-party/glad/include)
+		third-party/glad/include
+		third-party/nvfbc)

 	if(NOT DEFINED SUNSHINE_EXECUTABLE_PATH)
 		set(SUNSHINE_EXECUTABLE_PATH "sunshine")
@@ -224,11 +234,6 @@ else()
 	configure_file(sunshine.service.in sunshine.service @ONLY)
 endif()

-add_subdirectory(third-party/cbs)
-
-set(Boost_USE_STATIC_LIBS ON)
-find_package(Boost COMPONENTS log filesystem REQUIRED)
-
 set(SUNSHINE_TARGET_FILES
 	third-party/moonlight-common-c/reedsolomon/rs.c
 	third-party/moonlight-common-c/reedsolomon/rs.h
@@ -290,7 +295,7 @@ include_directories(

 string(TOUPPER "x${CMAKE_BUILD_TYPE}" BUILD_TYPE)
 if("${BUILD_TYPE}" STREQUAL "XDEBUG")
-	list(APPEND SUNSHINE_COMPILE_OPTIONS -O0 -pedantic -ggdb3)
+	list(APPEND SUNSHINE_COMPILE_OPTIONS -O0 -ggdb3)
 	if(WIN32)
 		set_source_files_properties(sunshine/nvhttp.cpp PROPERTIES COMPILE_FLAGS -O2)
 	endif()
--- a/sunshine/platform/linux/cuda.cpp
+++ b/sunshine/platform/linux/cuda.cpp
@@ -1,9 +1,4 @@
-#include "cuda.h"
-#include "graphics.h"
-#include "sunshine/main.h"
-#include "sunshine/utility.h"
-#include "wayland.h"
-#include "x11grab.h"
+#include <NvFBC.h>
 #include <ffnvcodec/dynlink_loader.h>

 extern "C" {
@@ -12,6 +7,13 @@ extern "C" {
 #include <libavutil/imgutils.h>
 }

+#include "cuda.h"
+#include "graphics.h"
+#include "sunshine/main.h"
+#include "sunshine/utility.h"
+#include "wayland.h"
+#include "x11grab.h"
+
 #define SUNSHINE_STRINGVIEW_HELPER(x) x##sv
 #define SUNSHINE_STRINGVIEW(x) SUNSHINE_STRINGVIEW_HELPER(x)

@@ -23,6 +25,13 @@ extern "C" {

 using namespace std::literals;
 namespace cuda {
+constexpr auto cudaDevAttrMaxThreadsPerBlock          = (CUdevice_attribute)1;
+constexpr auto cudaDevAttrMaxThreadsPerMultiProcessor = (CUdevice_attribute)39;
+
+void pass_error(const std::string_view &sv, const char *name, const char *description) {
+  BOOST_LOG(error) << sv << name << ':' << description;
+}
+
 void cff(CudaFunctions *cf) {
  cuda_free_functions(&cf);
 }
@@ -151,7 +160,7 @@ int init() {
  return 0;
 }

-class cuda_t : public platf::hwdevice_t {
+class opengl_t : public platf::hwdevice_t {
 public:
  int init(int in_width, int in_height, platf::x11::xdisplay_t::pointer xdisplay) {
    if(!cdf) {
@@ -273,16 +282,203 @@ public:
  int width, height;
 };

+class cuda_t : public platf::hwdevice_t {
+public:
+  ~cuda_t() override {
+    // sws_t needs to be destroyed while the context is active
+    if(sws) {
+      ctx_t ctx { cuda_ctx };
+
+      sws.reset();
+    }
+  }
+
+  int init(int in_width, int in_height) {
+    if(!cdf) {
+      BOOST_LOG(warning) << "cuda not initialized"sv;
+      return -1;
+    }
+
+    data = (void *)0x1;
+
+    width  = in_width;
+    height = in_height;
+
+    return 0;
+  }
+
+  int set_frame(AVFrame *frame) override {
+    this->hwframe.reset(frame);
+    this->frame = frame;
+
+    if(((AVHWFramesContext *)frame->hw_frames_ctx->data)->sw_format != AV_PIX_FMT_NV12) {
+      BOOST_LOG(error) << "cuda::cuda_t doesn't support any format other than AV_PIX_FMT_NV12"sv;
+      return -1;
+    }
+
+    if(av_hwframe_get_buffer(frame->hw_frames_ctx, frame, 0)) {
+      BOOST_LOG(error) << "Couldn't get hwframe for NVENC"sv;
+
+      return -1;
+    }
+
+    cuda_ctx = ((AVCUDADeviceContext *)((AVHWFramesContext *)frame->hw_frames_ctx->data)->device_ctx->hwctx)->cuda_ctx;
+
+    ctx_t ctx { cuda_ctx };
+    sws = sws_t::make(width * 4, height, frame->width, frame->height);
+
+    if(!sws) {
+      return -1;
+    }
+
+    return 0;
+  }
+
+  int convert(platf::img_t &img) override {
+    ctx_t ctx { cuda_ctx };
+
+    return sws->load_ram(img) || sws->convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1]);
+  }
+
+  void set_colorspace(std::uint32_t colorspace, std::uint32_t color_range) override {
+    ctx_t ctx { cuda_ctx };
+    sws->set_colorspace(colorspace, color_range);
+  }
+
+  frame_t hwframe;
+
+  std::unique_ptr<sws_t> sws;
+
+  int width, height;
+
+  CUcontext cuda_ctx;
+};
+
 std::shared_ptr<platf::hwdevice_t> make_hwdevice(int width, int height, platf::x11::xdisplay_t::pointer xdisplay) {
  if(init()) {
    return nullptr;
  }

  auto cuda = std::make_shared<cuda_t>();
-  if(cuda->init(width, height, xdisplay)) {
+  if(cuda->init(width, height)) {
    return nullptr;
  }

  return cuda;
 }
 } // namespace cuda
+
+namespace platf::nvfbc {
+static PNVFBCCREATEINSTANCE createInstance {};
+static NVFBC_API_FUNCTION_LIST func { NVFBC_VERSION };
+
+static void *handle { nullptr };
+int init() {
+  static bool funcs_loaded = false;
+
+  if(funcs_loaded) return 0;
+
+  if(!handle) {
+    handle = dyn::handle({ "libnvidia-fbc.so.1", "libnvidia-fbc.so" });
+    if(!handle) {
+      return -1;
+    }
+  }
+
+  std::vector<std::tuple<dyn::apiproc *, const char *>> funcs {
+    { (dyn::apiproc *)&createInstance, "NvFBCCreateInstance" },
+  };
+
+  if(dyn::load(handle, funcs)) {
+    dlclose(handle);
+    handle = nullptr;
+
+    return -1;
+  }
+
+  funcs_loaded = true;
+  return 0;
+}
+
+class handle_t {
+  KITTY_USING_MOVE_T(session_t, NVFBC_SESSION_HANDLE, std::numeric_limits<std::uint64_t>::max(), {
+    if(el == std::numeric_limits<std::uint64_t>::max()) {
+      return;
+    }
+    NVFBC_DESTROY_HANDLE_PARAMS params { NVFBC_DESTROY_HANDLE_PARAMS_VER };
+
+    auto status = func.nvFBCDestroyHandle(el, &params);
+    if(status) {
+      BOOST_LOG(error) << "Failed to destroy nvfbc handle: "sv << func.nvFBCGetLastErrorStr(el);
+    }
+  });
+
+public:
+  static std::optional<handle_t> make() {
+    NVFBC_CREATE_HANDLE_PARAMS params { NVFBC_CREATE_HANDLE_PARAMS_VER };
+    session_t session;
+
+    auto status = func.nvFBCCreateHandle(&session.el, &params);
+    if(status) {
+      BOOST_LOG(error) << "Failed to create session: "sv << func.nvFBCGetLastErrorStr(session.el);
+      session.release();
+
+      return std::nullopt;
+    }
+
+    return handle_t { std::move(session) };
+  }
+
+  const char *last_error() {
+    return func.nvFBCGetLastErrorStr(session.el);
+  }
+
+  std::optional<NVFBC_GET_STATUS_PARAMS> status() {
+    NVFBC_GET_STATUS_PARAMS params { NVFBC_GET_STATUS_PARAMS_VER };
+
+    auto status = func.nvFBCGetStatus(session.el, &params);
+    if(status) {
+      BOOST_LOG(error) << "Failed to create session: "sv << last_error();
+
+      return std::nullopt;
+    }
+
+    return params;
+  }
+
+  session_t session;
+};
+
+std::vector<std::string> nvfbc_display_names() {
+  if(init()) {
+    return {};
+  }
+
+  std::vector<std::string> display_names;
+
+  auto status = createInstance(&func);
+  if(status) {
+    BOOST_LOG(error) << "Unable to create NvFBC instance"sv;
+    return {};
+  }
+
+  auto handle = handle_t::make();
+  if(!handle) {
+    return {};
+  }
+
+  auto status_params = handle->status();
+  if(!status_params) {
+    return {};
+  }
+
+  if(!status_params->bIsCapturePossible) {
+    BOOST_LOG(error) << "NVidia driver doesn't support NvFBC screencasting"sv;
+  }
+
+  BOOST_LOG(info) << "Found ["sv << status_params->dwOutputNum << "] outputs"sv;
+  BOOST_LOG(info) << "Virtual Desktop: "sv << status_params->screenSize.w << 'x' << status_params->screenSize.h;
+
+  return display_names;
+}
+} // namespace platf::nvfbc
--- a/sunshine/platform/linux/cuda.cu
+++ b/sunshine/platform/linux/cuda.cu
@@ -0,0 +1,248 @@
+// #include <algorithm>
+#include <helper_math.h>
+#include <limits>
+#include <memory>
+#include <optional>
+#include <string_view>
+
+#include "cuda.h"
+
+using namespace std::literals;
+
+#define SUNSHINE_STRINGVIEW_HELPER(x) x##sv
+#define SUNSHINE_STRINGVIEW(x) SUNSHINE_STRINGVIEW_HELPER(x)
+
+#define CU_CHECK(x, y) \
+  if(check((x), SUNSHINE_STRINGVIEW(y ": "))) return -1
+
+#define CU_CHECK_VOID(x, y) \
+  if(check((x), SUNSHINE_STRINGVIEW(y ": "))) return;
+
+#define CU_CHECK_PTR(x, y) \
+  if(check((x), SUNSHINE_STRINGVIEW(y ": "))) return nullptr;
+
+#define CU_CHECK_IGNORE(x, y) \
+  check((x), SUNSHINE_STRINGVIEW(y ": "))
+
+using namespace std::literals;
+
+//////////////////// Special desclarations
+/**
+ * NVCC segfaults when including <chrono>
+ * Therefore, some declarations need to be added explicitely
+ */
+namespace platf {
+struct img_t {
+public:
+  std::uint8_t *data {};
+  std::int32_t width {};
+  std::int32_t height {};
+  std::int32_t pixel_pitch {};
+  std::int32_t row_pitch {};
+
+  virtual ~img_t() = default;
+};
+} // namespace platf
+
+namespace video {
+using __float4 = float[4];
+using __float3 = float[3];
+using __float2 = float[2];
+
+struct __attribute__((__aligned__(16))) color_t {
+  float4 color_vec_y;
+  float4 color_vec_u;
+  float4 color_vec_v;
+  float2 range_y;
+  float2 range_uv;
+};
+
+struct __attribute__((__aligned__(16))) color_extern_t {
+  __float4 color_vec_y;
+  __float4 color_vec_u;
+  __float4 color_vec_v;
+  __float2 range_y;
+  __float2 range_uv;
+};
+
+extern color_extern_t colors[4];
+} // namespace video
+
+//////////////////// End special declarations
+
+namespace cuda {
+auto constexpr INVALID_TEXTURE = std::numeric_limits<cudaTextureObject_t>::max();
+
+template<class T>
+inline T div_align(T l, T r) {
+  return (l + r - 1) / r;
+}
+
+void pass_error(const std::string_view &sv, const char *name, const char *description);
+inline static int check(cudaError_t result, const std::string_view &sv) {
+  if(result) {
+    auto name        = cudaGetErrorName(result);
+    auto description = cudaGetErrorString(result);
+
+    pass_error(sv, name, description);
+    return -1;
+  }
+
+  return 0;
+}
+
+__device__ __constant__ video::color_t color;
+
+
+inline __device__ float3 bgra_to_rgb(uchar4 vec) {
+  return make_float3((float)vec.z, (float)vec.y, (float)vec.x);
+}
+
+inline __device__ float2 calcUV(float3 pixel) {
+  float4 vec_u = color.color_vec_u;
+  float4 vec_v = color.color_vec_v;
+
+  float u = dot(pixel, make_float3(vec_u)) + vec_u.w;
+  float v = dot(pixel, make_float3(vec_v)) + vec_v.w;
+
+  u = u * color.range_uv.x + color.range_uv.y;
+  v = (v * color.range_uv.x + color.range_uv.y) * 224.0f / 256.0f + 0.0625f * 256.0f;
+
+  return make_float2(u, v);
+}
+
+inline __device__ float calcY(float3 pixel) {
+  float4 vec_y = color.color_vec_y;
+
+  return (dot(pixel, make_float3(vec_y)) + vec_y.w) * color.range_y.x + color.range_y.y;
+}
+
+__global__ void RGBA_to_NV12(
+  cudaTextureObject_t srcImage, std::uint8_t *dstY, std::uint8_t *dstUV,
+  std::uint32_t dstPitchY, std::uint32_t dstPitchUV,
+  std::uint32_t width, std::uint32_t height) {
+
+  int idX = (threadIdx.x + blockDim.x * blockIdx.x) * 2;
+  int idY = (threadIdx.y + blockDim.y * blockIdx.y);
+
+  if(idX >= width) return;
+  if(idY >= height) return;
+
+  dstY  = dstY + idX + idY * dstPitchY;
+  dstUV = dstUV + idX + (idY / 2 * dstPitchUV);
+
+  float x = (float)idX / (float)width / 4;
+  float y = (float)idY / (float)height;
+
+  float3 rgb_l = bgra_to_rgb(tex2D<uchar4>(srcImage, x, y));
+  float3 rgb_r = bgra_to_rgb(tex2D<uchar4>(srcImage, x + 0.25f / width, y + 1.0f / height));
+
+  float2 uv = calcUV((rgb_l + rgb_r) * 0.5f);
+
+  dstUV[0] = uv.x;
+  dstUV[1] = uv.y;
+  dstY[0]  = calcY(rgb_l);
+  dstY[1]  = calcY(rgb_r);
+}
+
+sws_t::sws_t(int in_width, int in_height, int out_width, int out_height, int threadsPerBlock)
+    : array {}, texture { INVALID_TEXTURE }, width { out_width }, height { out_height }, threadsPerBlock { threadsPerBlock } {
+  auto format = cudaCreateChannelDesc<uchar4>();
+
+  CU_CHECK_VOID(cudaMallocArray(&array, &format, in_width, in_height, cudaArrayDefault), "Couldn't allocate cuda array");
+
+  cudaResourceDesc res {};
+  res.resType         = cudaResourceTypeArray;
+  res.res.array.array = array;
+
+  cudaTextureDesc desc {};
+
+  desc.readMode         = cudaReadModeElementType;
+  desc.filterMode       = cudaFilterModePoint;
+  desc.normalizedCoords = true;
+
+  std::fill_n(std::begin(desc.addressMode), 2, cudaAddressModeClamp);
+
+  CU_CHECK_VOID(cudaCreateTextureObject(&texture, &res, &desc, nullptr), "Couldn't create cuda texture");
+}
+
+sws_t::~sws_t() {
+  if(texture != INVALID_TEXTURE) {
+    CU_CHECK_IGNORE(cudaDestroyTextureObject(texture), "Couldn't deallocate cuda texture");
+
+    texture = INVALID_TEXTURE;
+  }
+
+  if(array) {
+    CU_CHECK_IGNORE(cudaFreeArray(array), "Couldn't deallocate cuda array");
+
+    array = cudaArray_t {};
+  }
+}
+
+std::unique_ptr<sws_t> sws_t::make(int in_width, int in_height, int out_width, int out_height) {
+  cudaDeviceProp props;
+  int device;
+  CU_CHECK_PTR(cudaGetDevice(&device), "Couldn't get cuda device");
+  CU_CHECK_PTR(cudaGetDeviceProperties(&props, device), "Couldn't get cuda device properties");
+
+  auto sws = std::make_unique<sws_t>(in_width, in_height, out_width, out_height, props.maxThreadsPerMultiProcessor / props.maxBlocksPerMultiProcessor / 2);
+
+  if(sws->texture == INVALID_TEXTURE) {
+    return nullptr;
+  }
+
+  return sws;
+}
+
+int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV) {
+  int threadsX = width / 2;
+  int threadsY = height;
+
+  dim3 block(threadsPerBlock, threadsPerBlock);
+  dim3 grid(div_align(threadsX, threadsPerBlock), div_align(threadsY, threadsPerBlock));
+
+  RGBA_to_NV12<<<block, grid>>>(texture, Y, UV, pitchY, pitchUV, width, height);
+
+  return CU_CHECK_IGNORE(cudaGetLastError(), "RGBA_to_NV12 failed");
+}
+
+void sws_t::set_colorspace(std::uint32_t colorspace, std::uint32_t color_range) {
+  color_range = 1;
+  colorspace = 5;
+  video::color_extern_t *color_p;
+  switch(colorspace) {
+  case 5: // SWS_CS_SMPTE170M
+    color_p = &video::colors[0];
+    break;
+  case 1: // SWS_CS_ITU709
+    color_p = &video::colors[2];
+    break;
+  case 9: // SWS_CS_BT2020
+  default:
+    color_p = &video::colors[0];
+  };
+
+  if(color_range > 1) {
+    // Full range
+    ++color_p;
+  }
+
+  auto color_matrix = *(video::color_t*)color_p;
+  color_matrix.color_vec_y.w *= 256.0f;
+  color_matrix.color_vec_u.w *= 256.0f;
+  color_matrix.color_vec_v.w *= 256.0f;
+
+  color_matrix.range_y.y *= 256.0f;
+  color_matrix.range_uv.y *= 256.0f;
+
+  static_assert(sizeof(video::color_t) == sizeof(video::color_extern_t), "color matrix struct mismatch");
+
+  CU_CHECK_IGNORE(cudaMemcpyToSymbol(color, &color_matrix, sizeof(video::color_t)), "Couldn't copy color matrix to cuda");
+}
+
+int sws_t::load_ram(platf::img_t &img) {
+  return CU_CHECK_IGNORE(cudaMemcpy2DToArray(array, 0, 0, img.data, img.row_pitch, img.width * img.pixel_pitch, img.height, cudaMemcpyHostToDevice), "Couldn't copy to cuda array");
+}
+
+} // namespace cuda
--- a/sunshine/platform/linux/cuda.h
+++ b/sunshine/platform/linux/cuda.h
@@ -1,6 +1,8 @@
 #ifndef SUNSHINE_PLATFORM_CUDA_H
 #define SUNSHINE_PLATFORM_CUDA_H

+#ifndef __NVCC__
+
 #include "sunshine/platform/common.h"
 #include "x11grab.h"

@@ -9,4 +11,48 @@ std::shared_ptr<platf::hwdevice_t> make_hwdevice(int width, int height, platf::x
 int init();
 } // namespace cuda

+#else
+namespace platf {
+class img_t;
+}
+#endif
+
+typedef struct cudaArray *cudaArray_t;
+
+#if !defined(__CUDACC__)
+typedef unsigned long long cudaTextureObject_t;
+#else  /* defined(__CUDACC__) */
+typedef __location__(device_builtin) unsigned long long cudaTextureObject_t;
+#endif /* !defined(__CUDACC__) */
+
+namespace cuda {
+class sws_t {
+public:
+  ~sws_t();
+  sws_t(int in_width, int in_height, int out_width, int out_height, int threadsPerBlock);
+
+  /**
+   * in_width, out_width -- The width and height of the captured image in bytes
+   * out_width, out_height -- the width and height of the NV12 image in pixels
+   * 
+   * cuda_device -- pointer to the cuda device
+   */
+  static std::unique_ptr<sws_t> make(int in_width, int in_height, int out_width, int out_height);
+
+  // Converts loaded image into a CUDevicePtr
+  int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV);
+
+  void set_colorspace(std::uint32_t colorspace, std::uint32_t color_range);
+
+  int load_ram(platf::img_t &img);
+
+  cudaArray_t array;
+  cudaTextureObject_t texture;
+
+  int width, height;
+
+  int threadsPerBlock;
+};
+} // namespace cuda
+
 #endif
--- a/sunshine/video.cpp
+++ b/sunshine/video.cpp
@@ -324,7 +324,7 @@ struct encoder_t {
 class session_t {
 public:
  session_t() = default;
-  session_t(ctx_t &&ctx, util::wrap_ptr<platf::hwdevice_t> &&device, int inject) : ctx { std::move(ctx) }, device { std::move(device) }, inject { inject } {}
+  session_t(ctx_t &&ctx, std::shared_ptr<platf::hwdevice_t> &&device, int inject) : ctx { std::move(ctx) }, device { std::move(device) }, inject { inject } {}

  session_t(session_t &&other) noexcept = default;

@@ -342,7 +342,7 @@ public:
  }

  ctx_t ctx;
-  util::wrap_ptr<platf::hwdevice_t> device;
+  std::shared_ptr<platf::hwdevice_t> device;

  std::vector<packet_raw_t::replace_t> replacements;

@@ -369,7 +369,6 @@ struct sync_session_t {
  sync_session_ctx_t *ctx;

  platf::img_t *img_tmp;
-  std::shared_ptr<platf::hwdevice_t> hwdevice;
  session_t session;
 };

@@ -779,7 +778,7 @@ int encode(int64_t frame_nr, session_t &session, frame_t::pointer frame, safe::m
  return 0;
 }

-std::optional<session_t> make_session(const encoder_t &encoder, const config_t &config, int width, int height, platf::hwdevice_t *hwdevice) {
+std::optional<session_t> make_session(const encoder_t &encoder, const config_t &config, int width, int height, std::shared_ptr<platf::hwdevice_t> &&hwdevice) {
  bool hardware = encoder.dev_type != AV_HWDEVICE_TYPE_NONE;

  auto &video_format = config.videoFormat == 0 ? encoder.h264 : encoder.hevc;
@@ -886,7 +885,7 @@ std::optional<session_t> make_session(const encoder_t &encoder, const config_t &
  if(hardware) {
    ctx->pix_fmt = encoder.dev_pix_fmt;

-    auto buf_or_error = encoder.make_hwdevice_ctx(hwdevice);
+    auto buf_or_error = encoder.make_hwdevice_ctx(hwdevice.get());
    if(buf_or_error.has_right()) {
      return std::nullopt;
    }
@@ -965,7 +964,7 @@ std::optional<session_t> make_session(const encoder_t &encoder, const config_t &
    frame->hw_frames_ctx = av_buffer_ref(ctx->hw_frames_ctx);
  }

-  util::wrap_ptr<platf::hwdevice_t> device;
+  std::shared_ptr<platf::hwdevice_t> device;

  if(!hwdevice->data) {
    auto device_tmp = std::make_unique<swdevice_t>();
@@ -977,7 +976,7 @@ std::optional<session_t> make_session(const encoder_t &encoder, const config_t &
    device = std::move(device_tmp);
  }
  else {
-    device = hwdevice;
+    device = std::move(hwdevice);
  }

  if(device->set_frame(frame.release())) {
@@ -1009,12 +1008,12 @@ void encode_run(
  img_event_t images,
  config_t config,
  int width, int height,
-  platf::hwdevice_t *hwdevice,
+  std::shared_ptr<platf::hwdevice_t> &&hwdevice,
  safe::signal_t &reinit_event,
  const encoder_t &encoder,
  void *channel_data) {

-  auto session = make_session(encoder, config, width, height, hwdevice);
+  auto session = make_session(encoder, config, width, height, std::move(hwdevice));
  if(!session) {
    return;
  }
@@ -1101,12 +1100,11 @@ std::optional<sync_session_t> make_synced_session(platf::display_t *disp, const
  // absolute mouse coordinates require that the dimensions of the screen are known
  ctx.touch_port_events->raise(make_port(disp, ctx.config));

-  auto session = make_session(encoder, ctx.config, img.width, img.height, hwdevice.get());
+  auto session = make_session(encoder, ctx.config, img.width, img.height, std::move(hwdevice));
  if(!session) {
    return std::nullopt;
  }

-  encode_session.hwdevice = std::move(hwdevice);
  encode_session.session  = std::move(*session);

  return std::move(encode_session);
@@ -1208,7 +1206,7 @@ encode_e encode_run_sync(
          ctx->idr_events->pop();
        }

-        if(pos->hwdevice->convert(*img)) {
+        if(pos->session.device->convert(*img)) {
          BOOST_LOG(error) << "Could not convert image"sv;
          ctx->shutdown_event->raise(true);

@@ -1356,7 +1354,7 @@ void capture_async(
      frame_nr,
      mail, images,
      config, display->width, display->height,
-      hwdevice.get(),
+      std::move(hwdevice),
      ref->reinit_event, *ref->encoder_p,
      channel_data);
  }
@@ -1409,7 +1407,7 @@ int validate_config(std::shared_ptr<platf::display_t> &disp, const encoder_t &en
    return -1;
  }

-  auto session = make_session(encoder, config, disp->width, disp->height, hwdevice.get());
+  auto session = make_session(encoder, config, disp->width, disp->height, std::move(hwdevice));
  if(!session) {
    return -1;
  }
--- a/third-party/nvfbc/NvFBC.h
+++ b/third-party/nvfbc/NvFBC.h
--- a/third-party/nvfbc/helper_math.h
+++ b/third-party/nvfbc/helper_math.h