Implement S/G IO for batched sends and eliminate another frame copy (#2874)

Author: Cameron Gutman
Date: 2024-07-18 23:04:18 -05:00
Committed by: GitHub
Parent: 16f62436a9
Commit: 3935d305ae
6 changed files with 199 additions and 56 deletions

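The platform send backends are among the other changed files and are not shown in this excerpt, but the shape of the change is standard scatter/gather socket IO: each datagram is described by a short gather list, so the encryption prefix and the FEC shard it protects can live in separate buffers and still go out as one packet, with no staging copy. A minimal sketch of that pattern on Linux using sendmmsg(); the function and its parameters are illustrative, not Sunshine's actual backend API:

    #include <sys/socket.h>  // sendmmsg() is Linux-specific (glibc exposes it under _GNU_SOURCE)
    #include <sys/uio.h>
    #include <cstddef>
    #include <vector>

    // Send `count` datagrams in one syscall. Packet i is the concatenation of
    // headers[i * header_size ...] and payload[i * block_size ...], gathered
    // straight from the two source buffers.
    static int send_shard_batch(int sock, const sockaddr *peer, socklen_t peer_len,
                                const char *headers, std::size_t header_size,
                                const char *payload, std::size_t block_size,
                                std::size_t count) {
      std::vector<iovec> iovs(count * 2);
      std::vector<mmsghdr> msgs(count);

      for (std::size_t i = 0; i < count; ++i) {
        iovs[i * 2] = { const_cast<char *>(headers + i * header_size), header_size };
        iovs[i * 2 + 1] = { const_cast<char *>(payload + i * block_size), block_size };

        msgs[i] = {};
        msgs[i].msg_hdr.msg_name = const_cast<sockaddr *>(peer);
        msgs[i].msg_hdr.msg_namelen = peer_len;
        msgs[i].msg_hdr.msg_iov = &iovs[i * 2];
        msgs[i].msg_hdr.msg_iovlen = 2;
      }

      // Returns how many messages were actually sent, or -1 on error.
      return sendmmsg(sock, msgs.data(), (unsigned int) count, 0);
    }

Windows exposes the same per-datagram gather list through WSASendMsg's WSABUF array, which is why the batched send interface below passes the header buffer and the payload buffers separately rather than one pre-assembled block.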

@@ -126,11 +126,6 @@ namespace stream {
};
struct video_packet_enc_prefix_t {
- video_packet_raw_t *
- payload() {
- return (video_packet_raw_t *) (this + 1);
- }
std::uint8_t iv[12]; // 12-byte IV is ideal for AES-GCM
std::uint32_t frameNumber;
std::uint8_t tag[16];
@@ -227,7 +222,6 @@ namespace stream {
}
constexpr std::size_t MAX_AUDIO_PACKET_SIZE = 1400;
- using video_packet_t = util::c_ptr<video_packet_raw_t>;
using audio_aes_t = std::array<char, round_to_pkcs7_padded(MAX_AUDIO_PACKET_SIZE)>;
using av_session_id_t = std::variant<asio::ip::address, std::string>; // IP address or SS-Ping-Payload from RTSP handshake
@@ -619,15 +613,19 @@ namespace stream {
size_t blocksize;
size_t prefixsize;
util::buffer_t<char> shards;
+ util::buffer_t<char> headers;
+ util::buffer_t<uint8_t *> shards_p;
+ std::vector<platf::buffer_descriptor_t> payload_buffers;
char *
data(size_t el) {
- return &shards[(el + 1) * prefixsize + el * blocksize];
+ return (char *) shards_p[el];
}
char *
prefix(size_t el) {
- return &shards[el * (prefixsize + blocksize)];
+ return prefixsize ? &headers[el * prefixsize] : nullptr;
}
size_t
@@ -642,7 +640,8 @@ namespace stream {
auto pad = payload_size % blocksize != 0;
- auto data_shards = payload_size / blocksize + (pad ? 1 : 0);
+ auto aligned_data_shards = payload_size / blocksize;
+ auto data_shards = aligned_data_shards + (pad ? 1 : 0);
auto parity_shards = (data_shards * fecpercentage + 99) / 100;
// increase the FEC percentage for this frame if the parity shard minimum is not met
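For reference, the shard accounting above works out as follows: the payload is cut into whole blocksize-sized shards, one extra zero-padded shard is added only when the payload is not block-aligned, and the parity count is the FEC percentage of the data shards rounded up to a whole shard, since (d * p + 99) / 100 is ceil(d * p / 100) in integer arithmetic. A standalone trace with made-up numbers:

    #include <cstddef>
    #include <cstdio>

    int main() {
      // Hypothetical values, chosen only to walk through the arithmetic.
      std::size_t payload_size = 3000, blocksize = 1024, fecpercentage = 20;

      bool pad = payload_size % blocksize != 0;                      // 3000 % 1024 != 0 -> true
      std::size_t aligned_data_shards = payload_size / blocksize;    // 2 full blocks
      std::size_t data_shards = aligned_data_shards + (pad ? 1 : 0); // 3 (last one zero-padded)

      // Rounded-up FEC ratio: ceil(3 * 20 / 100) = 1 parity shard
      std::size_t parity_shards = (data_shards * fecpercentage + 99) / 100;

      std::printf("data=%zu parity=%zu total=%zu\n",
                  data_shards, parity_shards, data_shards + parity_shards);
      return 0;
    }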
@@ -655,27 +654,46 @@ namespace stream {
auto nr_shards = data_shards + parity_shards;
- util::buffer_t<char> shards { nr_shards * (blocksize + prefixsize) };
+ // If we need to store a zero-padded data shard, allocate that first to
+ // keep the shards in order and reduce buffer fragmentation
+ auto parity_shard_offset = pad ? 1 : 0;
+ util::buffer_t<char> shards { (parity_shard_offset + parity_shards) * blocksize };
util::buffer_t<uint8_t *> shards_p { nr_shards };
+ std::vector<platf::buffer_descriptor_t> payload_buffers;
+ payload_buffers.reserve(2);
+ // Point into the payload buffer for all except the final padded data shard
auto next = std::begin(payload);
- for (auto x = 0; x < nr_shards; ++x) {
- shards_p[x] = (uint8_t *) &shards[(x + 1) * prefixsize + x * blocksize];
+ for (auto x = 0; x < aligned_data_shards; ++x) {
+ shards_p[x] = (uint8_t *) next;
+ next += blocksize;
}
+ payload_buffers.emplace_back(std::begin(payload), aligned_data_shards * blocksize);
+ // If the last data shard needs to be zero-padded, we must use the shards buffer
+ if (pad) {
+ shards_p[aligned_data_shards] = (uint8_t *) &shards[0];
// GCC doesn't figure out that std::copy_n() can be replaced with memcpy() here
// and ends up compiling a horribly slow element-by-element copy loop, so we
// help it by using memcpy()/memset() directly.
auto copy_len = std::min<size_t>(blocksize, std::end(payload) - next);
- std::memcpy(shards_p[x], next, copy_len);
+ std::memcpy(shards_p[aligned_data_shards], next, copy_len);
if (copy_len < blocksize) {
// Zero any additional space after the end of the payload
- std::memset(shards_p[x] + copy_len, 0, blocksize - copy_len);
+ std::memset(shards_p[aligned_data_shards] + copy_len, 0, blocksize - copy_len);
}
next += copy_len;
}
+ // Add a payload buffer describing the shard buffer
+ payload_buffers.emplace_back(std::begin(shards), shards.size());
if (fecpercentage != 0) {
+ // Point into our allocated buffer for the parity shards
+ for (auto x = 0; x < parity_shards; ++x) {
+ shards_p[data_shards + x] = (uint8_t *) &shards[(parity_shard_offset + x) * blocksize];
+ }
// packets = parity_shards + data_shards
rs_t rs { reed_solomon_new(data_shards, parity_shards) };
@@ -688,7 +706,10 @@ namespace stream {
fecpercentage,
blocksize,
prefixsize,
- std::move(shards)
+ std::move(shards),
+ util::buffer_t<char> { nr_shards * prefixsize },
+ std::move(shards_p),
+ std::move(payload_buffers),
};
}
} // namespace fec
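Taken together, the encode() changes mean only the optional zero-padded tail shard and the parity shards need backing storage; every aligned data shard is just a pointer into the frame the encoder produced, and the whole frame can be described to the sender by at most two buffer regions. A condensed sketch of that layout with simplified stand-in types (buffer_descriptor and the std::vector members below are illustrative substitutes for util::buffer_t and platf::buffer_descriptor_t):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>
    #include <string_view>
    #include <vector>

    struct buffer_descriptor { const char *ptr; std::size_t size; };

    struct shard_layout {
      std::vector<char> shards;                        // padded tail shard (if any) + parity shards
      std::vector<std::uint8_t *> shards_p;            // one pointer per shard, data shards first
      std::vector<buffer_descriptor> payload_buffers;  // at most two regions to gather from
    };

    shard_layout lay_out_shards(std::string_view payload, std::size_t blocksize, std::size_t parity_shards) {
      bool pad = payload.size() % blocksize != 0;
      std::size_t aligned = payload.size() / blocksize;
      std::size_t data_shards = aligned + (pad ? 1 : 0);
      std::size_t parity_offset = pad ? 1 : 0;

      shard_layout out;
      out.shards.resize((parity_offset + parity_shards) * blocksize);
      out.shards_p.resize(data_shards + parity_shards);

      // Aligned data shards point directly into the frame: no copy at all.
      auto next = payload.data();
      for (std::size_t x = 0; x < aligned; ++x, next += blocksize) {
        out.shards_p[x] = (std::uint8_t *) next;
      }
      out.payload_buffers.push_back({ payload.data(), aligned * blocksize });

      // Only the final, partially filled shard is copied and zero-padded.
      if (pad) {
        out.shards_p[aligned] = (std::uint8_t *) out.shards.data();
        std::size_t copy_len = payload.size() - aligned * blocksize;
        std::memcpy(out.shards.data(), next, copy_len);
        std::memset(out.shards.data() + copy_len, 0, blocksize - copy_len);
      }

      // Parity shards follow the padded shard in the same small allocation.
      for (std::size_t x = 0; x < parity_shards; ++x) {
        out.shards_p[data_shards + x] = (std::uint8_t *) out.shards.data() + (parity_offset + x) * blocksize;
      }
      out.payload_buffers.push_back({ out.shards.data(), out.shards.size() });

      return out;
    }

The Reed-Solomon encode step only consumes the shard pointer array, so it is indifferent to whether a given shard lives in the frame or in the small parity allocation.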
@@ -1438,8 +1459,11 @@ namespace stream {
auto peer_address = session->video.peer.address();
auto batch_info = platf::batched_send_info_t {
- nullptr,
- shards.prefixsize + shards.blocksize,
+ shards.headers.begin(),
+ shards.prefixsize,
+ shards.payload_buffers,
+ shards.blocksize,
0,
0,
(uintptr_t) sock.native_handle(),
peer_address,
@@ -1487,7 +1511,8 @@ namespace stream {
auto *prefix = (video_packet_enc_prefix_t *) shards.prefix(x);
prefix->frameNumber = packet->frame_index();
std::copy(std::begin(iv), std::end(iv), prefix->iv);
- session->video.cipher->encrypt(std::string_view { (char *) inspect, (size_t) blocksize }, prefix->tag, &iv);
+ session->video.cipher->encrypt(std::string_view { (char *) inspect, (size_t) blocksize },
+ prefix->tag, (uint8_t *) inspect, &iv);
}
if (x - next_shard_to_send + 1 >= send_batch_size ||
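The cipher wrapper is not part of this diff, but the extra argument at the encrypt() call above is an explicit output pointer, and here it is the shard itself: the payload is encrypted in place while the tag and IV are written to the out-of-band prefix header. A minimal OpenSSL sketch of that shape, assuming AES-128-GCM with the 12-byte IV and 16-byte tag from video_packet_enc_prefix_t (error handling trimmed; this is not Sunshine's crypto wrapper):

    #include <openssl/evp.h>
    #include <cstdint>
    #include <cstring>

    struct enc_prefix {  // mirrors the shape of video_packet_enc_prefix_t
      std::uint8_t iv[12];
      std::uint32_t frame_number;
      std::uint8_t tag[16];
    };

    // Encrypt `len` bytes in place (EVP allows out == in for GCM) and write the
    // authentication tag into the out-of-band prefix.
    static bool encrypt_in_place(const std::uint8_t key[16], const std::uint8_t iv[12],
                                 std::uint8_t *data, int len, enc_prefix &prefix) {
      EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();
      if (!ctx) {
        return false;
      }

      int out_len = 0;
      bool ok = EVP_EncryptInit_ex(ctx, EVP_aes_128_gcm(), nullptr, nullptr, nullptr) == 1 &&
                EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_IVLEN, (int) sizeof(prefix.iv), nullptr) == 1 &&
                EVP_EncryptInit_ex(ctx, nullptr, nullptr, key, iv) == 1 &&
                EVP_EncryptUpdate(ctx, data, &out_len, data, len) == 1 &&  // in-place
                EVP_EncryptFinal_ex(ctx, data + out_len, &out_len) == 1 &&
                EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_GET_TAG, (int) sizeof(prefix.tag), prefix.tag) == 1;

      std::memcpy(prefix.iv, iv, sizeof(prefix.iv));
      EVP_CIPHER_CTX_free(ctx);
      return ok;
    }

GCM does not change the payload length, so the shard encrypted in place is exactly the datagram payload; the IV, frame number, and tag travel in the separate header buffer.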
@@ -1510,7 +1535,7 @@ namespace stream {
}
size_t current_batch_size = x - next_shard_to_send + 1;
- batch_info.buffer = shards.prefix(next_shard_to_send);
+ batch_info.block_offset = next_shard_to_send;
batch_info.block_count = current_batch_size;
frame_send_batch_latency_logger.first_point_now();
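A platform backend consuming this call site has everything it needs to build each packet's gather list without copying: the header for message block i sits at headers + i * prefixsize, and its payload is the i-th blocksize-sized slice of the concatenated payload_buffers; by construction each buffer holds a whole number of blocks, so a slice never straddles two buffers. A sketch of that lookup (the names and the descriptor type are illustrative, not the actual platf definitions):

    #include <cstddef>
    #include <utility>
    #include <vector>

    struct buffer_descriptor { const char *ptr; std::size_t size; };

    // Resolve message block `i` into its (header, payload) pointers. The backend
    // walks i from block_offset to block_offset + block_count - 1 and emits one
    // two-element gather list per datagram.
    static std::pair<const char *, const char *>
    locate_block(const char *headers, std::size_t header_size,
                 const std::vector<buffer_descriptor> &payload_buffers,
                 std::size_t block_size, std::size_t i) {
      const char *header = header_size ? headers + i * header_size : nullptr;

      std::size_t offset = i * block_size;
      for (const auto &buf : payload_buffers) {
        if (offset < buf.size) {
          return { header, buf.ptr + offset };  // slice lies fully inside this buffer
        }
        offset -= buf.size;
      }
      return { header, nullptr };  // past the end of all payload buffers
    }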
@@ -1520,10 +1545,10 @@ namespace stream {
BOOST_LOG(verbose) << "Falling back to unbatched send"sv;
for (auto y = 0; y < current_batch_size; y++) {
auto send_info = platf::send_info_t {
- nullptr,
- 0,
shards.prefix(next_shard_to_send + y),
- shards.prefixsize + shards.blocksize,
+ shards.prefixsize,
+ shards.data(next_shard_to_send + y),
+ shards.blocksize,
(uintptr_t) sock.native_handle(),
peer_address,
session->video.peer.port(),