I’m trying to copy camera frames from Tegra memory (NVBUF_MEM_SURFACE_ARRAY) to dGPU (RTX 6000 Ada) for processing. My current code was only able to achieve 2-3GB/s bandwidth. I profiled the code and seems most of the time is spent in memcpy.
Here’s a simplified version of my camera processing code. Is there something wrong with the copying approach?
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <time.h>
#include <vector>
#include <cuda_runtime.h>
#include "nvbufsurface.h"
struct StreamContext {
int stream_id;
int width;
int height;
int iterations;
NvBufSurface *tegra_buffer;
void *cuda_uyvy_buffer;
cudaStream_t stream;
std::vector<double> latencies_ms;
};
double timespec_diff_ms(const struct timespec *start, const struct timespec *end) {
return (end->tv_sec - start->tv_sec) * 1000.0 +
(end->tv_nsec - start->tv_nsec) / 1000000.0;
}
void* run_stream_in_thread(void *arg) {
StreamContext *ctx = (StreamContext *)arg;
cudaError_t err = cudaSetDevice(0);
if (err != cudaSuccess) {
fprintf(stderr, "Thread %d: Failed to set CUDA device: %s\n",
ctx->stream_id, cudaGetErrorString(err));
return NULL;
}
ctx->latencies_ms.reserve(ctx->iterations);
NvBufSurface *in_surf = ctx->tegra_buffer;
for (int iter = 0; iter < ctx->iterations; iter++) {
struct timespec start, end;
clock_gettime(CLOCK_MONOTONIC, &start);
int ret = NvBufSurfaceMap(in_surf, 0, -1, NVBUF_MAP_READ);
if (ret != 0) {
fprintf(stderr, "Stream %d: Failed to map input surface\n", ctx->stream_id);
continue;
}
ret = NvBufSurfaceSyncForCpu(in_surf, 0, -1);
if (ret != 0)
fprintf(stderr, "Stream %d: Failed to sync for CPU\n", ctx->stream_id);
NvBufSurfaceParams *in_params = &in_surf->surfaceList[0];
size_t plane_size = in_params->planeParams.pitch[0] * in_params->planeParams.height[0];
err = cudaMemcpyAsync(ctx->cuda_uyvy_buffer, in_params->mappedAddr.addr[0], plane_size,
cudaMemcpyHostToDevice, ctx->stream);
if (err != cudaSuccess) {
fprintf(stderr, "Stream %d: Failed to copy UYVY: %s\n",
ctx->stream_id, cudaGetErrorString(err));
NvBufSurfaceUnMap(in_surf, 0, -1);
continue;
}
err = cudaStreamSynchronize(ctx->stream);
if (err != cudaSuccess) {
fprintf(stderr, "Stream %d: Stream synchronize after copy failed: %s\n",
ctx->stream_id, cudaGetErrorString(err));
NvBufSurfaceUnMap(in_surf, 0, -1);
continue;
}
clock_gettime(CLOCK_MONOTONIC, &end);
NvBufSurfaceUnMap(in_surf, 0, -1);
ctx->latencies_ms.push_back(timespec_diff_ms(&start, &end));
}
return NULL;
}
void print_results(StreamContext *streams, int num_streams, double total_time_sec) {
size_t total_samples = 0;
printf("%-10s %8s %8s\n", "Stream", "Avg(ms)", "Max(ms)");
printf("%-10s %8s %8s\n", "----------", "--------", "--------");
for (int i = 0; i < num_streams; i++) {
double sum = 0, max_val = 0;
for (double latency : streams[i].latencies_ms) {
sum += latency;
if (latency > max_val) max_val = latency;
}
total_samples += streams[i].latencies_ms.size();
char stream_name[16];
snprintf(stream_name, sizeof(stream_name), "Stream %d", i);
printf("%-10s %8.2f %8.2f\n", stream_name, sum / streams[i].latencies_ms.size(), max_val);
}
printf("\nBandwidth: %.2f MB/s\n",
(streams[0].width * streams[0].height * 2 * total_samples) / (total_time_sec * 1024.0 * 1024.0));
}
int main(int argc, char *argv[]) {
int num_streams = 8;
int width = 3840;
int height = 2160;
int iterations = 500;
if (argc > 1) {
num_streams = atoi(argv[1]);
if (num_streams < 1 || num_streams > 32)
return fprintf(stderr, "Invalid number of streams (must be 1-32)\n"), 1;
}
if (argc > 2) {
width = atoi(argv[2]);
if (width < 1 || width > 7680)
return fprintf(stderr, "Invalid width (must be 1-7680)\n"), 1;
}
if (argc > 3) {
height = atoi(argv[3]);
if (height < 1 || height > 4320)
return fprintf(stderr, "Invalid height (must be 1-4320)\n"), 1;
}
if (argc > 4) {
iterations = atoi(argv[4]);
if (iterations < 1 || iterations > 10000)
return fprintf(stderr, "Invalid iterations (must be 1-10000)\n"), 1;
}
cudaError_t err = cudaSetDevice(0);
if (err != cudaSuccess) {
fprintf(stderr, "Failed to set CUDA device: %s\n", cudaGetErrorString(err));
return 1;
}
StreamContext *streams = new StreamContext[num_streams];
for (int i = 0; i < num_streams; i++) {
streams[i].stream_id = i;
streams[i].width = width;
streams[i].height = height;
streams[i].iterations = iterations;
NvBufSurfaceAllocateParams alloc_params = {0};
alloc_params.params.gpuId = 0;
alloc_params.params.width = width;
alloc_params.params.height = height;
alloc_params.params.colorFormat = NVBUF_COLOR_FORMAT_UYVY;
alloc_params.params.layout = NVBUF_LAYOUT_PITCH;
alloc_params.params.memType = NVBUF_MEM_SURFACE_ARRAY;
alloc_params.memtag = NvBufSurfaceTag_NONE;
int ret = NvBufSurfaceAllocate(&streams[i].tegra_buffer, 1, &alloc_params);
if (ret != 0) {
fprintf(stderr, "Failed to allocate NvBufSurface: %d\n", ret);
return 1;
}
streams[i].tegra_buffer->numFilled = 1;
NvBufSurfaceParams *params = &streams[i].tegra_buffer->surfaceList[0];
err = cudaMalloc(&streams[i].cuda_uyvy_buffer,
params->planeParams.pitch[0] * params->planeParams.height[0]);
if (err != cudaSuccess) {
fprintf(stderr, "Failed to allocate CUDA UYVY buffer for stream %d: %s\n",
i, cudaGetErrorString(err));
return 1;
}
err = cudaStreamCreate(&streams[i].stream);
if (err != cudaSuccess) {
fprintf(stderr, "Failed to create CUDA stream for stream %d: %s\n",
i, cudaGetErrorString(err));
return 1;
}
}
struct timespec bench_start, bench_end;
clock_gettime(CLOCK_MONOTONIC, &bench_start);
pthread_t *threads = new pthread_t[num_streams];
for (int i = 0; i < num_streams; i++)
pthread_create(&threads[i], NULL, run_stream_in_thread, &streams[i]);
for (int i = 0; i < num_streams; i++)
pthread_join(threads[i], NULL);
delete[] threads;
clock_gettime(CLOCK_MONOTONIC, &bench_end);
print_results(streams, num_streams, timespec_diff_ms(&bench_start, &bench_end) / 1000.0);
for (int i = 0; i < num_streams; i++) {
cudaStreamDestroy(streams[i].stream);
cudaFree(streams[i].cuda_uyvy_buffer);
NvBufSurfaceDestroy(streams[i].tegra_buffer);
}
delete[] streams;
return 0;
}
Result of copying with 1 thread and 8 threads respectively:
=== 1 stream, 4K ===
Stream Avg(ms) Max(ms)
---------- -------- --------
Stream 0 6.27 6.49
Bandwidth: 2392.32 MB/s
=== 8 streams, 4K ===
Stream Avg(ms) Max(ms)
---------- -------- --------
Stream 0 46.18 48.26
Stream 1 46.19 48.30
Stream 2 46.21 48.44
Stream 3 46.20 48.38
Stream 4 46.22 48.30
Stream 5 46.23 48.10
Stream 6 46.24 48.12
Stream 7 46.25 48.20
Bandwidth: 2715.03 MB/s
