Improving Tegra memory => dGPU copy performance

I’m trying to copy camera frames from Tegra memory (NVBUF_MEM_SURFACE_ARRAY) to the dGPU (RTX 6000 Ada) for processing. My current code only achieves 2-3 GB/s of bandwidth. I profiled it, and most of the time seems to be spent in memcpy.

Here’s a simplified version of my camera processing code. Is there something wrong with the copying approach?

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <time.h>
#include <vector>
#include <cuda_runtime.h>
#include "nvbufsurface.h"

struct StreamContext {
    int stream_id;
    int width;
    int height;
    int iterations;
    NvBufSurface *tegra_buffer;
    void *cuda_uyvy_buffer;
    cudaStream_t stream;
    std::vector<double> latencies_ms;
};

double timespec_diff_ms(const struct timespec *start, const struct timespec *end) {
    return (end->tv_sec - start->tv_sec) * 1000.0 +
           (end->tv_nsec - start->tv_nsec) / 1000000.0;
}

void* run_stream_in_thread(void *arg) {
    StreamContext *ctx = (StreamContext *)arg;
    cudaError_t err = cudaSetDevice(0);
    if (err != cudaSuccess) {
        fprintf(stderr, "Thread %d: Failed to set CUDA device: %s\n",
                ctx->stream_id, cudaGetErrorString(err));
        return NULL;
    }

    ctx->latencies_ms.reserve(ctx->iterations);
    NvBufSurface *in_surf = ctx->tegra_buffer;

    for (int iter = 0; iter < ctx->iterations; iter++) {
        struct timespec start, end;
        clock_gettime(CLOCK_MONOTONIC, &start);

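        // Map the Tegra surface (buffer index 0, all planes) into CPU address space for reading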
        int ret = NvBufSurfaceMap(in_surf, 0, -1, NVBUF_MAP_READ);
        if (ret != 0) {
            fprintf(stderr, "Stream %d: Failed to map input surface\n", ctx->stream_id);
            continue;
        }

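        // Sync the hardware buffer to the CPU so the latest frame contents are visible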
        ret = NvBufSurfaceSyncForCpu(in_surf, 0, -1);
        if (ret != 0)
            fprintf(stderr, "Stream %d: Failed to sync for CPU\n", ctx->stream_id);

        NvBufSurfaceParams *in_params = &in_surf->surfaceList[0];
        size_t plane_size = in_params->planeParams.pitch[0] * in_params->planeParams.height[0];

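        // Copy the CPU-mapped frame into the pre-allocated dGPU buffer on this thread's stream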
        err = cudaMemcpyAsync(ctx->cuda_uyvy_buffer, in_params->mappedAddr.addr[0], plane_size,
                              cudaMemcpyHostToDevice, ctx->stream);
        if (err != cudaSuccess) {
            fprintf(stderr, "Stream %d: Failed to copy UYVY: %s\n",
                    ctx->stream_id, cudaGetErrorString(err));
            NvBufSurfaceUnMap(in_surf, 0, -1);
            continue;
        }

        err = cudaStreamSynchronize(ctx->stream);
        if (err != cudaSuccess) {
            fprintf(stderr, "Stream %d: Stream synchronize after copy failed: %s\n",
                    ctx->stream_id, cudaGetErrorString(err));
            NvBufSurfaceUnMap(in_surf, 0, -1);
            continue;
        }

        clock_gettime(CLOCK_MONOTONIC, &end);
        NvBufSurfaceUnMap(in_surf, 0, -1);
        ctx->latencies_ms.push_back(timespec_diff_ms(&start, &end));
    }

    return NULL;
}


void print_results(StreamContext *streams, int num_streams, double total_time_sec) {
    size_t total_samples = 0;

    printf("%-10s  %8s  %8s\n", "Stream", "Avg(ms)", "Max(ms)");
    printf("%-10s  %8s  %8s\n", "----------", "--------", "--------");

    for (int i = 0; i < num_streams; i++) {
        double sum = 0, max_val = 0;
        for (double latency : streams[i].latencies_ms) {
            sum += latency;
            if (latency > max_val) max_val = latency;
        }
        total_samples += streams[i].latencies_ms.size();

        char stream_name[16];
        snprintf(stream_name, sizeof(stream_name), "Stream %d", i);
        printf("%-10s  %8.2f  %8.2f\n", stream_name, sum / streams[i].latencies_ms.size(), max_val);
    }

    printf("\nBandwidth: %.2f MB/s\n",
           (streams[0].width * streams[0].height * 2 * total_samples) / (total_time_sec * 1024.0 * 1024.0));
}

int main(int argc, char *argv[]) {
    int num_streams = 8;
    int width = 3840;
    int height = 2160;
    int iterations = 500;

    if (argc > 1) {
        num_streams = atoi(argv[1]);
        if (num_streams < 1 || num_streams > 32)
            return fprintf(stderr, "Invalid number of streams (must be 1-32)\n"), 1;
    }
    if (argc > 2) {
        width = atoi(argv[2]);
        if (width < 1 || width > 7680)
            return fprintf(stderr, "Invalid width (must be 1-7680)\n"), 1;
    }
    if (argc > 3) {
        height = atoi(argv[3]);
        if (height < 1 || height > 4320)
            return fprintf(stderr, "Invalid height (must be 1-4320)\n"), 1;
    }
    if (argc > 4) {
        iterations = atoi(argv[4]);
        if (iterations < 1 || iterations > 10000)
            return fprintf(stderr, "Invalid iterations (must be 1-10000)\n"), 1;
    }

    cudaError_t err = cudaSetDevice(0);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to set CUDA device: %s\n", cudaGetErrorString(err));
        return 1;
    }

    StreamContext *streams = new StreamContext[num_streams];

    for (int i = 0; i < num_streams; i++) {
        streams[i].stream_id = i;
        streams[i].width = width;
        streams[i].height = height;
        streams[i].iterations = iterations;

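        // Allocate a pitch-linear UYVY surface in Tegra surface-array memory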
        NvBufSurfaceAllocateParams alloc_params = {0};
        alloc_params.params.gpuId = 0;
        alloc_params.params.width = width;
        alloc_params.params.height = height;
        alloc_params.params.colorFormat = NVBUF_COLOR_FORMAT_UYVY;
        alloc_params.params.layout = NVBUF_LAYOUT_PITCH;
        alloc_params.params.memType = NVBUF_MEM_SURFACE_ARRAY;
        alloc_params.memtag = NvBufSurfaceTag_NONE;

        int ret = NvBufSurfaceAllocate(&streams[i].tegra_buffer, 1, &alloc_params);
        if (ret != 0) {
            fprintf(stderr, "Failed to allocate NvBufSurface: %d\n", ret);
            return 1;
        }
        streams[i].tegra_buffer->numFilled = 1;

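        // Allocate a matching device buffer on the dGPU (pitch * height bytes)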
        NvBufSurfaceParams *params = &streams[i].tegra_buffer->surfaceList[0];
        err = cudaMalloc(&streams[i].cuda_uyvy_buffer,
                         params->planeParams.pitch[0] * params->planeParams.height[0]);
        if (err != cudaSuccess) {
            fprintf(stderr, "Failed to allocate CUDA UYVY buffer for stream %d: %s\n",
                    i, cudaGetErrorString(err));
            return 1;
        }

        err = cudaStreamCreate(&streams[i].stream);
        if (err != cudaSuccess) {
            fprintf(stderr, "Failed to create CUDA stream for stream %d: %s\n",
                    i, cudaGetErrorString(err));
            return 1;
        }
    }

    struct timespec bench_start, bench_end;
    clock_gettime(CLOCK_MONOTONIC, &bench_start);

    pthread_t *threads = new pthread_t[num_streams];
    for (int i = 0; i < num_streams; i++)
        pthread_create(&threads[i], NULL, run_stream_in_thread, &streams[i]);
    for (int i = 0; i < num_streams; i++)
        pthread_join(threads[i], NULL);
    delete[] threads;

    clock_gettime(CLOCK_MONOTONIC, &bench_end);
    print_results(streams, num_streams, timespec_diff_ms(&bench_start, &bench_end) / 1000.0);

    for (int i = 0; i < num_streams; i++) {
        cudaStreamDestroy(streams[i].stream);
        cudaFree(streams[i].cuda_uyvy_buffer);
        NvBufSurfaceDestroy(streams[i].tegra_buffer);
    }

    delete[] streams;

    return 0;
}

Results of copying with 1 thread and 8 threads, respectively:

     === 1 stream, 4K ===
     Stream       Avg(ms)   Max(ms)
     ----------  --------  --------
     Stream 0        6.27      6.49

     Bandwidth: 2392.32 MB/s

     === 8 streams, 4K ===
     Stream       Avg(ms)   Max(ms)
     ----------  --------  --------
     Stream 0       46.18     48.26
     Stream 1       46.19     48.30
     Stream 2       46.21     48.44
     Stream 3       46.20     48.38
     Stream 4       46.22     48.30
     Stream 5       46.23     48.10
     Stream 6       46.24     48.12
     Stream 7       46.25     48.20

     Bandwidth: 2715.03 MB/s

*** Please note that this reply is generated by LLM automatically ***

I’ll provide a comprehensive answer to help you optimize the camera frame copying process from Tegra memory (NVBUF_MEM_SURFACE_ARRAY) to the dGPU (RTX 6000 Ada) for processing on the Jetson AGX Orin.

Current Issues and Limitations

  1. Low bandwidth: Your current implementation achieves only 2-3 GB/s of bandwidth, which is relatively low considering the capabilities of the Jetson AGX Orin and the RTX 6000 Ada.
  2. memcpy bottleneck: Profiling reveals that most of the time is spent in the memcpy function, indicating that the copying process is the primary bottleneck.

Optimization Suggestions

  1. Use DMA (Direct Memory Access): Instead of using memcpy, consider utilizing the DMA engine on the Jetson AGX Orin to transfer data from Tegra memory to the dGPU. This can significantly reduce the CPU overhead and increase the transfer bandwidth.
  2. Use cudaMemcpy: If you still want to use a CPU-based approach, consider using cudaMemcpy instead of memcpy. cudaMemcpy is optimized for transferring data between the host (CPU) and the device (GPU) and can provide better performance (see the sketch after this list).
  3. Use multiple threads: You’ve already tried using 8 threads, but you can experiment with different thread counts to find the optimal number for your specific use case.
  4. Use a more efficient memory layout: Ensure that the memory layout of your camera frames is optimized for the GPU. For example, using a pitch-linear memory layout can improve memory access patterns and reduce memory bandwidth usage.
  5. Minimize memory allocations: Try to minimize the number of memory allocations and deallocations, as these can be expensive operations. Instead, allocate a large buffer and reuse it for each frame.
  6. Use NVIDIA’s NvMedia: NvMedia is a library provided by NVIDIA that offers optimized functions for video and image processing on their GPUs. You can use NvMedia to accelerate the copying process and take advantage of the GPU’s capabilities.
  7. Profile and optimize the GPU code: Once you’ve optimized the copying process, profile the GPU code to identify any performance bottlenecks and optimize the processing pipeline accordingly.
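
To make points 2 and 5 concrete, below is a minimal sketch of reusing one page-locked staging buffer per stream. The PinnedStage struct and function names are illustrative and not from the original code. cudaMemcpyAsync can only overlap with other work when the host pointer is page-locked; from pageable memory it falls back to an internal CPU copy. Note that this still pays one CPU memcpy from the mapped surface per frame; it only turns the host-to-device leg into a true DMA transfer.

#include <cstring>
#include <cuda_runtime.h>

// Illustrative sketch: one page-locked staging buffer per stream, reused for every frame.
struct PinnedStage {
    void *h_staging;   // page-locked host buffer
    size_t bytes;
};

cudaError_t stage_init(PinnedStage *s, size_t bytes) {
    s->bytes = bytes;
    // Page-locked memory lets cudaMemcpyAsync use DMA and overlap with GPU work.
    return cudaHostAlloc(&s->h_staging, bytes, cudaHostAllocDefault);
}

cudaError_t stage_copy(PinnedStage *s, const void *mapped_frame,
                       void *d_dst, cudaStream_t stream) {
    // CPU copy from the mapped (pageable) surface into the pinned buffer ...
    memcpy(s->h_staging, mapped_frame, s->bytes);
    // ... then an asynchronous host-to-device transfer on the caller's stream.
    return cudaMemcpyAsync(d_dst, s->h_staging, s->bytes,
                           cudaMemcpyHostToDevice, stream);
}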

Comparison of Approaches

Approach            Bandwidth (GB/s)  CPU Overhead
------------------  ----------------  ------------
memcpy (1 thread)   2-3               High
memcpy (8 threads)  2-3               Medium
DMA                 Up to 100         Low
cudaMemcpy          Up to 100         Medium
NvMedia             Up to 100         Low

Note that the bandwidth values are estimates and may vary depending on your specific use case and system configuration.

By implementing these optimization suggestions, you should be able to significantly improve the bandwidth and reduce the CPU overhead of the camera frame copying process on the Jetson AGX Orin.


I’m already using cudaMemcpy; it seems to be calling the regular memcpy internally:

   - 31.56% cudaMemcpyAsync
      - 31.55% __cudart329
         - 31.53% __cudart193
            - 31.53% 0xffff83a9f8c0
               - 31.44% 0xffff8393b7b0
                  - 31.41% 0xffff8393b2f8
                     - 31.39% 0xffff8393af00
                        - 31.37% 0xffff844330b0
                           - 30.47% 0xffff83b0a950
                              - 30.22% 0xffff83b034d0
                                 - 30.21% __memcpy_generic
                                    - 0xffffa4e4132c1eac
                                      0xffffa4e4144c9600
                                      0xffffa4e4144c915c

Hi,
Your implementation looks good: mapping the NvBufSurface to the CPU and copying it to GPU memory on the dGPU.

A further enhancement would be to map the NvBufSurface directly to the GPU and copy it to dGPU memory from there. However, this is not supported.

Thanks. I also tried calling cudaHostRegister on the mapped CPU memory, but it fails. So it looks like I’m already doing something reasonable.

I have a follow-up question: when I run the benchmark with 2 threads, they seem to contend on a shared lock inside cudaMemcpyAsync. Each thread already has its own CUDA stream (err = cudaStreamCreate(&streams[i].stream);). Do you know what this lock is and how I can avoid it?

pthread_rwlock_wrlock
Begins: 0.743286s
Ends: 0.748471s (+5.185 ms)

Call stack at 0.743286s:
libpthread-2.31.so!__pthread_rwlock_wrlock
libcuda.so.570.169[6 Frames]
libcudart.so.11.8.89[2 Frames]
libcudart.so.11.8.89!cudaMemcpyAsync
perf_benchmark!run_stream_in_thread(...)
Nsight Systems frames
libpthread-2.31.so!start_thread
libc-2.31.so!thread_start

Hi,
We would like to confirm your setup. Are you using AGX Orin or IGX Orin? Your setup (Orin + RTX 6000 Ada) looks identical to our IGX Orin developer kit.

It’s IGX Orin.

Hi,
Could you share which documents you are referring to? On the IGX Orin developer kit with IGX-SW 1.1.2, we run Docker to use the dGPU and do not use NvBufSurface by default. We would like to understand why you have this implementation.

I might have a non-standard setup. To simplify deployment, I’m trying to install the NVIDIA 570 driver directly on the machine (and not use Docker). I understand this will leave the iGPU/VIC unusable.

I was using NvBufSurface because that seems to be what the camera (or VI) outputs. Are there any alternatives?

Hi,
We don’t suggest using NvBufSurface on IGX since it is not tested there, although it may work because the libraries are shared between IGX and AGX Orin.

We would suggest developing your use case in the Docker image:
DeepStream | NVIDIA NGC

[Without display]

$ sudo docker pull nvcr.io/nvidia/deepstream:7.1-triton-arm-sbsa
$ sudo docker run -it --rm --runtime=nvidia --network=host -e NVIDIA_DRIVER_CAPABILITIES=compute,utility,video,graphics --gpus all --privileged -e DISPLAY=:0 -v /tmp/.X11-unix:/tmp/.X11-unix -v /etc/X11:/etc/X11 nvcr.io/nvidia/deepstream:7.1-triton-arm-sbsa

[With display]


$ export DISPLAY=:0
$ xhost +
$ sudo docker run -it --rm --runtime=nvidia --network=host -e NVIDIA_DRIVER_CAPABILITIES=compute,utility,video,graphics --gpus all --privileged -e DISPLAY=:0 -v /tmp/.X11-unix:/tmp/.X11-unix -v /etc/X11:/etc/X11 nvcr.io/nvidia/deepstream:7.1-triton-arm-sbsa

You may capture frame data into cudaMallocManaged() buffers directly (you may refer to 18_v4l2_camera_cuda_rgb and port it to IGX Orin).
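
For reference, here is a minimal sketch of that direction, assuming a frame is captured into a managed buffer on the CPU and then processed on the dGPU. The helper name, frame_bytes, and device 0 are placeholders; the actual 18_v4l2_camera_cuda_rgb sample is structured differently.

#include <cuda_runtime.h>

// Illustrative sketch: a managed buffer visible to both the CPU (capture) and the dGPU (processing).
int alloc_and_prefetch_frame(void **frame, size_t frame_bytes, cudaStream_t stream) {
    if (cudaMallocManaged(frame, frame_bytes) != cudaSuccess)
        return -1;

    // ... fill *frame with captured pixels on the CPU here ...

    // Optionally migrate the pages to the dGPU (device 0) before launching kernels;
    // whether this helps depends on the platform's managed-memory support.
    if (cudaMemPrefetchAsync(*frame, frame_bytes, 0, stream) != cudaSuccess)
        return -1;
    return 0;
}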

Thanks DaneLLL, I’ll take a look at your examples.