Improving Tegra memory => dGPU copy performance

I’m trying to copy camera frames from Tegra memory (NVBUF_MEM_SURFACE_ARRAY) to the dGPU (RTX 6000 Ada) for processing. My current code only achieves 2-3 GB/s of bandwidth. I profiled it, and most of the time seems to be spent in memcpy.

Here’s a simplified version of my camera processing code. Is there something wrong with the copying approach?

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <time.h>
#include <vector>
#include <cuda_runtime.h>
#include "nvbufsurface.h"

struct StreamContext {
    int stream_id;
    int width;
    int height;
    int iterations;
    NvBufSurface *tegra_buffer;
    void *cuda_uyvy_buffer;
    cudaStream_t stream;
    std::vector<double> latencies_ms;
};

double timespec_diff_ms(const struct timespec *start, const struct timespec *end) {
    return (end->tv_sec - start->tv_sec) * 1000.0 +
           (end->tv_nsec - start->tv_nsec) / 1000000.0;
}

void* run_stream_in_thread(void *arg) {
    StreamContext *ctx = (StreamContext *)arg;
    cudaError_t err = cudaSetDevice(0);
    if (err != cudaSuccess) {
        fprintf(stderr, "Thread %d: Failed to set CUDA device: %s\n",
                ctx->stream_id, cudaGetErrorString(err));
        return NULL;
    }

    ctx->latencies_ms.reserve(ctx->iterations);
    NvBufSurface *in_surf = ctx->tegra_buffer;

    for (int iter = 0; iter < ctx->iterations; iter++) {
        struct timespec start, end;
        clock_gettime(CLOCK_MONOTONIC, &start);

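        // Map the Tegra surface (buffer index 0, all planes) into CPU address space for reading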
        int ret = NvBufSurfaceMap(in_surf, 0, -1, NVBUF_MAP_READ);
        if (ret != 0) {
            fprintf(stderr, "Stream %d: Failed to map input surface\n", ctx->stream_id);
            continue;
        }

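        // Sync the hardware buffer to the CPU so the latest frame contents are visible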
        ret = NvBufSurfaceSyncForCpu(in_surf, 0, -1);
        if (ret != 0)
            fprintf(stderr, "Stream %d: Failed to sync for CPU\n", ctx->stream_id);

        NvBufSurfaceParams *in_params = &in_surf->surfaceList[0];
        size_t plane_size = in_params->planeParams.pitch[0] * in_params->planeParams.height[0];

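        // Copy the CPU-mapped frame into the pre-allocated dGPU buffer on this thread's stream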
        err = cudaMemcpyAsync(ctx->cuda_uyvy_buffer, in_params->mappedAddr.addr[0], plane_size,
                              cudaMemcpyHostToDevice, ctx->stream);
        if (err != cudaSuccess) {
            fprintf(stderr, "Stream %d: Failed to copy UYVY: %s\n",
                    ctx->stream_id, cudaGetErrorString(err));
            NvBufSurfaceUnMap(in_surf, 0, -1);
            continue;
        }

        err = cudaStreamSynchronize(ctx->stream);
        if (err != cudaSuccess) {
            fprintf(stderr, "Stream %d: Stream synchronize after copy failed: %s\n",
                    ctx->stream_id, cudaGetErrorString(err));
            NvBufSurfaceUnMap(in_surf, 0, -1);
            continue;
        }

        clock_gettime(CLOCK_MONOTONIC, &end);
        NvBufSurfaceUnMap(in_surf, 0, -1);
        ctx->latencies_ms.push_back(timespec_diff_ms(&start, &end));
    }

    return NULL;
}


void print_results(StreamContext *streams, int num_streams, double total_time_sec) {
    size_t total_samples = 0;

    printf("%-10s  %8s  %8s\n", "Stream", "Avg(ms)", "Max(ms)");
    printf("%-10s  %8s  %8s\n", "----------", "--------", "--------");

    for (int i = 0; i < num_streams; i++) {
        double sum = 0, max_val = 0;
        for (double latency : streams[i].latencies_ms) {
            sum += latency;
            if (latency > max_val) max_val = latency;
        }
        total_samples += streams[i].latencies_ms.size();

        char stream_name[16];
        snprintf(stream_name, sizeof(stream_name), "Stream %d", i);
        printf("%-10s  %8.2f  %8.2f\n", stream_name, sum / streams[i].latencies_ms.size(), max_val);
    }

    printf("\nBandwidth: %.2f MB/s\n",
           (streams[0].width * streams[0].height * 2 * total_samples) / (total_time_sec * 1024.0 * 1024.0));
}

int main(int argc, char *argv[]) {
    int num_streams = 8;
    int width = 3840;
    int height = 2160;
    int iterations = 500;

    if (argc > 1) {
        num_streams = atoi(argv[1]);
        if (num_streams < 1 || num_streams > 32)
            return fprintf(stderr, "Invalid number of streams (must be 1-32)\n"), 1;
    }
    if (argc > 2) {
        width = atoi(argv[2]);
        if (width < 1 || width > 7680)
            return fprintf(stderr, "Invalid width (must be 1-7680)\n"), 1;
    }
    if (argc > 3) {
        height = atoi(argv[3]);
        if (height < 1 || height > 4320)
            return fprintf(stderr, "Invalid height (must be 1-4320)\n"), 1;
    }
    if (argc > 4) {
        iterations = atoi(argv[4]);
        if (iterations < 1 || iterations > 10000)
            return fprintf(stderr, "Invalid iterations (must be 1-10000)\n"), 1;
    }

    cudaError_t err = cudaSetDevice(0);
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to set CUDA device: %s\n", cudaGetErrorString(err));
        return 1;
    }

    StreamContext *streams = new StreamContext[num_streams];

    for (int i = 0; i < num_streams; i++) {
        streams[i].stream_id = i;
        streams[i].width = width;
        streams[i].height = height;
        streams[i].iterations = iterations;

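        // Allocate a pitch-linear UYVY surface in Tegra surface-array memory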
        NvBufSurfaceAllocateParams alloc_params = {0};
        alloc_params.params.gpuId = 0;
        alloc_params.params.width = width;
        alloc_params.params.height = height;
        alloc_params.params.colorFormat = NVBUF_COLOR_FORMAT_UYVY;
        alloc_params.params.layout = NVBUF_LAYOUT_PITCH;
        alloc_params.params.memType = NVBUF_MEM_SURFACE_ARRAY;
        alloc_params.memtag = NvBufSurfaceTag_NONE;

        int ret = NvBufSurfaceAllocate(&streams[i].tegra_buffer, 1, &alloc_params);
        if (ret != 0) {
            fprintf(stderr, "Failed to allocate NvBufSurface: %d\n", ret);
            return 1;
        }
        streams[i].tegra_buffer->numFilled = 1;

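        // Allocate a matching device buffer on the dGPU (pitch * height bytes)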
        NvBufSurfaceParams *params = &streams[i].tegra_buffer->surfaceList[0];
        err = cudaMalloc(&streams[i].cuda_uyvy_buffer,
                         params->planeParams.pitch[0] * params->planeParams.height[0]);
        if (err != cudaSuccess) {
            fprintf(stderr, "Failed to allocate CUDA UYVY buffer for stream %d: %s\n",
                    i, cudaGetErrorString(err));
            return 1;
        }

        err = cudaStreamCreate(&streams[i].stream);
        if (err != cudaSuccess) {
            fprintf(stderr, "Failed to create CUDA stream for stream %d: %s\n",
                    i, cudaGetErrorString(err));
            return 1;
        }
    }

    struct timespec bench_start, bench_end;
    clock_gettime(CLOCK_MONOTONIC, &bench_start);

    pthread_t *threads = new pthread_t[num_streams];
    for (int i = 0; i < num_streams; i++)
        pthread_create(&threads[i], NULL, run_stream_in_thread, &streams[i]);
    for (int i = 0; i < num_streams; i++)
        pthread_join(threads[i], NULL);
    delete[] threads;

    clock_gettime(CLOCK_MONOTONIC, &bench_end);
    print_results(streams, num_streams, timespec_diff_ms(&bench_start, &bench_end) / 1000.0);

    for (int i = 0; i < num_streams; i++) {
        cudaStreamDestroy(streams[i].stream);
        cudaFree(streams[i].cuda_uyvy_buffer);
        NvBufSurfaceDestroy(streams[i].tegra_buffer);
    }

    delete[] streams;

    return 0;
}

Results of copying with 1 thread and 8 threads, respectively:

     === 1 stream, 4K ===
     Stream       Avg(ms)   Max(ms)
     ----------  --------  --------
     Stream 0        6.27      6.49

     Bandwidth: 2392.32 MB/s

     === 8 streams, 4K ===
     Stream       Avg(ms)   Max(ms)
     ----------  --------  --------
     Stream 0       46.18     48.26
     Stream 1       46.19     48.30
     Stream 2       46.21     48.44
     Stream 3       46.20     48.38
     Stream 4       46.22     48.30
     Stream 5       46.23     48.10
     Stream 6       46.24     48.12
     Stream 7       46.25     48.20

     Bandwidth: 2715.03 MB/s

*** Please note that this reply is generated by LLM automatically ***

I’ll provide a comprehensive answer to help you optimize the camera frame copying process from Tegra memory (NVBUF_MEM_SURFACE_ARRAY) to the dGPU (RTX 6000 Ada) for processing on the Jetson AGX Orin.

Current Issues and Limitations

  1. Low bandwidth: Your current implementation achieves only 2-3 GB/s of bandwidth, which is relatively low considering the capabilities of the Jetson AGX Orin and the RTX 6000 Ada.
  2. memcpy bottleneck: Profiling reveals that most of the time is spent in the memcpy function, indicating that the copying process is the primary bottleneck.

Optimization Suggestions

  1. Use DMA (Direct Memory Access): Instead of using memcpy, consider utilizing the DMA engine on the Jetson AGX Orin to transfer data from Tegra memory to the dGPU. This can significantly reduce the CPU overhead and increase the transfer bandwidth.
  2. Use cudaMemcpy: If you still want to use a CPU-based approach, consider using cudaMemcpy instead of memcpy. cudaMemcpy is optimized for transferring data between the host (CPU) and the device (GPU) and can provide better performance (see the sketch after this list).
  3. Use multiple threads: You’ve already tried using 8 threads, but you can experiment with different thread counts to find the optimal number for your specific use case.
  4. Use a more efficient memory layout: Ensure that the memory layout of your camera frames is optimized for the GPU. For example, using a pitch-linear memory layout can improve memory access patterns and reduce memory bandwidth usage.
  5. Minimize memory allocations: Try to minimize the number of memory allocations and deallocations, as these can be expensive operations. Instead, allocate a large buffer and reuse it for each frame.
  6. Use NVIDIA’s NvMedia: NvMedia is a library provided by NVIDIA that offers optimized functions for video and image processing on their GPUs. You can use NvMedia to accelerate the copying process and take advantage of the GPU’s capabilities.
  7. Profile and optimize the GPU code: Once you’ve optimized the copying process, profile the GPU code to identify any performance bottlenecks and optimize the processing pipeline accordingly.
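
To make points 2 and 5 concrete, below is a minimal sketch of reusing one page-locked staging buffer per stream. The PinnedStage struct and function names are illustrative and not from the original code. cudaMemcpyAsync can only overlap with other work when the host pointer is page-locked; from pageable memory it falls back to an internal CPU copy. Note that this still pays one CPU memcpy from the mapped surface per frame; it only turns the host-to-device leg into a true DMA transfer.

#include <cstring>
#include <cuda_runtime.h>

// Illustrative sketch: one page-locked staging buffer per stream, reused for every frame.
struct PinnedStage {
    void *h_staging;   // page-locked host buffer
    size_t bytes;
};

cudaError_t stage_init(PinnedStage *s, size_t bytes) {
    s->bytes = bytes;
    // Page-locked memory lets cudaMemcpyAsync use DMA and overlap with GPU work.
    return cudaHostAlloc(&s->h_staging, bytes, cudaHostAllocDefault);
}

cudaError_t stage_copy(PinnedStage *s, const void *mapped_frame,
                       void *d_dst, cudaStream_t stream) {
    // CPU copy from the mapped (pageable) surface into the pinned buffer ...
    memcpy(s->h_staging, mapped_frame, s->bytes);
    // ... then an asynchronous host-to-device transfer on the caller's stream.
    return cudaMemcpyAsync(d_dst, s->h_staging, s->bytes,
                           cudaMemcpyHostToDevice, stream);
}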

Comparison of Approaches

Approach            Bandwidth (GB/s)  CPU Overhead
------------------  ----------------  ------------
memcpy (1 thread)   2-3               High
memcpy (8 threads)  2-3               Medium
DMA                 Up to 100         Low
cudaMemcpy          Up to 100         Medium
NvMedia             Up to 100         Low

Note that the bandwidth values are estimates and may vary depending on your specific use case and system configuration.

By implementing these optimization suggestions, you should be able to significantly improve the bandwidth and reduce the CPU overhead of the camera frame copying process on the Jetson AGX Orin.


I’m already using cudaMemcpy; it seems to be calling the regular memcpy internally:

   - 31.56% cudaMemcpyAsync
      - 31.55% __cudart329
         - 31.53% __cudart193
            - 31.53% 0xffff83a9f8c0
               - 31.44% 0xffff8393b7b0
                  - 31.41% 0xffff8393b2f8
                     - 31.39% 0xffff8393af00
                        - 31.37% 0xffff844330b0
                           - 30.47% 0xffff83b0a950
                              - 30.22% 0xffff83b034d0
                                 - 30.21% __memcpy_generic
                                    - 0xffffa4e4132c1eac
                                      0xffffa4e4144c9600
                                      0xffffa4e4144c915c

Hi,
Your implementation looks good: mapping the NvBufSurface to the CPU and copying it to GPU memory on the dGPU.

A further enhancement would be to map the NvBufSurface directly to the GPU and copy it to dGPU memory from there. However, this is not supported.

Thanks. I also tried calling cudaHostRegister on the mapped CPU memory, but it fails. So it looks like I’m already doing something reasonable.

I have a follow-up question: when I run the benchmark with 2 threads, they seem to contend on a shared lock inside cudaMemcpyAsync. Each thread already has its own CUDA stream (err = cudaStreamCreate(&streams[i].stream);). Do you know what this lock is and how I can avoid it?

pthread_rwlock_wrlock
Begins: 0.743286s
Ends: 0.748471s (+5.185 ms)

Call stack at 0.743286s:
libpthread-2.31.so!__pthread_rwlock_wrlock
libcuda.so.570.169[6 Frames]
libcudart.so.11.8.89[2 Frames]
libcudart.so.11.8.89!cudaMemcpyAsync
perf_benchmark!run_stream_in_thread(...)
Nsight Systems frames
libpthread-2.31.so!start_thread
libc-2.31.so!thread_start

Hi,
We would like to confirm your setup. Are you using AGX Orin or IGX Orin? Your setup (Orin + RTX 6000 Ada) looks identical to our IGX Orin developer kit.

It’s IGX Orin.

Hi,
Could you share which documents you are referring to? On the IGX Orin developer kit with IGX-SW 1.1.2, we run Docker to use the dGPU and do not use NvBufSurface by default. We would like to understand why you have this implementation.

I might have a non-standard setup. To simplify deployment, I’m trying to install the NVIDIA 570 driver directly on the machine (and not use Docker). I understand this will leave the iGPU/VIC unusable.

I was using NvBufSurface because that seems to be what the camera (or VI) outputs. Are there any alternatives?

Hi,
We don’t suggest using NvBufSurface on IGX since it is not tested there, although it may work because the libraries are shared between IGX and AGX Orin.

We would suggest developing your use case in the Docker image:
DeepStream | NVIDIA NGC

[Without display]

$ sudo docker pull nvcr.io/nvidia/deepstream:7.1-triton-arm-sbsa
$ sudo docker run -it --rm --runtime=nvidia --network=host -e NVIDIA_DRIVER_CAPABILITIES=compute,utility,video,graphics --gpus all --privileged -e DISPLAY=:0 -v /tmp/.X11-unix:/tmp/.X11-unix -v /etc/X11:/etc/X11 nvcr.io/nvidia/deepstream:7.1-triton-arm-sbsa

[With display]


$ export DISPLAY=:0
$ xhost +
$ sudo docker run -it --rm --runtime=nvidia --network=host -e NVIDIA_DRIVER_CAPABILITIES=compute,utility,video,graphics --gpus all --privileged -e DISPLAY=:0 -v /tmp/.X11-unix:/tmp/.X11-unix -v /etc/X11:/etc/X11 nvcr.io/nvidia/deepstream:7.1-triton-arm-sbsa

You may capture frame data into cudaMallocManaged() buffers directly (you may refer to 18_v4l2_camera_cuda_rgb and port it to IGX Orin).
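
For reference, here is a minimal sketch of that direction, assuming a frame is captured into a managed buffer on the CPU and then processed on the dGPU. The helper name, frame_bytes, and device 0 are placeholders; the actual 18_v4l2_camera_cuda_rgb sample is structured differently.

#include <cuda_runtime.h>

// Illustrative sketch: a managed buffer visible to both the CPU (capture) and the dGPU (processing).
int alloc_and_prefetch_frame(void **frame, size_t frame_bytes, cudaStream_t stream) {
    if (cudaMallocManaged(frame, frame_bytes) != cudaSuccess)
        return -1;

    // ... fill *frame with captured pixels on the CPU here ...

    // Optionally migrate the pages to the dGPU (device 0) before launching kernels;
    // whether this helps depends on the platform's managed-memory support.
    if (cudaMemPrefetchAsync(*frame, frame_bytes, 0, stream) != cudaSuccess)
        return -1;
    return 0;
}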

Thanks DaneLLL, I’ll take a look at your examples.