nvJPEG is encoder is not compressing correctly

I am fetching desktop screenshot frame using DXGI (duplication API) and then converting it to NV12 (Y UV) format and then passing it to nvJPEG for compressing/encoding to jpeg, however the image showing all green foreground with actual image/text in the background
here is my code:

public byte\[\] Encode_V3(Texture2D frame)
{
    int w = frame.Description.Width;
    int h = frame.Description.Height;

    CreateNV12Targets(w, h);

    ConvertBGRAtoNV12_GPU(frame, w, h);

    RegisterNV12TexturesForCUDA();
    var (yPtr, uvPtr) = MapNV12_GPU_SubResource(w, h);
    //var (yPtr, uvPtr) = MapNV12_GPU();

    var bytes = EncodeNvJPEG(yPtr, uvPtr, w, h);

    cudaFree(yPtr);
    cudaFree(uvPtr);

    // Unregister CUDA resources
    CudaInterop.cudaGraphicsUnregisterResource(cuY);
    CudaInterop.cudaGraphicsUnregisterResource(cuUV);

    // Dispose DirectX resources
    yUav?.Dispose();
    uvUav?.Dispose();
    bgraSrv?.Dispose();
    yPlaneTex?.Dispose();
    uvPlaneTex?.Dispose();


    return bytes;
}

void CreateNV12Targets(int width, int height)
{
    yPlaneTex = new Texture2D(\_device, new Texture2DDescription
    {
        Width = width,
        Height = height,
        Format = Format.R8_UNorm,
        MipLevels = 1,
        ArraySize = 1,
        SampleDescription = new SampleDescription(1, 0),
        BindFlags = BindFlags.UnorderedAccess | BindFlags.ShaderResource,
        Usage = ResourceUsage.Default
    });

    uvPlaneTex = new Texture2D(\_device, new Texture2DDescription
    {
        Width = width / 2,
        Height = height / 2,
        Format = Format.R8G8_UNorm,
        MipLevels = 1,
        ArraySize = 1,
        SampleDescription = new SampleDescription(1, 0),
        BindFlags = BindFlags.UnorderedAccess | BindFlags.ShaderResource,
        Usage = ResourceUsage.Default
    });

    yUav = new UnorderedAccessView(\_device, yPlaneTex);
    uvUav = new UnorderedAccessView(\_device, uvPlaneTex);
}

private const string BGRA_TO_NV12 = @"Texture2D<uint4> src : register(t0);

RWTexture2D Yout : register(u0);
RWTexture2D UVout : register(u1);

// Helper function to compute U and V from a BGRA pixel
float2 ComputeUV(uint4 px)
{
float B = px.x / 255.0f;
float G = px.y / 255.0f;
float R = px.z / 255.0f;

float U = -0.1146f * R - 0.3854f * G + 0.5000f * B + 0.5f;
float V =  0.5000f * R - 0.4542f * G - 0.0458f * B + 0.5f;

return float2(U, V);

}

[numthreads(8, 8, 1)]
void mainCS(uint3 id : SV_DispatchThreadID)
{
uint width, height;
src.GetDimensions(width, height);

if (id.x >= width || id.y >= height)
    return;

uint4 px = src[id.xy];

// Compute Y
float R = px.z / 255.0f;
float G = px.y / 255.0f;
float B = px.x / 255.0f;
float Y = 0.2126f * R + 0.7152f * G + 0.0722f * B;

Yout[id.xy] = (uint)(Y * 255.0f);

// Compute UV for top-left of each 2x2 block
if ((id.x & 1) == 0 && (id.y & 1) == 0)
{
    uint2 pos00 = id.xy;
    uint2 pos01 = uint2(min(id.x, width - 1), min(id.y + 1, height - 1));
    uint2 pos10 = uint2(min(id.x + 1, width - 1), min(id.y, height - 1));
    uint2 pos11 = uint2(min(id.x + 1, width - 1), min(id.y + 1, height - 1));

    float2 uv00 = ComputeUV(src[pos00]);
    float2 uv01 = ComputeUV(src[pos01]);
    float2 uv10 = ComputeUV(src[pos10]);
    float2 uv11 = ComputeUV(src[pos11]);

    float Uavg = (uv00.x + uv01.x + uv10.x + uv11.x) * 0.25f;
    float Vavg = (uv00.y + uv01.y + uv10.y + uv11.y) * 0.25f;

    UVout[id.xy / 2] = uint2(Uavg * 255.0f, Vavg * 255.0f);
}

}";

    private ComputeShader LoadBGRAtoNV12CS(SharpDX.Direct3D11.Device device)
    {
        //UV_WRITE_TEST_SHADER showing 70% green and 30% blue with gradient shift to blue from green
        var bytecode = SharpDX.D3DCompiler.ShaderBytecode.Compile(
            BGRA_TO_NV12,//BGRAtoNV12_HLSL,
            "mainCS",
            "cs_5_0",
            SharpDX.D3DCompiler.ShaderFlags.OptimizationLevel3
        );

        return new ComputeShader(device, bytecode.Bytecode);
    }

    private void ConvertBGRAtoNV12_GPU(Texture2D frame, int width, int height)
    {
        //var shaderBytes = File.ReadAllBytes("BGRAtoNV12.cso");
        //bgraToNv12CS = new ComputeShader(_device, shaderBytes);

        if (bgraToNv12CS == null)
            bgraToNv12CS = LoadBGRAtoNV12CS(_device);

        bgraSrv = new ShaderResourceView(_device, frame);

        _device.ImmediateContext.ComputeShader.Set(bgraToNv12CS);
        _device.ImmediateContext.ComputeShader.SetShaderResource(0, bgraSrv);
        _device.ImmediateContext.ComputeShader.SetUnorderedAccessView(0, yUav);
        _device.ImmediateContext.ComputeShader.SetUnorderedAccessView(1, uvUav);

        int gx = (width + 7) / 8;
        int gy = (height + 7) / 8;
        _device.ImmediateContext.Dispatch(gx, gy, 1);

        // Cleanup
        _device.ImmediateContext.ComputeShader.Set(null);
        _device.ImmediateContext.ComputeShader.SetShaderResource(0, null);
        _device.ImmediateContext.ComputeShader.SetUnorderedAccessView(0, null);
        _device.ImmediateContext.ComputeShader.SetUnorderedAccessView(1, null);
    }

    private byte[] EncodeNvJPEG(IntPtr yPtr, IntPtr uvPtr, int w, int h)
    {
        NvjpegImage img = new NvjpegImage
        {
            channel = new[] { yPtr, uvPtr, IntPtr.Zero },
            pitch = new[] { (UIntPtr)w, (UIntPtr)w, UIntPtr.Zero }
        };

        LocalState st = localStates[0];

        int result = nvjpegEncode(
            nvjpegHandle,
            st.encodeState,
            encoderParams,
            ref img,
            (int)nvjpegChromaSubsampling_t.NVJPEG_CSS_420,
            (int)nvjpegInputFormat_t.NVJPEG_INPUT_NV12,
            w, h,
            st.cudaStream);

        if (result != 0)
            throw new Exception($"nvJPEG encode failed: {result}");

        result = CudaInterop.cudaStreamSynchronize(st.cudaStream);
        if (result != 0)
        {
            throw new Exception($"cuda sync failed: {result}");
        }
        // Get JPEG size
        UIntPtr jpegSize = UIntPtr.Zero;
        result = nvjpegEncodeRetrieveBitstream(nvjpegHandle, st.encodeState, IntPtr.Zero, ref jpegSize, st.cudaStream);
        if (result != 0)
            throw new Exception("Failed to get JPEG size");

        byte[] buf = new byte[(int)jpegSize];
        var handle = GCHandle.Alloc(buf, GCHandleType.Pinned);

        try
        {
            result = nvjpegEncodeRetrieveBitstream(
                nvjpegHandle,
                st.encodeState,
                handle.AddrOfPinnedObject(),
                ref jpegSize,
                st.cudaStream);

            if (result != 0)
                throw new Exception("Failed to retrieve JPEG bitstream");
        }
        catch (Exception e) { }
        finally
        {
            handle.Free();
        }

        return buf;
    }

    private void RegisterNV12TexturesForCUDA()
    {
        uint flags = 0x0; // No special flags; can use cudaGraphicsRegisterFlagsNone = 0

        // Register Y plane
        int resY = CudaInterop.cudaGraphicsD3D11RegisterResource(out cuY, yPlaneTex.NativePointer, flags);
        if (resY != 0)
            throw new Exception($"Failed to register Y plane with CUDA: {resY}");

        // Register UV plane
        int resUV = CudaInterop.cudaGraphicsD3D11RegisterResource(out cuUV, uvPlaneTex.NativePointer, flags);
        if (resUV != 0)
            throw new Exception($"Failed to register UV plane with CUDA: {resUV}");
    }

    private (IntPtr yPtr, IntPtr uvPtr) MapNV12_GPU()
    {
        IntPtr[] arr = { cuY, cuUV };
        int mapRes = CudaInterop.cudaGraphicsMapResources((uint)arr.Length, arr, _cudaStream);
        if (mapRes != 0)
            throw new Exception($"Failed to map NV12 textures: {mapRes}");

        IntPtr yCudaPtr, uvCudaPtr;
        UIntPtr yCudaPtrSize, uvCudaPtrSize;
        int err = CudaInterop.cudaGraphicsResourceGetMappedPointer(out yCudaPtr, out yCudaPtrSize, cuY);
        if (err != 0)
            throw new Exception($"cudaGraphicsResourceGetMappedPointer failed: {err}");

        err = CudaInterop.cudaGraphicsResourceGetMappedPointer(out uvCudaPtr, out uvCudaPtrSize, cuUV);
        if (err != 0)
            throw new Exception($"cudaGraphicsResourceGetMappedPointer failed: {err}");

        return (yCudaPtr, uvCudaPtr);
    }

    private (IntPtr yArray, IntPtr uvArray) MapNV12_GPU_SubResource(int w, int h)
    {
        IntPtr[] resources = { cuY, cuUV };

        // Map both resources for CUDA access
        int mapRes = CudaInterop.cudaGraphicsMapResources((uint)resources.Length, resources, _cudaStream);
        if (mapRes != 0)
            throw new Exception($"Failed to map NV12 textures: {mapRes}");

        // Get the mapped CUDA arrays for each texture
        IntPtr yCudaArray, uvCudaArray;
        int err;

        err = CudaInterop.cudaGraphicsSubResourceGetMappedArray(out yCudaArray, cuY, 0, 0);
        if (err != 0)
            throw new Exception($"Failed to get mapped array for Y plane: {err}");

        err = CudaInterop.cudaGraphicsSubResourceGetMappedArray(out uvCudaArray, cuUV, 0, 0);
        if (err != 0)
            throw new Exception($"Failed to get mapped array for UV plane: {err}");

        // Assume you have mapped arrays yArray and uvArray (cudaArray_t)
        IntPtr d_yLinear = IntPtr.Zero, d_uvLinear = IntPtr.Zero;
        int cudaMemcpyDeviceToDevice = 3;

        cudaMalloc(out d_yLinear, (UIntPtr)(w * h));          // Y plane
        cudaMalloc(out d_uvLinear, (UIntPtr)(w * h / 2));    // UV plane

        //// Copy Y plane from cudaArray_t to linear device memory
        //err = CudaInterop.cudaMemcpy2DFromArray(
        //    d_yLinear, (UIntPtr)(w),
        //    yCudaArray, UIntPtr.Zero, UIntPtr.Zero,
        //    (UIntPtr)w, (UIntPtr)h,
        //    cudaMemcpyDeviceToDevice
        //);

        UIntPtr yPitch = (UIntPtr)w;  // pitch = width bytes, 1 byte per pixel
        UIntPtr yWidthBytes = (UIntPtr)w;
        UIntPtr yHeight = (UIntPtr)h;

        err = CudaInterop.cudaMemcpy2DFromArray(
            d_yLinear,
            yPitch,
            yCudaArray,
            UIntPtr.Zero,
            UIntPtr.Zero,
            yWidthBytes,
            yHeight,
            cudaMemcpyDeviceToDevice);

        if (err != 0)
        {
            throw new Exception($"Failed to copy from cudaArray to device buffer for Y plane: {err}");
        }

        //// Copy UV plane
        //err = CudaInterop.cudaMemcpy2DFromArray(
        //    d_uvLinear, (UIntPtr)(w),
        //    uvCudaArray, UIntPtr.Zero, UIntPtr.Zero,
        //    (UIntPtr)w, (UIntPtr)(h / 2),
        //    cudaMemcpyDeviceToDevice
        //);

        UIntPtr uvPitch = (UIntPtr)(w);  // NV12 UV pitch = same as Y plane width in bytes
        UIntPtr uvWidthBytes = (UIntPtr)(w); // width bytes for UV plane because 2 bytes per pixel and width is halved
        UIntPtr uvHeight = (UIntPtr)(h / 2);

        err = CudaInterop.cudaMemcpy2DFromArray(
            d_uvLinear,
            uvPitch,
            uvCudaArray,
            UIntPtr.Zero,
            UIntPtr.Zero,
            uvWidthBytes,
            uvHeight,
            cudaMemcpyDeviceToDevice);

        if (err != 0)
        {
            throw new Exception($"Failed to copy from cudaArray to device buffer for UV plane: {err}");
        }

        // Unmap resources immediately
        CudaInterop.cudaGraphicsUnmapResources(resources.Length, resources, _cudaStream);

        return (d_yLinear, d_uvLinear);
    }