I am fetching desktop screenshot frame using DXGI (duplication API) and then converting it to NV12 (Y UV) format and then passing it to nvJPEG for compressing/encoding to jpeg, however the image showing all green foreground with actual image/text in the background
here is my code:
public byte\[\] Encode_V3(Texture2D frame)
{
int w = frame.Description.Width;
int h = frame.Description.Height;
CreateNV12Targets(w, h);
ConvertBGRAtoNV12_GPU(frame, w, h);
RegisterNV12TexturesForCUDA();
var (yPtr, uvPtr) = MapNV12_GPU_SubResource(w, h);
//var (yPtr, uvPtr) = MapNV12_GPU();
var bytes = EncodeNvJPEG(yPtr, uvPtr, w, h);
cudaFree(yPtr);
cudaFree(uvPtr);
// Unregister CUDA resources
CudaInterop.cudaGraphicsUnregisterResource(cuY);
CudaInterop.cudaGraphicsUnregisterResource(cuUV);
// Dispose DirectX resources
yUav?.Dispose();
uvUav?.Dispose();
bgraSrv?.Dispose();
yPlaneTex?.Dispose();
uvPlaneTex?.Dispose();
return bytes;
}
void CreateNV12Targets(int width, int height)
{
yPlaneTex = new Texture2D(\_device, new Texture2DDescription
{
Width = width,
Height = height,
Format = Format.R8_UNorm,
MipLevels = 1,
ArraySize = 1,
SampleDescription = new SampleDescription(1, 0),
BindFlags = BindFlags.UnorderedAccess | BindFlags.ShaderResource,
Usage = ResourceUsage.Default
});
uvPlaneTex = new Texture2D(\_device, new Texture2DDescription
{
Width = width / 2,
Height = height / 2,
Format = Format.R8G8_UNorm,
MipLevels = 1,
ArraySize = 1,
SampleDescription = new SampleDescription(1, 0),
BindFlags = BindFlags.UnorderedAccess | BindFlags.ShaderResource,
Usage = ResourceUsage.Default
});
yUav = new UnorderedAccessView(\_device, yPlaneTex);
uvUav = new UnorderedAccessView(\_device, uvPlaneTex);
}
private const string BGRA_TO_NV12 = @"Texture2D<uint4> src : register(t0);
RWTexture2D Yout : register(u0);
RWTexture2D UVout : register(u1);
// Helper function to compute U and V from a BGRA pixel
float2 ComputeUV(uint4 px)
{
float B = px.x / 255.0f;
float G = px.y / 255.0f;
float R = px.z / 255.0f;
float U = -0.1146f * R - 0.3854f * G + 0.5000f * B + 0.5f;
float V = 0.5000f * R - 0.4542f * G - 0.0458f * B + 0.5f;
return float2(U, V);
}
[numthreads(8, 8, 1)]
void mainCS(uint3 id : SV_DispatchThreadID)
{
uint width, height;
src.GetDimensions(width, height);
if (id.x >= width || id.y >= height)
return;
uint4 px = src[id.xy];
// Compute Y
float R = px.z / 255.0f;
float G = px.y / 255.0f;
float B = px.x / 255.0f;
float Y = 0.2126f * R + 0.7152f * G + 0.0722f * B;
Yout[id.xy] = (uint)(Y * 255.0f);
// Compute UV for top-left of each 2x2 block
if ((id.x & 1) == 0 && (id.y & 1) == 0)
{
uint2 pos00 = id.xy;
uint2 pos01 = uint2(min(id.x, width - 1), min(id.y + 1, height - 1));
uint2 pos10 = uint2(min(id.x + 1, width - 1), min(id.y, height - 1));
uint2 pos11 = uint2(min(id.x + 1, width - 1), min(id.y + 1, height - 1));
float2 uv00 = ComputeUV(src[pos00]);
float2 uv01 = ComputeUV(src[pos01]);
float2 uv10 = ComputeUV(src[pos10]);
float2 uv11 = ComputeUV(src[pos11]);
float Uavg = (uv00.x + uv01.x + uv10.x + uv11.x) * 0.25f;
float Vavg = (uv00.y + uv01.y + uv10.y + uv11.y) * 0.25f;
UVout[id.xy / 2] = uint2(Uavg * 255.0f, Vavg * 255.0f);
}
}";
private ComputeShader LoadBGRAtoNV12CS(SharpDX.Direct3D11.Device device)
{
//UV_WRITE_TEST_SHADER showing 70% green and 30% blue with gradient shift to blue from green
var bytecode = SharpDX.D3DCompiler.ShaderBytecode.Compile(
BGRA_TO_NV12,//BGRAtoNV12_HLSL,
"mainCS",
"cs_5_0",
SharpDX.D3DCompiler.ShaderFlags.OptimizationLevel3
);
return new ComputeShader(device, bytecode.Bytecode);
}
private void ConvertBGRAtoNV12_GPU(Texture2D frame, int width, int height)
{
//var shaderBytes = File.ReadAllBytes("BGRAtoNV12.cso");
//bgraToNv12CS = new ComputeShader(_device, shaderBytes);
if (bgraToNv12CS == null)
bgraToNv12CS = LoadBGRAtoNV12CS(_device);
bgraSrv = new ShaderResourceView(_device, frame);
_device.ImmediateContext.ComputeShader.Set(bgraToNv12CS);
_device.ImmediateContext.ComputeShader.SetShaderResource(0, bgraSrv);
_device.ImmediateContext.ComputeShader.SetUnorderedAccessView(0, yUav);
_device.ImmediateContext.ComputeShader.SetUnorderedAccessView(1, uvUav);
int gx = (width + 7) / 8;
int gy = (height + 7) / 8;
_device.ImmediateContext.Dispatch(gx, gy, 1);
// Cleanup
_device.ImmediateContext.ComputeShader.Set(null);
_device.ImmediateContext.ComputeShader.SetShaderResource(0, null);
_device.ImmediateContext.ComputeShader.SetUnorderedAccessView(0, null);
_device.ImmediateContext.ComputeShader.SetUnorderedAccessView(1, null);
}
private byte[] EncodeNvJPEG(IntPtr yPtr, IntPtr uvPtr, int w, int h)
{
NvjpegImage img = new NvjpegImage
{
channel = new[] { yPtr, uvPtr, IntPtr.Zero },
pitch = new[] { (UIntPtr)w, (UIntPtr)w, UIntPtr.Zero }
};
LocalState st = localStates[0];
int result = nvjpegEncode(
nvjpegHandle,
st.encodeState,
encoderParams,
ref img,
(int)nvjpegChromaSubsampling_t.NVJPEG_CSS_420,
(int)nvjpegInputFormat_t.NVJPEG_INPUT_NV12,
w, h,
st.cudaStream);
if (result != 0)
throw new Exception($"nvJPEG encode failed: {result}");
result = CudaInterop.cudaStreamSynchronize(st.cudaStream);
if (result != 0)
{
throw new Exception($"cuda sync failed: {result}");
}
// Get JPEG size
UIntPtr jpegSize = UIntPtr.Zero;
result = nvjpegEncodeRetrieveBitstream(nvjpegHandle, st.encodeState, IntPtr.Zero, ref jpegSize, st.cudaStream);
if (result != 0)
throw new Exception("Failed to get JPEG size");
byte[] buf = new byte[(int)jpegSize];
var handle = GCHandle.Alloc(buf, GCHandleType.Pinned);
try
{
result = nvjpegEncodeRetrieveBitstream(
nvjpegHandle,
st.encodeState,
handle.AddrOfPinnedObject(),
ref jpegSize,
st.cudaStream);
if (result != 0)
throw new Exception("Failed to retrieve JPEG bitstream");
}
catch (Exception e) { }
finally
{
handle.Free();
}
return buf;
}
private void RegisterNV12TexturesForCUDA()
{
uint flags = 0x0; // No special flags; can use cudaGraphicsRegisterFlagsNone = 0
// Register Y plane
int resY = CudaInterop.cudaGraphicsD3D11RegisterResource(out cuY, yPlaneTex.NativePointer, flags);
if (resY != 0)
throw new Exception($"Failed to register Y plane with CUDA: {resY}");
// Register UV plane
int resUV = CudaInterop.cudaGraphicsD3D11RegisterResource(out cuUV, uvPlaneTex.NativePointer, flags);
if (resUV != 0)
throw new Exception($"Failed to register UV plane with CUDA: {resUV}");
}
private (IntPtr yPtr, IntPtr uvPtr) MapNV12_GPU()
{
IntPtr[] arr = { cuY, cuUV };
int mapRes = CudaInterop.cudaGraphicsMapResources((uint)arr.Length, arr, _cudaStream);
if (mapRes != 0)
throw new Exception($"Failed to map NV12 textures: {mapRes}");
IntPtr yCudaPtr, uvCudaPtr;
UIntPtr yCudaPtrSize, uvCudaPtrSize;
int err = CudaInterop.cudaGraphicsResourceGetMappedPointer(out yCudaPtr, out yCudaPtrSize, cuY);
if (err != 0)
throw new Exception($"cudaGraphicsResourceGetMappedPointer failed: {err}");
err = CudaInterop.cudaGraphicsResourceGetMappedPointer(out uvCudaPtr, out uvCudaPtrSize, cuUV);
if (err != 0)
throw new Exception($"cudaGraphicsResourceGetMappedPointer failed: {err}");
return (yCudaPtr, uvCudaPtr);
}
private (IntPtr yArray, IntPtr uvArray) MapNV12_GPU_SubResource(int w, int h)
{
IntPtr[] resources = { cuY, cuUV };
// Map both resources for CUDA access
int mapRes = CudaInterop.cudaGraphicsMapResources((uint)resources.Length, resources, _cudaStream);
if (mapRes != 0)
throw new Exception($"Failed to map NV12 textures: {mapRes}");
// Get the mapped CUDA arrays for each texture
IntPtr yCudaArray, uvCudaArray;
int err;
err = CudaInterop.cudaGraphicsSubResourceGetMappedArray(out yCudaArray, cuY, 0, 0);
if (err != 0)
throw new Exception($"Failed to get mapped array for Y plane: {err}");
err = CudaInterop.cudaGraphicsSubResourceGetMappedArray(out uvCudaArray, cuUV, 0, 0);
if (err != 0)
throw new Exception($"Failed to get mapped array for UV plane: {err}");
// Assume you have mapped arrays yArray and uvArray (cudaArray_t)
IntPtr d_yLinear = IntPtr.Zero, d_uvLinear = IntPtr.Zero;
int cudaMemcpyDeviceToDevice = 3;
cudaMalloc(out d_yLinear, (UIntPtr)(w * h)); // Y plane
cudaMalloc(out d_uvLinear, (UIntPtr)(w * h / 2)); // UV plane
//// Copy Y plane from cudaArray_t to linear device memory
//err = CudaInterop.cudaMemcpy2DFromArray(
// d_yLinear, (UIntPtr)(w),
// yCudaArray, UIntPtr.Zero, UIntPtr.Zero,
// (UIntPtr)w, (UIntPtr)h,
// cudaMemcpyDeviceToDevice
//);
UIntPtr yPitch = (UIntPtr)w; // pitch = width bytes, 1 byte per pixel
UIntPtr yWidthBytes = (UIntPtr)w;
UIntPtr yHeight = (UIntPtr)h;
err = CudaInterop.cudaMemcpy2DFromArray(
d_yLinear,
yPitch,
yCudaArray,
UIntPtr.Zero,
UIntPtr.Zero,
yWidthBytes,
yHeight,
cudaMemcpyDeviceToDevice);
if (err != 0)
{
throw new Exception($"Failed to copy from cudaArray to device buffer for Y plane: {err}");
}
//// Copy UV plane
//err = CudaInterop.cudaMemcpy2DFromArray(
// d_uvLinear, (UIntPtr)(w),
// uvCudaArray, UIntPtr.Zero, UIntPtr.Zero,
// (UIntPtr)w, (UIntPtr)(h / 2),
// cudaMemcpyDeviceToDevice
//);
UIntPtr uvPitch = (UIntPtr)(w); // NV12 UV pitch = same as Y plane width in bytes
UIntPtr uvWidthBytes = (UIntPtr)(w); // width bytes for UV plane because 2 bytes per pixel and width is halved
UIntPtr uvHeight = (UIntPtr)(h / 2);
err = CudaInterop.cudaMemcpy2DFromArray(
d_uvLinear,
uvPitch,
uvCudaArray,
UIntPtr.Zero,
UIntPtr.Zero,
uvWidthBytes,
uvHeight,
cudaMemcpyDeviceToDevice);
if (err != 0)
{
throw new Exception($"Failed to copy from cudaArray to device buffer for UV plane: {err}");
}
// Unmap resources immediately
CudaInterop.cudaGraphicsUnmapResources(resources.Length, resources, _cudaStream);
return (d_yLinear, d_uvLinear);
}
