Batch processing using NVIDIA NIM | Docker | Self-hosted

To verify that the local GPUs are in good shape, I ran the following parallel-processing script, and it works as expected. Given that, the inactive gpu:2 shown above when running the container with --gpus all looks like a NIM bug to me.

import torch
from concurrent.futures import ThreadPoolExecutor
from torchvision.models import efficientnet_v2_l

def process_sample_on_gpu(gpu_id, sample):
    # Pin this worker thread to one GPU, then move the model and data onto it.
    torch.cuda.set_device(gpu_id)
    model = get_heavy_model()
    model = model.to(f"cuda:{gpu_id}")
    model.eval()
    sample = sample.to(f"cuda:{gpu_id}")

    with torch.no_grad():
        output = model(sample)

    print(f"Processed sample on GPU {gpu_id} with output shape: {output.shape}")
    return output

def get_heavy_model():
    # Randomly initialized weights are enough for a load test;
    # weights=None replaces the deprecated pretrained=False argument.
    model = efficientnet_v2_l(weights=None)
    return model

def generate_large_sample(batch_size=20, channels=3, height=1024 * 2, width=1024 * 2):
    # A deliberately large batch (20 x 3 x 2048 x 2048) to put real load on each GPU.
    return torch.randn(batch_size, channels, height, width)

samples = [generate_large_sample() for _ in range(3)]
gpu_ids = [0, 1, 2]

# One worker thread per GPU; threads suffice here because the heavy work runs
# in CUDA kernels, which release the GIL.
with ThreadPoolExecutor(max_workers=len(gpu_ids)) as executor:
    futures = [
        executor.submit(process_sample_on_gpu, gpu_id, sample)
        for gpu_id, sample in zip(gpu_ids, samples)
    ]
    results = [future.result() for future in futures]

print("All samples processed.")
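As an additional sanity check (a minimal sketch, separate from the script above), enumerating the devices PyTorch can see, run both on the host and inside the NIM container, helps separate a device-visibility problem from a NIM scheduling one:

import torch

# List every CUDA device visible to this process; with --gpus all,
# all three GPUs should appear both on the host and in the container.
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Visible devices: {torch.cuda.device_count()}")
for i in range(torch.cuda.device_count()):
    props = torch.cuda.get_device_properties(i)
    print(f"cuda:{i} -> {props.name}, {props.total_memory / 1024**3:.1f} GiB")

If all three devices show up here (and in nvidia-smi) inside the container but gpu:2 still sits idle under NIM, that points at the NIM side rather than the driver or container runtime.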