To make sure the local GPUs are in good shape, I ran the parallel-processing script below, and it works as expected. So the fact that running the Docker container with --gpus all still shows the inactive gpu:2 described above looks like a NIM bug to me.
import torch
from concurrent.futures import ThreadPoolExecutor
from torchvision.models import efficientnet_v2_l

def process_sample_on_gpu(gpu_id, sample):
    # Pin this thread to one GPU, run a forward pass, and report the output shape.
    torch.cuda.set_device(gpu_id)
    model = get_heavy_model()
    model = model.to(f"cuda:{gpu_id}")
    model.eval()
    sample = sample.to(f"cuda:{gpu_id}")
    with torch.no_grad():
        output = model(sample)
    print(f"Processed sample on GPU {gpu_id} with output shape: {output.shape}")
    return output

def get_heavy_model():
    # Large model with randomly initialized weights; pretrained weights are not needed for this check.
    model = efficientnet_v2_l(weights=None)
    return model

def generate_large_sample(batch_size=20, channels=3, height=1024 * 2, width=1024 * 2):
    # Deliberately large input batch so each GPU does non-trivial work.
    return torch.randn(batch_size, channels, height, width)

samples = [generate_large_sample() for _ in range(3)]
gpu_ids = [0, 1, 2]

# One thread per GPU; each thread processes its own sample on its own device.
with ThreadPoolExecutor(max_workers=len(gpu_ids)) as executor:
    futures = [
        executor.submit(process_sample_on_gpu, gpu_id, sample)
        for gpu_id, sample in zip(gpu_ids, samples)
    ]
    results = [future.result() for future in futures]

print("All samples processed.")