import time
from itertools import count

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.cuda.amp import autocast
from tqdm.auto import tqdm


def benchmark(model: nn.Module,                    # Model to run
              bs: int = 32,                        # Batch size
              n_batches: int = None,               # Number of batches to run. `n_seconds` must be None
              n_seconds: int = None,               # Number of seconds to run. `n_batches` must be None
              fp16: bool = False,                  # Use Automatic Mixed Precision
              size: int = 224,                     # Mock-train on this size "images"
              dev: torch.device = torch.device("cuda:0"),  # Device to run on
              ):
    """Mock-train the model on random noise input."""
    # There can be only one
    assert not n_batches or not n_seconds
    assert n_batches or n_seconds

    torch.backends.cudnn.benchmark = True
    assert torch.backends.cudnn.is_available()

    model.to(dev)
    optim = torch.optim.SGD(model.parameters(), lr=0.00001, weight_decay=0.00005, momentum=0)

    # Keep a CPU copy of the initial weights so every iteration starts from the same state
    state = {k: v.cpu() for k, v in model.state_dict().items()}

    # Random noise "images" and random labels, created once and kept on the GPU
    X = torch.randn((bs, 3, size, size), device=dev)
    y = torch.randint(0, 999, (bs,), device=dev)

    if n_batches:
        pbar = tqdm(total=n_batches, unit="Batch")
    else:
        pbar = tqdm(total=n_seconds,
                    bar_format="{l_bar}{bar}| {n:.1f}/{total} s [{elapsed}<{remaining} {postfix}]")

    start_time = last_time = 0
    for c in count():
        model.load_state_dict(state)
        with autocast(enabled=fp16):
            yhat = model(X)
            loss = F.cross_entropy(yhat, y)

        loss.backward()
        optim.step()
        tt = time.time()
        optim.zero_grad(set_to_none=True)

        if not start_time:
            # The first iteration is warm-up: start the clock only after it finishes
            last_time = start_time = tt
        else:
            if n_batches:
                pbar.update()
                # Note: c starts with 0, but we discard the first iteration
                if c == n_batches:
                    break
            else:
                iter_time = tt - last_time
                run_time = tt - start_time

                pbar.update(iter_time)
                if run_time >= n_seconds:
                    break
            last_time = tt

    pbar.close()
    # (elapsed seconds, number of measured samples)
    return ((time.time() - start_time), c*bs)
Benchmark
Deep Learning GPU benchmark
Run a standard PyTorch training loop on an image classifier model of your choice, with the specified batch size and in FP16 or FP32. The result is a measure of throughput: the number of training samples processed per second. Results can be synced to Weights & Biases; see the CLI for more.
Note: The data never leaves the GPU, and the throughput should be mostly independent of the rest of the system, at least for larger batch sizes.
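For instance, a single FP16 run at a larger batch size might look like the sketch below. The manual wandb calls and the project name are only illustrative of how a result could be synced by hand; the CLI mentioned above handles the Weights & Biases sync for you.

```python
import timm
import wandb

model = timm.create_model("resnet50", pretrained=False)

# Mixed-precision run, batch size 64, for ~30 seconds of measured time
elapsed, n_samples = benchmark(model, bs=64, fp16=True, n_seconds=30)

run = wandb.init(project="gpu-benchmark")   # hypothetical project name
wandb.log({"model": "resnet50", "bs": 64, "fp16": True,
           "throughput": n_samples / elapsed})
run.finish()
```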
benchmark
benchmark (model:torch.nn.modules.module.Module, bs:int=32, n_batches:int=None, n_seconds:int=None, fp16:bool=False, size:int=224, dev:torch.device=device(type='cuda', index=0))
Mock-train the model on random noise input.
 | Type | Default | Details
---|---|---|---
model | Module | | Model to run
bs | int | 32 | Batch size
n_batches | int | None | Number of batches to run. `n_seconds` must be None
n_seconds | int | None | Number of seconds to run. `n_batches` must be None
fp16 | bool | False | Use Automatic Mixed Precision
size | int | 224 | Mock-train on this size "images"
dev | device | cuda:0 | Device to run on
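The two stopping criteria are mutually exclusive, which the asserts at the top of benchmark enforce. A minimal sketch, using a throwaway stand-in model (any classifier that takes 3-channel images and emits 1000 logits works):

```python
import torch.nn as nn

# Tiny stand-in model, purely for illustration: (bs, 3, H, W) -> (bs, 1000) logits
tiny = nn.Sequential(nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(3, 1000))

benchmark(tiny, n_batches=100)   # stop after 100 measured batches
benchmark(tiny, n_seconds=30)    # stop after ~30 s of measured wall-clock time

# Setting both (or neither) fails the asserts:
# benchmark(tiny, n_batches=100, n_seconds=30)  -> AssertionError
# benchmark(tiny)                               -> AssertionError
```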
model = timm.create_model("vgg11", pretrained=False)
benchmark(model, n_seconds=10)
(10.038218975067139, 1888)
benchmark(model, n_batches=10)
(1.6976494789123535, 320)
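The returned tuple is (elapsed seconds, samples processed), so the throughput is simply their ratio. Continuing from the example above:

```python
elapsed, n_samples = benchmark(model, n_seconds=10)
print(f"{n_samples / elapsed:.0f} samples/s")   # ~188 samples/s for the FP32 run shown above
```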