Ready, Steady, Go!

Benchmark

  • Ready, Steady, Go!
  • Benchmark
  • CLI

Benchmark

Deep Learning GPU benchmark

Run a standard PyTorch training loop on an image classifier model of your choice with a specified batch size and FP16/FP32 precision. The result is a measure of throughput - the number of training samples per second. It can be synced to Weights & Biases. See more in CLI.

Note: The data never leaves the GPU, and the throughput should be mostly independent of the rest of the system, at least for larger batch sizes.

def benchmark(model: nn.Module, # Model to run
                bs: int =32,    # Batch size
                n_batches: int =None,  # Number of batches to run. `n_seconds` must be None
                n_seconds: int =None,  # Number of seconds to run. `n_batches` must be None
                fp16: int =False,      # Use Automatic Mixed Precision
                size: int=224,         # Mock-train on this size "images"
                dev: torch.device=torch.device("cuda:0"),): # Device to run on
    """Mock-train the model on random noise input.

    Runs forward/backward/SGD steps on one fixed random batch of shape
    `(bs, 3, size, size)` until either `n_batches` batches have been counted
    or at least `n_seconds` seconds have elapsed. The very first iteration is
    treated as warm-up: it is neither timed nor counted.

    Returns a tuple `(elapsed_seconds, n_samples)` where `n_samples` is the
    number of counted batches times `bs`, and `elapsed_seconds` spans from the
    end of the warm-up iteration to the end of the last counted iteration.

    Raises `ValueError` if not exactly one of `n_batches` / `n_seconds` is
    given or if the given one is negative, and `RuntimeError` if cuDNN is not
    available.
    """

    # There can be only one. Checked with explicit raises rather than `assert`
    # so that the validation is not stripped when running under `python -O`.
    if n_batches and n_seconds:
        raise ValueError("Specify either `n_batches` or `n_seconds`, not both")
    if not n_batches and not n_seconds:
        raise ValueError("Specify one of `n_batches` or `n_seconds`")
    if (n_batches or n_seconds) < 0:
        raise ValueError("`n_batches` / `n_seconds` must be positive")

    torch.backends.cudnn.benchmark = True
    if not torch.backends.cudnn.is_available():
        raise RuntimeError("cuDNN is not available")

    model.to(dev)
    optim = torch.optim.SGD(model.parameters(), lr=0.00001, weight_decay=0.00005, momentum=0)

    # CPU snapshot of the initial weights. It is loaded back into the model at
    # the start of every iteration, so each step starts from the same state
    # (note that this copies the weights from host memory on every iteration).
    state = {k: v.cpu() for k, v in model.state_dict().items()}

    # One fixed batch of noise "images" and random integer labels in [0, 999).
    X = torch.randn((bs, 3, size, size), device=dev)
    y = torch.randint(0, 999, (bs,), device=dev)

    # CUDA kernels are launched asynchronously, so the device has to be
    # synchronised before the wall clock is read; otherwise the timestamp may
    # be taken while the step is still executing on the GPU.
    sync_dev = torch.device(dev)
    on_cuda = sync_dev.type == "cuda"

    if n_batches:
        pbar = tqdm(total=n_batches, unit="Batch")
    else:
        pbar = tqdm(total=n_seconds,
            bar_format="{l_bar}{bar}| {n:.1f}/{total} s [{elapsed}<{remaining} {postfix}]")

    start_time = last_time = tt = None
    try:
        for c in count():

            model.load_state_dict(state)

            with autocast(enabled=fp16):
                yhat = model(X)
                loss = F.cross_entropy(yhat, y)

            loss.backward()
            optim.step()

            if on_cuda:
                torch.cuda.synchronize(sync_dev)
            tt = time.perf_counter()
            optim.zero_grad(set_to_none=True)

            if start_time is None:
                # Warm-up iteration: only start the clock, count nothing.
                last_time = start_time = tt
                continue

            if n_batches:
                pbar.update()
                # Note: c starts with 0, but we discard the first iteration.
                # `>=` (not `==`) guarantees termination for non-integer input.
                if c >= n_batches:
                    break
            else:
                pbar.update(tt - last_time)
                if tt - start_time >= n_seconds:
                    break
                last_time = tt
    finally:
        # Release the progress bar even if a step raises (e.g. out of memory).
        pbar.close()

    # Use the timestamp of the last counted step, so progress-bar teardown is
    # not included in the measured time.
    return ((tt - start_time), c*bs)

source

benchmark

 benchmark (model:torch.nn.modules.module.Module, bs:int=32,
            n_batches:int=None, n_seconds:int=None, fp16:int=False,
            size:int=224, dev:torch.device=device(type='cuda', index=0))

Mock-train the model on random noise input.

Type Default Details
model Module Model to run
bs int 32 Batch size
n_batches int None Number of batches to run. n_seconds must be None
n_seconds int None Number of seconds to run. n_batches must be None
fp16 int False Use Automatic Mixed Precision
size int 224 Mock-train on this size “images”
dev device cuda:0 Device to run on
model = timm.create_model("vgg11", pretrained=False)
benchmark(model, n_seconds=10)
(10.038218975067139, 1888)
benchmark(model, n_batches=10)
(1.6976494789123535, 320)