In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

import torchvision.models as models
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import time

def main():
    mode = 'test'
    model = models.resnet50()
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
    N = 1280
    dataset = datasets.FakeData(size=N, transform=transforms.ToTensor())
    if mode=='test': # switch to evaluate mode
        model.eval()
    model.to('cuda')
    for num_workers in [1, 2, 4, 8]: # 4 < 2 for test
        for batch_size in [1, 2, 4, 8, 16, 32]:
            loader = DataLoader(dataset, num_workers=num_workers, batch_size=batch_size, pin_memory=True)
            if mode=='test':
                for i, (data, target) in enumerate(loader):
                    if i==1:
                        tm = time.time()
                    data = data.to('cuda', non_blocking=True)
                    output = model(data)
            else: # mode=='train':
                for i, (data, target) in enumerate(loader):
                    if i==1:
                        tm = time.time()
                    data = data.to('cuda', non_blocking=True)
                    target = target.to('cuda', non_blocking=True).long()
                    optimizer.zero_grad()
                    output = model(data)
                    loss = criterion(output, target)
                    loss.backward()
                    optimizer.step()
            tm = time.time() - tm
            print('Mode=%s: NumWorkers=%2d  BatchSize=%2d  Time=%6.3fs  Imgs/s=%6.2f' % (mode, num_workers, batch_size, tm, N/tm))
            torch.cuda.empty_cache() # doesn't seem to be working...

if __name__ == '__main__':
    main()

Mode=test: NumWorkers= 1  BatchSize= 1  Time=33.559s  Imgs/s= 38.14
Mode=test: NumWorkers= 1  BatchSize= 2  Time=16.639s  Imgs/s= 76.93
Mode=test: NumWorkers= 1  BatchSize= 4  Time= 8.817s  Imgs/s=145.17
Mode=test: NumWorkers= 1  BatchSize= 8  Time= 8.802s  Imgs/s=145.41
Mode=test: NumWorkers= 1  BatchSize=16  Time= 9.094s  Imgs/s=140.76
Mode=test: NumWorkers= 1  BatchSize=32  Time= 8.247s  Imgs/s=155.21
Mode=test: NumWorkers= 2  BatchSize= 1  Time=34.151s  Imgs/s= 37.48
Mode=test: NumWorkers= 2  BatchSize= 2  Time=16.366s  Imgs/s= 78.21
Mode=test: NumWorkers= 2  BatchSize= 4  Time= 7.701s  Imgs/s=166.20
Mode=test: NumWorkers= 2  BatchSize= 8  Time= 3.888s  Imgs/s=329.25
Mode=test: NumWorkers= 2  BatchSize=16  Time= 3.824s  Imgs/s=334.75
Mode=test: NumWorkers= 2  BatchSize=32  Time= 3.706s  Imgs/s=345.38
Mode=test: NumWorkers= 4  BatchSize= 1  Time=34.202s  Imgs/s= 37.43
Mode=test: NumWorkers= 4  BatchSize= 2  Time=16.350s  Imgs/s= 78.29
Mode=test: NumWorkers= 4  BatchSize= 4  Time= 7.

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

import torchvision.models as models
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import time

def main():
    mode = 'test'
    model = models.resnet50()
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
    N = 1280
    dataset = datasets.FakeData(size=N, transform=transforms.ToTensor())
    if mode=='test': # switch to evaluate mode
        model.eval()
    model.to('cuda')
    for num_workers in [1, 2, 4, 8, 16]: # 4 < 2 for test
        for batch_size in [40]:
            loader = DataLoader(dataset, num_workers=num_workers, batch_size=batch_size, pin_memory=True)
            if mode=='test':
                for i, (data, target) in enumerate(loader):
                    if i==1:
                        tm = time.time()
                    data = data.to('cuda', non_blocking=True)
                    output = model(data)
            else: # mode=='train':
                for i, (data, target) in enumerate(loader):
                    if i==1:
                        tm = time.time()
                    data = data.to('cuda', non_blocking=True)
                    target = target.to('cuda', non_blocking=True).long()
                    optimizer.zero_grad()
                    output = model(data)
                    loss = criterion(output, target)
                    loss.backward()
                    optimizer.step()
            tm = time.time() - tm
            print('Mode=%s: NumWorkers=%2d  BatchSize=%2d  Time=%6.3fs  Imgs/s=%6.2f' % (mode, num_workers, batch_size, tm, N/tm))
            torch.cuda.empty_cache() # doesn't seem to be working...

if __name__ == '__main__':
    main()

Mode=test: NumWorkers= 1  BatchSize=40  Time= 7.026s  Imgs/s=182.17
Mode=test: NumWorkers= 2  BatchSize=40  Time= 3.407s  Imgs/s=375.71
Mode=test: NumWorkers= 4  BatchSize=40  Time= 1.752s  Imgs/s=730.46
Mode=test: NumWorkers= 8  BatchSize=40  Time= 1.323s  Imgs/s=967.16
Mode=test: NumWorkers=16  BatchSize=40  Time= 1.419s  Imgs/s=901.91


In [3]:
torch.cuda.empty_cache()

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

import torchvision.models as models
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import time

def main():
    mode = 'test'
    model = models.resnet50()
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
    N = 1280
    dataset = datasets.FakeData(size=N, transform=transforms.ToTensor())
    if mode=='test': # switch to evaluate mode
        model.eval()
    model.to('cuda')
    for _ in range (30):
        num_workers = 8
        batch_size = 40
        loader = DataLoader(dataset, num_workers=num_workers, batch_size=batch_size, pin_memory=True)
        if mode=='test':
            for i, (data, target) in enumerate(loader):
                if i==1:
                    tm = time.time()
                data = data.to('cuda', non_blocking=True)
                output = model(data)
        else: # mode=='train':
            for i, (data, target) in enumerate(loader):
                if i==1:
                    tm = time.time()
                data = data.to('cuda', non_blocking=True)
                target = target.to('cuda', non_blocking=True).long()
                optimizer.zero_grad()
                output = model(data)
                loss = criterion(output, target)
                loss.backward()
                optimizer.step()
        tm = time.time() - tm
        print('Mode=%s: NumWorkers=%2d  BatchSize=%2d  Time=%6.3fs  Imgs/s=%6.2f' % (mode, num_workers, batch_size, tm, N/tm))
        torch.cuda.empty_cache() # doesn't seem to be working...

if __name__ == '__main__':
    main()

Mode=test: NumWorkers= 8  BatchSize=40  Time= 1.509s  Imgs/s=848.26
Mode=test: NumWorkers= 8  BatchSize=40  Time= 1.310s  Imgs/s=976.73
Mode=test: NumWorkers= 8  BatchSize=40  Time= 1.348s  Imgs/s=949.28
Mode=test: NumWorkers= 8  BatchSize=40  Time= 1.324s  Imgs/s=966.43
Mode=test: NumWorkers= 8  BatchSize=40  Time= 1.348s  Imgs/s=949.28
Mode=test: NumWorkers= 8  BatchSize=40  Time= 1.362s  Imgs/s=939.55
Mode=test: NumWorkers= 8  BatchSize=40  Time= 1.415s  Imgs/s=904.46
Mode=test: NumWorkers= 8  BatchSize=40  Time= 1.314s  Imgs/s=973.77
Mode=test: NumWorkers= 8  BatchSize=40  Time= 1.445s  Imgs/s=885.73
Mode=test: NumWorkers= 8  BatchSize=40  Time= 1.417s  Imgs/s=903.18
Mode=test: NumWorkers= 8  BatchSize=40  Time= 1.415s  Imgs/s=904.46
Mode=test: NumWorkers= 8  BatchSize=40  Time= 1.432s  Imgs/s=893.75
Mode=test: NumWorkers= 8  BatchSize=40  Time= 1.553s  Imgs/s=824.29
Mode=test: NumWorkers= 8  BatchSize=40  Time= 1.328s  Imgs/s=963.53
Mode=test: NumWorkers= 8  BatchSize=40  Time= 1.