In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

import torchvision.models as models
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import time

def main():
    mode = 'test'
    model = models.resnet50()
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
    N = 1280
    dataset = datasets.FakeData(size=N, transform=transforms.ToTensor())
    if mode=='test': # switch to evaluate mode
        model.eval()
    model.to('cuda')
    for num_workers in [1, 2, 4, 8]: # 4 < 2 for test
        for batch_size in [1, 2, 4, 8, 16, 32]:
            loader = DataLoader(dataset, num_workers=num_workers, batch_size=batch_size, pin_memory=True)
            if mode=='test':
                for i, (data, target) in enumerate(loader):
                    if i==1:
                        tm = time.time()
                    data = data.to('cuda', non_blocking=True)
                    output = model(data)
            else: # mode=='train':
                for i, (data, target) in enumerate(loader):
                    if i==1:
                        tm = time.time()
                    data = data.to('cuda', non_blocking=True)
                    target = target.to('cuda', non_blocking=True).long()
                    optimizer.zero_grad()
                    output = model(data)
                    loss = criterion(output, target)
                    loss.backward()
                    optimizer.step()
            tm = time.time() - tm
            print('Mode=%s: NumWorkers=%2d  BatchSize=%2d  Time=%6.3fs  Imgs/s=%6.2f' % (mode, num_workers, batch_size, tm, N/tm))
            torch.cuda.empty_cache() # doesn't seem to be working...

if __name__ == '__main__':
    main()

Mode=test: NumWorkers= 1  BatchSize= 1  Time=33.559s  Imgs/s= 38.14
Mode=test: NumWorkers= 1  BatchSize= 2  Time=16.639s  Imgs/s= 76.93
Mode=test: NumWorkers= 1  BatchSize= 4  Time= 8.817s  Imgs/s=145.17
Mode=test: NumWorkers= 1  BatchSize= 8  Time= 8.802s  Imgs/s=145.41
Mode=test: NumWorkers= 1  BatchSize=16  Time= 9.094s  Imgs/s=140.76
Mode=test: NumWorkers= 1  BatchSize=32  Time= 8.247s  Imgs/s=155.21
Mode=test: NumWorkers= 2  BatchSize= 1  Time=34.151s  Imgs/s= 37.48
Mode=test: NumWorkers= 2  BatchSize= 2  Time=16.366s  Imgs/s= 78.21
Mode=test: NumWorkers= 2  BatchSize= 4  Time= 7.701s  Imgs/s=166.20
Mode=test: NumWorkers= 2  BatchSize= 8  Time= 3.888s  Imgs/s=329.25
Mode=test: NumWorkers= 2  BatchSize=16  Time= 3.824s  Imgs/s=334.75
Mode=test: NumWorkers= 2  BatchSize=32  Time= 3.706s  Imgs/s=345.38
Mode=test: NumWorkers= 4  BatchSize= 1  Time=34.202s  Imgs/s= 37.43
Mode=test: NumWorkers= 4  BatchSize= 2  Time=16.350s  Imgs/s= 78.29
Mode=test: NumWorkers= 4  BatchSize= 4  Time= 7.

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

import torchvision.models as models
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import time

def main():
    mode = 'test'
    model = models.resnet50()
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
    N = 1280
    dataset = datasets.FakeData(size=N, transform=transforms.ToTensor())
    if mode=='test': # switch to evaluate mode
        model.eval()
    model.to('cuda')
    for num_workers in [1, 2, 4, 8, 16]: # 4 < 2 for test
        for batch_size in [40]:
            loader = DataLoader(dataset, num_workers=num_workers, batch_size=batch_size, pin_memory=True)
            if mode=='test':
                for i, (data, target) in enumerate(loader):
                    if i==1:
                        tm = time.time()
                    data = data.to('cuda', non_blocking=True)
                    output = model(data)
            else: # mode=='train':
                for i, (data, target) in enumerate(loader):
                    if i==1:
                        tm = time.time()
                    data = data.to('cuda', non_blocking=True)
                    target = target.to('cuda', non_blocking=True).long()
                    optimizer.zero_grad()
                    output = model(data)
                    loss = criterion(output, target)
                    loss.backward()
                    optimizer.step()
            tm = time.time() - tm
            print('Mode=%s: NumWorkers=%2d  BatchSize=%2d  Time=%6.3fs  Imgs/s=%6.2f' % (mode, num_workers, batch_size, tm, N/tm))
            torch.cuda.empty_cache() # doesn't seem to be working...

if __name__ == '__main__':
    main()

Mode=test: NumWorkers= 1  BatchSize=40  Time= 7.026s  Imgs/s=182.17
Mode=test: NumWorkers= 2  BatchSize=40  Time= 3.407s  Imgs/s=375.71
Mode=test: NumWorkers= 4  BatchSize=40  Time= 1.752s  Imgs/s=730.46
Mode=test: NumWorkers= 8  BatchSize=40  Time= 1.323s  Imgs/s=967.16
Mode=test: NumWorkers=16  BatchSize=40  Time= 1.419s  Imgs/s=901.91


# GPU Comparison

## CUDA train

In [9]:
torch.cuda.get_device_name(0)

'GeForce RTX 2080 Ti'

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

import torchvision.models as models
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import time

def main():
    mode = 'train'
    model = models.resnet50()
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
    N = 1280
    dataset = datasets.FakeData(size=N, transform=transforms.ToTensor())
    if mode=='test': # switch to evaluate mode
        model.eval()
    model.to('cuda')
    for _ in range (10):
        num_workers = 8
        batch_size = 40
        loader = DataLoader(dataset, num_workers=num_workers, batch_size=batch_size, pin_memory=True)
        if mode=='test':
            for i, (data, target) in enumerate(loader):
                if i==1:
                    tm = time.time()
                data = data.to('cuda', non_blocking=True)
                output = model(data)
        else: # mode=='train':
            for i, (data, target) in enumerate(loader):
                if i==1:
                    tm = time.time()
                data = data.to('cuda', non_blocking=True)
                target = target.to('cuda', non_blocking=True).long()
                optimizer.zero_grad()
                output = model(data)
                loss = criterion(output, target)
                loss.backward()
                optimizer.step()
        tm = time.time() - tm
        print('Mode=%s: NumWorkers=%2d  BatchSize=%2d  Time=%6.3fs  Imgs/s=%6.2f' % (mode, num_workers, batch_size, tm, N/tm))
        torch.cuda.empty_cache() # doesn't seem to be working...

if __name__ == '__main__':
    main()

Mode=train: NumWorkers= 8  BatchSize=40  Time= 5.084s  Imgs/s=251.75
Mode=train: NumWorkers= 8  BatchSize=40  Time= 5.076s  Imgs/s=252.17
Mode=train: NumWorkers= 8  BatchSize=40  Time= 5.079s  Imgs/s=252.02
Mode=train: NumWorkers= 8  BatchSize=40  Time= 5.437s  Imgs/s=235.41
Mode=train: NumWorkers= 8  BatchSize=40  Time= 5.157s  Imgs/s=248.20
Mode=train: NumWorkers= 8  BatchSize=40  Time= 5.139s  Imgs/s=249.09
Mode=train: NumWorkers= 8  BatchSize=40  Time= 5.191s  Imgs/s=246.56
Mode=train: NumWorkers= 8  BatchSize=40  Time= 5.304s  Imgs/s=241.34
Mode=train: NumWorkers= 8  BatchSize=40  Time= 5.275s  Imgs/s=242.67
Mode=train: NumWorkers= 8  BatchSize=40  Time= 5.202s  Imgs/s=246.06


## CUDA test

In [4]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

import torchvision.models as models
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import time

def main():
    mode = 'test'
    model = models.resnet50()
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
    N = 1280
    dataset = datasets.FakeData(size=N, transform=transforms.ToTensor())
    if mode=='test': # switch to evaluate mode
        model.eval()
    model.to('cuda')
    for _ in range (10):
        num_workers = 8
        batch_size = 40
        loader = DataLoader(dataset, num_workers=num_workers, batch_size=batch_size, pin_memory=True)
        if mode=='test':
            for i, (data, target) in enumerate(loader):
                if i==1:
                    tm = time.time()
                data = data.to('cuda', non_blocking=True)
                output = model(data)
        else: # mode=='train':
            for i, (data, target) in enumerate(loader):
                if i==1:
                    tm = time.time()
                data = data.to('cuda', non_blocking=True)
                target = target.to('cuda', non_blocking=True).long()
                optimizer.zero_grad()
                output = model(data)
                loss = criterion(output, target)
                loss.backward()
                optimizer.step()
        tm = time.time() - tm
        print('Mode=%s: NumWorkers=%2d  BatchSize=%2d  Time=%6.3fs  Imgs/s=%6.2f' % (mode, num_workers, batch_size, tm, N/tm))
        torch.cuda.empty_cache() # doesn't seem to be working...

if __name__ == '__main__':
    main()

Mode=test: NumWorkers= 8  BatchSize=40  Time= 1.418s  Imgs/s=902.44
Mode=test: NumWorkers= 8  BatchSize=40  Time= 1.315s  Imgs/s=973.75
Mode=test: NumWorkers= 8  BatchSize=40  Time= 1.348s  Imgs/s=949.28
Mode=test: NumWorkers= 8  BatchSize=40  Time= 1.363s  Imgs/s=938.86
Mode=test: NumWorkers= 8  BatchSize=40  Time= 1.353s  Imgs/s=945.78
Mode=test: NumWorkers= 8  BatchSize=40  Time= 1.380s  Imgs/s=927.33
Mode=test: NumWorkers= 8  BatchSize=40  Time= 1.365s  Imgs/s=937.47
Mode=test: NumWorkers= 8  BatchSize=40  Time= 1.390s  Imgs/s=920.68
Mode=test: NumWorkers= 8  BatchSize=40  Time= 1.354s  Imgs/s=945.08
Mode=test: NumWorkers= 8  BatchSize=40  Time= 1.396s  Imgs/s=916.73


## CPU train

In [5]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

import torchvision.models as models
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import time

def main():
    mode = 'train'
    model = models.resnet50()
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
    N = 1280
    dataset = datasets.FakeData(size=N, transform=transforms.ToTensor())
    if mode=='test': # switch to evaluate mode
        model.eval()
    model.to('cpu')
    for _ in range (1):
        num_workers = 8
        batch_size = 10
        loader = DataLoader(dataset, num_workers=num_workers, batch_size=batch_size, pin_memory=True)
        if mode=='test':
            for i, (data, target) in enumerate(loader):
                if i==1:
                    tm = time.time()
                data = data.to('cpu', non_blocking=True)
                output = model(data)
        else: # mode=='train':
            for i, (data, target) in enumerate(loader):
                if i==1:
                    tm = time.time()
                data = data.to('cpu', non_blocking=True)
                target = target.to('cpu', non_blocking=True).long()
                optimizer.zero_grad()
                output = model(data)
                loss = criterion(output, target)
                loss.backward()
                optimizer.step()
        tm = time.time() - tm
        print('Mode=%s: NumWorkers=%2d  BatchSize=%2d  Time=%6.3fs  Imgs/s=%6.2f' % (mode, num_workers, batch_size, tm, N/tm))
#         torch.cuda.empty_cache() # doesn't seem to be working...

if __name__ == '__main__':
    main()

Mode=train: NumWorkers= 8  BatchSize=10  Time=299.952s  Imgs/s=  4.27


## CPU test

In [6]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

import torchvision.models as models
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import time

def main():
    mode = 'test'
    model = models.resnet50()
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
    N = 1280
    dataset = datasets.FakeData(size=N, transform=transforms.ToTensor())
    if mode=='test': # switch to evaluate mode
        model.eval()
    model.to('cpu')
    for _ in range (1):
        num_workers = 8
        batch_size = 10
        loader = DataLoader(dataset, num_workers=num_workers, batch_size=batch_size, pin_memory=True)
        if mode=='test':
            for i, (data, target) in enumerate(loader):
                if i==1:
                    tm = time.time()
                data = data.to('cpu', non_blocking=True)
                output = model(data)
        else: # mode=='train':
            for i, (data, target) in enumerate(loader):
                if i==1:
                    tm = time.time()
                data = data.to('cpu', non_blocking=True)
                target = target.to('cpu', non_blocking=True).long()
                optimizer.zero_grad()
                output = model(data)
                loss = criterion(output, target)
                loss.backward()
                optimizer.step()
        tm = time.time() - tm
        print('Mode=%s: NumWorkers=%2d  BatchSize=%2d  Time=%6.3fs  Imgs/s=%6.2f' % (mode, num_workers, batch_size, tm, N/tm))
#         torch.cuda.empty_cache() # doesn't seem to be working...

if __name__ == '__main__':
    main()

Mode=test: NumWorkers= 8  BatchSize=10  Time=103.033s  Imgs/s= 12.42


 ## Comparison

In [8]:
image_increase = ((960/11.9)-1)
time_increase = ((107.520/1.333)-1)
print(f'Image difference: {image_increase:.2%}')
print(f'Time difference: {time_increase:.2%}')

Image difference: 7967.23%
Time difference: 7966.02%
