gpu_testing/test_gpu.py at main · ahn-ml/gpu_testing · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import torch
import torchvision
from torch import nn
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader


# Define the MNIST classification network
class MNISTNet(nn.Module):
    def __init__(self):
        super(MNISTNet, self).__init__()
        self.fc1 = nn.Linear(28 * 28, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = x.view(-1, 28 * 28)
        x = self.fc1(x)
        x = self.fc2(x)
        return x


# Get the number of GPUs in the system
num_gpus = torch.cuda.device_count()

# Load the MNIST dataset
mnist = MNIST(
    '.', train=True, download=True,
    transform=torchvision.transforms.Compose([
        torchvision.transforms.ToTensor(),
    ])
)

# Create a data loader for the MNIST dataset
data_loader = DataLoader(mnist, batch_size=128, shuffle=True)

# Iterate over the GPUs and test each one
for i in range(num_gpus):
    try:
        # Test GPU $i by training an MNIST classification network
        device = torch.device(f"cuda:{i}")

        # Move the MNIST dataset and the model to GPU $i
        model = MNISTNet().to(device)
        data_loader = DataLoader(mnist, batch_size=128, shuffle=True, pin_memory=True)

        # Train the MNIST classification network
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
        for epoch in range(10):
            for data, target in data_loader:
                data, target = data.to(device), target.to(device)
                optimizer.zero_grad()
                output = model(data)
                loss = criterion(output, target)
                loss.backward()
                optimizer.step()

    except:
        # GPU $i is not working
        print(f"GPU {i} is not working")