Train a PyTorch Model on a GPU
Run your training script on a GPU in the cloud with one line of code
Introduction
GPUs can provide impressive performance boosts for certain workflows like training ML models, computer vision, analytics, and more. However, GPU hardware can also be difficult to access and set up properly. Here we'll show how to run a Python script on a GPU-enabled cloud machine with one line of code from your laptop.
Training this PyTorch neural network on a cloud GPU takes about 14.1 minutes and costs around $0.12. You can run it right now. You'll need the following packages:
pip install torch torchvision coiled
Full code
Copy and paste the following code into a file called train.py. If you're new to Coiled, this will run for free on our account.
# train.py
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms

transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2470, 0.2435, 0.2616)),
    ]
)


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 2_500, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(2_500, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


# Select what hardware to use
if torch.cuda.is_available():
    device = torch.device("cuda")  # NVIDIA GPU
elif torch.backends.mps.is_available():
    device = torch.device("mps")  # Apple silicon GPU
else:
    device = torch.device("cpu")  # CPU
print(f"{device = }")

model = Net()
model = model.to(device)

# Use built-in PyTorch dataloaders for simplicity.
# In practice, most users load training data from cloud storage.
trainset = torchvision.datasets.CIFAR10(
    root="./data",
    train=True,
    download=True,
    transform=transform,
)
trainloader = torch.utils.data.DataLoader(
    trainset,
    batch_size=400,
    shuffle=True,
)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

# Train model on 10 passes over the data
for epoch in range(10):
    print(f"Epoch {epoch}")
    for i, data in enumerate(trainloader, 0):
        inputs = data[0].to(device)
        labels = data[1].to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

torch.save(model, "model.pt")  # Save to file on cloud VM
Then run the following command to train the model on a GPU:
coiled batch run --gpu python train.py
After you've run it, we'll dig into what actually happened there, section by section.
The Dataset
We're using the CIFAR10 dataset, a common benchmark dataset in computer vision. It contains 60,000 32x32 color images in 10 different classes. The dataset is automatically downloaded when you run the script.
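If you're curious what the data looks like, you can pull one batch out of the dataloader and check shapes and class names. This is an optional sanity check, not part of train.py; it assumes the trainset and trainloader defined in the full script above:

# Optional sanity check, reusing trainset / trainloader from train.py
images, labels = next(iter(trainloader))
print(images.shape)      # torch.Size([400, 3, 32, 32]) -- 400 RGB images, 32x32 pixels each
print(labels.shape)      # torch.Size([400]) -- one class index (0-9) per image
print(trainset.classes)  # ['airplane', 'automobile', 'bird', ..., 'truck']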
Hardware Selection
PyTorch makes it straightforward to use different kinds of accelerated hardware. Our code automatically selects the best available hardware:
if torch.cuda.is_available():
    device = torch.device("cuda")  # NVIDIA GPU
elif torch.backends.mps.is_available():
    device = torch.device("mps")  # Apple silicon GPU
else:
    device = torch.device("cpu")  # CPU
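On NVIDIA hardware you can also ask PyTorch which GPU it found. This is a small optional check, not part of train.py:

import torch

if torch.cuda.is_available():
    # On the g4dn.xlarge VM used in this example, this should report a Tesla T4
    print(torch.cuda.get_device_name(0))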
Running on the Cloud
To run this on a cloud GPU, save the code as train.py and run:
coiled batch run --gpu python train.py
This command handles:
- Provisioning a cloud VM with GPU hardware (g4dn.xlarge with Tesla T4)
- Setting up NVIDIA drivers and CUDA runtime
- Installing your local packages on the cloud VM
- Installing the CUDA-compiled version of PyTorch
- Running your script
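Once the job starts on the cloud VM, the hardware-selection print near the top of train.py should confirm that the NVIDIA GPU was picked up, with output along the lines of:

device = device(type='cuda')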
Performance Comparison
Here's how the training time compares across different hardware:
- MacBook Pro CPU: ~6.8 hours
- Apple M1 GPU: ~1.4 hours (4.9x speedup)
- Cloud NVIDIA T4: ~14.1 minutes (29x speedup)
Next Steps
Here are some ways you could extend this example:
- Experiment with different neural network architectures to improve model accuracy
- Use a larger or more complex dataset like ImageNet
- Try different GPU instance types with the --vm-type flag to compare performance and cost tradeoffs (see the example after this list)
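For instance, a command along these lines requests a specific instance type explicitly. Here g4dn.xlarge is the same VM used above; check coiled batch run --help for other options and for how --vm-type interacts with --gpu:

coiled batch run --gpu --vm-type g4dn.xlarge python train.py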
Get started
Know Python? Come use the cloud. Your first 10,000 CPU-hours per month are on us.
$ pip install coiled
$ coiled quickstart
Grant cloud access? (Y/n): Y
... Configuring ...
You're ready to go. 🎉