Experiment tracking is a core component of MLOps platforms, enabling full traceability and provenance of your data, code, and models. In this notebook demonstration, we will explore the features of our platform by creating two experiments. Each experiment consists of four runs, where a convolutional neural network is trained to classify MNIST digits with varying training hyperparameters. We will schedule the runs in the second experiment to run in parallel in a Kubernetes cluster in the cloud.
All experiments, runs, parameters, metrics, time-series metrics, and model artifacts are tracked and stored remotely where they can be retrieved and analyzed afterwards by you or other team members.
Overview
For simplicity, each experiment will use the same function for training, which is defined in this notebook in the cell below with the experiment.protocol decorator.
Notebook Outline
First, we create the experiment and define the function that will be executed when a Run is started in this experiment.
This function will be stored in object storage so that it can be used whenever a Run in this experiment is executed (e.g. on a remote machine)
The file will be stored in
s3://{guidepad_instance_name}/experiments/{experiment._id}/function_artifacts/{function_name}.py
Notice in the function train
below, we utilize methods on the Run class, including Run.log_time_series_metrics()
after each training epoch, Run.log_metrics()
after evaluation on the test set, and Run.log_model_artifact()
to upload the model to object storage.
Additionally, the train
function uses parameters passed into the params
argument, including params.batch_size
, params.lr
, params.dropout_rate
, and params.epochs
.
We also have access to params.run_id
and params.device
.
All of the parameters and metrics you use in your training function are customizable and can be tracked.
# Create the experiment for the local runs; the returned object provides the
# `protocol` decorator that is applied to the training function.
experiment_name = 'tracking-pytorch-local'
experiment = Tracker.create_experiment(name=experiment_name)
print(f'experiment id: {experiment._id}')
@experiment.protocol
def train(params):
    """Train a small CNN on MNIST for one run and record the results.

    Looks up the run identified by ``params.run_id``, trains for
    ``params.epochs`` epochs, logs train/validation metrics after every epoch
    as time-series metrics, logs the final test metrics, and uploads the saved
    ``state_dict`` as a model artifact of the run.

    Args:
        params: Run parameters. Reads ``run_id``, ``device``, ``batch_size``,
            ``lr``, ``dropout_rate`` and ``epochs``. The hyperparameters are
            converted with ``int``/``float`` because they may be supplied as
            strings.

    Returns:
        dict: ``{'message': 'Done training!'}``.
    """
    # Imports stay inside the function body: the function is stored and
    # executed on its own (possibly on another machine), so it presumably
    # cannot rely on notebook-level imports.
    import os
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    import torch.optim as optim
    from torch.utils.data import DataLoader
    from torchvision.datasets import MNIST
    from torchvision import transforms
    import guidepad
    guidepad.initialize()
    from guidepad_ml.tracking.types import Run

    run = Run.list_single({'_id': params.run_id})
    device = params.device
    batch_size = int(params.batch_size)

    # create dataloaders for train/validation/test
    transform = transforms.Compose([transforms.ToTensor()])
    train_dataset = MNIST(root='./data', train=True, download=True, transform=transform)
    test_dataset = MNIST(root='./data', train=False, download=True, transform=transform)
    train_proportion = 0.75
    train_size = int(train_proportion * len(train_dataset))
    val_size = len(train_dataset) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(
        train_dataset, [train_size, val_size]
    )
    train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
    # Evaluation loaders are not shuffled: the loss is averaged per batch, so
    # with a partial last batch a shuffled order would make the reported
    # validation loss vary slightly between passes over the same model.
    val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

    # define model architecture
    class CNN(nn.Module):
        """Two conv layers followed by two fully connected layers (10 logits)."""

        def __init__(self, dropout_rate=0.4):
            super().__init__()
            self.conv1 = nn.Conv2d(1, 16, kernel_size=3)
            self.conv2 = nn.Conv2d(16, 32, kernel_size=3)
            self.dropout = nn.Dropout(dropout_rate)
            # 800 = 32 channels * 5 * 5 for a 28x28 input after two conv+pool stages
            self.fc1 = nn.Linear(800, 128)
            self.fc2 = nn.Linear(128, 10)

        def forward(self, x):
            x = F.max_pool2d(F.relu(self.conv1(x)), 2)
            x = self.dropout(x)
            x = F.max_pool2d(F.relu(self.conv2(x)), 2)
            x = torch.flatten(x, 1)
            x = self.dropout(F.relu(self.fc1(x)))
            return self.fc2(x)

    model = CNN(dropout_rate=float(params.dropout_rate)).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=float(params.lr))

    def train_epoch(loader):
        """Run one optimisation pass; return (mean batch loss, accuracy)."""
        model.train()
        total_loss = 0
        correct = 0
        for data, target in loader:
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            correct += (output.argmax(dim=1) == target).sum().item()
        return total_loss / len(loader), correct / len(loader.dataset)

    def evaluate(loader):
        """Score the model without gradients; return (mean batch loss, accuracy)."""
        model.eval()
        total_loss = 0
        correct = 0
        with torch.no_grad():
            for data, target in loader:
                data, target = data.to(device), target.to(device)
                output = model(data)
                total_loss += criterion(output, target).item()
                correct += (output.argmax(dim=1) == target).sum().item()
        return total_loss / len(loader), correct / len(loader.dataset)

    # train model
    for epoch in range(int(params.epochs)):
        train_loss, train_accuracy = train_epoch(train_loader)
        val_loss, val_accuracy = evaluate(val_loader)
        metrics = {
            'validation accuracy': val_accuracy,
            'validation loss': val_loss,
            'train loss': train_loss,
            'train accuracy': train_accuracy
        }
        # log metrics after each epoch
        print(f'Epoch {epoch} Train Loss: {round(train_loss, 6)}, Train Accuracy: {round(train_accuracy, 4)}, Validation Loss: {round(val_loss, 6)}, Validation Accuracy: {round(val_accuracy, 4)}')
        run = run.log_time_series_metrics(metrics, step=epoch)

    # log final metrics
    test_loss, accuracy = evaluate(test_loader)
    print(f'Run {params.run_id} Test Loss: {round(test_loss, 6)}, Test Accuracy: {accuracy}')
    run = run.log_metrics({'test accuracy': accuracy, 'test loss': test_loss})

    filename = 'model.pth'
    model_dir = f'models/{params.run_id}'
    model_filepath = os.path.join(model_dir, filename)
    os.makedirs(model_dir, exist_ok=True)
    # Only the state dict is saved, so the CNN class is needed again to load the model
    torch.save(model.state_dict(), model_filepath)
    # If we wanted, we could only log the model if the test accuracy is above a certain threshold
    run = run.log_model_artifact(filepath=model_filepath)
    return {'message': 'Done training!'}
experiment id: f57e0678d4ba405f8951bffa39cd5a4d
writing training script to s3://<REMOVED BUCKET NAME>/experiments/f57e0678d4ba405f8951bffa39cd5a4d/function_artifacts/train.py
# Hyperparameter sets, one per run. Values are strings; the training function
# converts them with int()/float().
all_params = [
    {
        'lr': '0.001',
        'dropout_rate': '0.2',
        'batch_size': '256',
        'epochs': '8'
    },
    {
        'lr': '0.0003',
        'dropout_rate': '0.3',
        'batch_size': '256',
        'epochs': '16'
    },
    {
        'lr': '0.001',
        'dropout_rate': '0.0',
        'batch_size': '256',
        'epochs': '8'
    },
    {
        'lr': '0.001',
        'dropout_rate': '0.4',
        'batch_size': '256',
        'epochs': '8'
    }
]

# Create one run per hyperparameter set and execute them one after another.
for i, param in enumerate(all_params):
    run = experiment.create_run(
        params=[Param(key=k, value=v) for k, v in param.items()],
        name=f'pytorch-tracking-local-run-{i}'
    )
    # asynchronous=False: the call returns only after this run has finished
    run.start(asynchronous=False)
    print('\n-----------------------------------\n')
Epoch 0 Train Loss: 0.6609, Train Accuracy: 0.7988, Validation Loss: 0.195886, Validation Accuracy: 0.9399
Epoch 1 Train Loss: 0.178703, Train Accuracy: 0.9454, Validation Loss: 0.115134, Validation Accuracy: 0.9651
Epoch 2 Train Loss: 0.117643, Train Accuracy: 0.9637, Validation Loss: 0.085653, Validation Accuracy: 0.9747
Epoch 3 Train Loss: 0.09313, Train Accuracy: 0.9713, Validation Loss: 0.06971, Validation Accuracy: 0.9779
Epoch 4 Train Loss: 0.079642, Train Accuracy: 0.9755, Validation Loss: 0.062604, Validation Accuracy: 0.9811
Epoch 5 Train Loss: 0.068624, Train Accuracy: 0.9784, Validation Loss: 0.059304, Validation Accuracy: 0.9815
Epoch 6 Train Loss: 0.060573, Train Accuracy: 0.9808, Validation Loss: 0.055997, Validation Accuracy: 0.9823
Epoch 7 Train Loss: 0.054952, Train Accuracy: 0.9828, Validation Loss: 0.047025, Validation Accuracy: 0.9851
Run 2b7dfbd1b0c54ce0a2ab5105a1e9c895 Test Loss: 0.036447, Test Accuracy: 0.9873
creating model artifact for run with id 2b7dfbd1b0c54ce0a2ab5105a1e9c895
uploading artifact for run with id 2b7dfbd1b0c54ce0a2ab5105a1e9c895 to uri s3://<REMOVED BUCKET NAME>/experiments/f57e0678d4ba405f8951bffa39cd5a4d/runs/2b7dfbd1b0c54ce0a2ab5105a1e9c895/model_artifacts/model.pth
-----------------------------------
Epoch 0 Train Loss: 1.174378, Train Accuracy: 0.6664, Validation Loss: 0.397005, Validation Accuracy: 0.8889
Epoch 1 Train Loss: 0.388981, Train Accuracy: 0.8826, Validation Loss: 0.230925, Validation Accuracy: 0.9302
Epoch 2 Train Loss: 0.267694, Train Accuracy: 0.9186, Validation Loss: 0.167058, Validation Accuracy: 0.9521
Epoch 3 Train Loss: 0.208828, Train Accuracy: 0.9366, Validation Loss: 0.13145, Validation Accuracy: 0.9622
Epoch 4 Train Loss: 0.176249, Train Accuracy: 0.9469, Validation Loss: 0.110591, Validation Accuracy: 0.9675
Epoch 5 Train Loss: 0.154967, Train Accuracy: 0.9533, Validation Loss: 0.094471, Validation Accuracy: 0.9725
Epoch 6 Train Loss: 0.137394, Train Accuracy: 0.9576, Validation Loss: 0.087171, Validation Accuracy: 0.9733
Epoch 7 Train Loss: 0.123557, Train Accuracy: 0.9627, Validation Loss: 0.077809, Validation Accuracy: 0.9767
Epoch 8 Train Loss: 0.113044, Train Accuracy: 0.965, Validation Loss: 0.069306, Validation Accuracy: 0.9791
Epoch 9 Train Loss: 0.105859, Train Accuracy: 0.9673, Validation Loss: 0.065293, Validation Accuracy: 0.9799
Epoch 10 Train Loss: 0.099071, Train Accuracy: 0.9687, Validation Loss: 0.06079, Validation Accuracy: 0.9817
Epoch 11 Train Loss: 0.093181, Train Accuracy: 0.971, Validation Loss: 0.057294, Validation Accuracy: 0.9823
Epoch 12 Train Loss: 0.086656, Train Accuracy: 0.9735, Validation Loss: 0.05462, Validation Accuracy: 0.9835
Epoch 13 Train Loss: 0.083903, Train Accuracy: 0.9741, Validation Loss: 0.052157, Validation Accuracy: 0.9841
Epoch 14 Train Loss: 0.078155, Train Accuracy: 0.976, Validation Loss: 0.051512, Validation Accuracy: 0.9847
Epoch 15 Train Loss: 0.074646, Train Accuracy: 0.9776, Validation Loss: 0.0521, Validation Accuracy: 0.9845
Run a9485ed00f864d66897e98b987a20d5d Test Loss: 0.044426, Test Accuracy: 0.9856
creating model artifact for run with id a9485ed00f864d66897e98b987a20d5d
uploading artifact for run with id a9485ed00f864d66897e98b987a20d5d to uri s3://<REMOVED BUCKET NAME>/experiments/f57e0678d4ba405f8951bffa39cd5a4d/runs/a9485ed00f864d66897e98b987a20d5d/model_artifacts/model.pth
-----------------------------------
Epoch 0 Train Loss: 0.573786, Train Accuracy: 0.8376, Validation Loss: 0.171209, Validation Accuracy: 0.9493
Epoch 1 Train Loss: 0.132115, Train Accuracy: 0.9606, Validation Loss: 0.10278, Validation Accuracy: 0.9704
Epoch 2 Train Loss: 0.087892, Train Accuracy: 0.9721, Validation Loss: 0.085383, Validation Accuracy: 0.975
Epoch 3 Train Loss: 0.070585, Train Accuracy: 0.9783, Validation Loss: 0.079534, Validation Accuracy: 0.9775
Epoch 4 Train Loss: 0.059749, Train Accuracy: 0.9816, Validation Loss: 0.06319, Validation Accuracy: 0.9805
Epoch 5 Train Loss: 0.049725, Train Accuracy: 0.9847, Validation Loss: 0.059211, Validation Accuracy: 0.9814
Epoch 6 Train Loss: 0.043025, Train Accuracy: 0.9862, Validation Loss: 0.058358, Validation Accuracy: 0.982
Epoch 7 Train Loss: 0.038978, Train Accuracy: 0.9882, Validation Loss: 0.053679, Validation Accuracy: 0.9843
Run c25d984c17a9400cb9243ea848d52994 Test Loss: 0.041403, Test Accuracy: 0.9861
creating model artifact for run with id c25d984c17a9400cb9243ea848d52994
uploading artifact for run with id c25d984c17a9400cb9243ea848d52994 to uri s3://<REMOVED BUCKET NAME>/experiments/f57e0678d4ba405f8951bffa39cd5a4d/runs/c25d984c17a9400cb9243ea848d52994/model_artifacts/model.pth
-----------------------------------
Epoch 0 Train Loss: 0.744087, Train Accuracy: 0.7684, Validation Loss: 0.218015, Validation Accuracy: 0.9322
Epoch 1 Train Loss: 0.247632, Train Accuracy: 0.9245, Validation Loss: 0.125285, Validation Accuracy: 0.9594
Epoch 2 Train Loss: 0.177967, Train Accuracy: 0.9466, Validation Loss: 0.105135, Validation Accuracy: 0.9659
Epoch 3 Train Loss: 0.148905, Train Accuracy: 0.9554, Validation Loss: 0.082362, Validation Accuracy: 0.9729
Epoch 4 Train Loss: 0.132342, Train Accuracy: 0.9598, Validation Loss: 0.072726, Validation Accuracy: 0.9757
Epoch 5 Train Loss: 0.114922, Train Accuracy: 0.9658, Validation Loss: 0.069842, Validation Accuracy: 0.9774
Epoch 6 Train Loss: 0.107409, Train Accuracy: 0.9677, Validation Loss: 0.060053, Validation Accuracy: 0.98
Epoch 7 Train Loss: 0.096176, Train Accuracy: 0.9713, Validation Loss: 0.0548, Validation Accuracy: 0.9816
Run c8db866b8d0f49639055c22d49784f5b Test Loss: 0.044026, Test Accuracy: 0.9836
creating model artifact for run with id c8db866b8d0f49639055c22d49784f5b
uploading artifact for run with id c8db866b8d0f49639055c22d49784f5b to uri s3://<REMOVED BUCKET NAME>/experiments/f57e0678d4ba405f8951bffa39cd5a4d/runs/c8db866b8d0f49639055c22d49784f5b/model_artifacts/model.pth
-----------------------------------
# Look the experiment up by its name rather than reusing the object created earlier.
experiment = Experiment.list_single({'name': experiment_name})
print(experiment._id, experiment_name)
f57e0678d4ba405f8951bffa39cd5a4d tracking-pytorch-local
# Ask for the top two runs ranked by 'test accuracy' in descending order,
# then print each run's (key, value) metric pairs on its own line.
best_runs = experiment.search_runs(metric='test accuracy', ascending=False, n=2)
metric_lines = [
    str([(m.key, m.value) for m in run.metrics])
    for run in best_runs
]
print('\n'.join(metric_lines))
[('test accuracy', 0.9873), ('test loss', 0.03644732084794668)]
[('test accuracy', 0.9861), ('test loss', 0.04140268654300598)]
# print out the metrics for all the runs
for r in experiment.runs:
    # one line per run: id, name, its (key, value) metric pairs, owning experiment id
    print(r._id, r.name, [(m.key, m.value) for m in r.metrics], r.experiment_id)
2b7dfbd1b0c54ce0a2ab5105a1e9c895 pytorch-tracking-local-run-0 [('test accuracy', 0.9873), ('test loss', 0.03644732084794668)] f57e0678d4ba405f8951bffa39cd5a4d
a9485ed00f864d66897e98b987a20d5d pytorch-tracking-local-run-1 [('test accuracy', 0.9856), ('test loss', 0.044425528604188004)] f57e0678d4ba405f8951bffa39cd5a4d
c25d984c17a9400cb9243ea848d52994 pytorch-tracking-local-run-2 [('test accuracy', 0.9861), ('test loss', 0.04140268654300598)] f57e0678d4ba405f8951bffa39cd5a4d
c8db866b8d0f49639055c22d49784f5b pytorch-tracking-local-run-3 [('test accuracy', 0.9836), ('test loss', 0.04402594630373642)] f57e0678d4ba405f8951bffa39cd5a4d
# TODO: maybe change this to order by key, then step. Right now it's ordered by step, then key.
# Per-step (time-series) metrics of the first of the best runs, as a dataframe
# with step / key / value columns.
run_time_series_metrics_df = best_runs[0].get_time_series_dataframe()
print(run_time_series_metrics_df)
step key value
0 0 validation accuracy 0.939867
1 0 validation loss 0.195886
2 0 train loss 0.660900
3 0 train accuracy 0.798800
4 1 validation accuracy 0.965133
5 1 validation loss 0.115134
6 1 train loss 0.178703
7 1 train accuracy 0.945378
8 2 validation accuracy 0.974733
9 2 validation loss 0.085653
10 2 train loss 0.117643
11 2 train accuracy 0.963711
12 3 validation accuracy 0.977933
13 3 validation loss 0.069710
14 3 train loss 0.093130
15 3 train accuracy 0.971311
16 4 validation accuracy 0.981133
17 4 validation loss 0.062604
18 4 train loss 0.079642
19 4 train accuracy 0.975489
20 5 validation accuracy 0.981533
21 5 validation loss 0.059304
22 5 train loss 0.068624
23 5 train accuracy 0.978422
24 6 validation accuracy 0.982267
25 6 validation loss 0.055997
26 6 train loss 0.060573
27 6 train accuracy 0.980822
28 7 validation accuracy 0.985133
29 7 validation loss 0.047025
30 7 train loss 0.054952
31 7 train accuracy 0.982822
# First model artifact logged by the first of the best runs; its storage path is
# printed with an s3:// prefix for display.
best_model_artifact = best_runs[0].model_artifacts[0]
print('s3://' + best_model_artifact.storage_path)
s3://<REMOVED BUCKET NAME>/experiments/f57e0678d4ba405f8951bffa39cd5a4d/runs/2b7dfbd1b0c54ce0a2ab5105a1e9c895/model_artifacts/model.pth
Use the Artifact.open()
method to achieve this.
Examples
Run.model_artifacts[0].open()
Experiment.runs[i].model_artifacts[0].open()
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torchvision import transforms
import matplotlib.pyplot as plt
%matplotlib inline
# Need to define the model architecture again
# because the architecture is not saved with the model
class CNN(nn.Module):
    """Small convolutional classifier producing 10 logits per input image.

    Attribute names (conv1, conv2, fc1, fc2) must stay as they are: they are
    the keys of the state dict that gets loaded into this class.
    """

    def __init__(self, dropout_rate=0.4):
        """Build the layers; ``dropout_rate`` is the drop probability of the shared dropout."""
        super().__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3)
        self.dropout = nn.Dropout(dropout_rate)
        # 800 = 32 channels * 5 * 5 for a 28x28 input after two conv(3x3)+pool(2) stages
        self.fc1 = nn.Linear(800, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Return unnormalised class scores for a batch of single-channel images."""
        x = self.conv1(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x
# Rebuild the architecture, then load the trained weights from the artifact.
model = CNN()
with best_model_artifact.open() as f:
    # map_location='cpu' lets weights saved from a GPU run load on a CPU-only machine
    model.load_state_dict(torch.load(f, map_location='cpu'))
model.eval()

# only need one sample from the test set
transform = transforms.Compose([transforms.ToTensor()])
test_dataset = torchvision.datasets.MNIST(root='./data', train=False, download=True, transform=transform)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=1, shuffle=True)
data, target = next(iter(test_loader))

# Inference only: no gradients are needed
with torch.no_grad():
    output = model(data)
pred = output.argmax(dim=1, keepdim=True)
print(f'Prediction: {pred.item()}')
plt.imshow(data[0].squeeze(), cmap='gray')
plt.show()
Prediction: 4
# Experiment summary as a dataframe: one row per run with its run id,
# hyperparameters and logged metrics.
experiment_df = experiment.get_experiment_df()
print(experiment_df)
run_id lr dropout_rate batch_size epochs \
0 2b7dfbd1b0c54ce0a2ab5105a1e9c895 0.001 0.2 256 8
1 a9485ed00f864d66897e98b987a20d5d 0.0003 0.3 256 16
2 c25d984c17a9400cb9243ea848d52994 0.001 0.0 256 8
3 c8db866b8d0f49639055c22d49784f5b 0.001 0.4 256 8
test accuracy test loss
0 0.9873 0.036447
1 0.9856 0.044426
2 0.9861 0.041403
3 0.9836 0.044026
To execute runs remotely and in parallel, pass asynchronous=True
to the run.start() method
The code in the cell below is unchanged from earlier, except for the validation split (train_proportion is 0.9 instead of 0.75) and the experiment and run names:
tracking-pytorch-remote
pytorch-tracking-remote-run-{index}
# Create a separate experiment for the remote runs; the returned object provides
# the `protocol` decorator that is applied to the training function.
experiment_name = 'tracking-pytorch-remote'
experiment = Tracker.create_experiment(name=experiment_name)
print(f'experiment id: {experiment._id}')
@experiment.protocol
def train(params):
    """Train a small CNN on MNIST for one run and record the results.

    Looks up the run identified by ``params.run_id``, trains for
    ``params.epochs`` epochs, logs train/validation metrics after every epoch
    as time-series metrics, logs the final test metrics, and uploads the saved
    ``state_dict`` as a model artifact of the run.

    Args:
        params: Run parameters. Reads ``run_id``, ``device``, ``batch_size``,
            ``lr``, ``dropout_rate`` and ``epochs``. The hyperparameters are
            converted with ``int``/``float`` because they may be supplied as
            strings.

    Returns:
        dict: ``{'message': 'Done training!'}``.
    """
    # Imports stay inside the function body: the function is stored and
    # executed on its own (possibly on another machine), so it presumably
    # cannot rely on notebook-level imports.
    import os
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    import torch.optim as optim
    from torch.utils.data import DataLoader
    from torchvision.datasets import MNIST
    from torchvision import transforms
    import guidepad
    guidepad.initialize()
    from guidepad_ml.tracking.types import Run

    run = Run.list_single({'_id': params.run_id})
    device = params.device
    batch_size = int(params.batch_size)

    # create dataloaders for train/validation/test
    transform = transforms.Compose([transforms.ToTensor()])
    train_dataset = MNIST(root='./data', train=True, download=True, transform=transform)
    test_dataset = MNIST(root='./data', train=False, download=True, transform=transform)
    train_proportion = 0.9
    train_size = int(train_proportion * len(train_dataset))
    val_size = len(train_dataset) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(
        train_dataset, [train_size, val_size]
    )
    train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
    # Evaluation loaders are not shuffled: the loss is averaged per batch, so
    # with a partial last batch a shuffled order would make the reported
    # validation loss vary slightly between passes over the same model.
    val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

    # define model architecture
    class CNN(nn.Module):
        """Two conv layers followed by two fully connected layers (10 logits)."""

        def __init__(self, dropout_rate=0.4):
            super().__init__()
            self.conv1 = nn.Conv2d(1, 16, kernel_size=3)
            self.conv2 = nn.Conv2d(16, 32, kernel_size=3)
            self.dropout = nn.Dropout(dropout_rate)
            # 800 = 32 channels * 5 * 5 for a 28x28 input after two conv+pool stages
            self.fc1 = nn.Linear(800, 128)
            self.fc2 = nn.Linear(128, 10)

        def forward(self, x):
            x = F.max_pool2d(F.relu(self.conv1(x)), 2)
            x = self.dropout(x)
            x = F.max_pool2d(F.relu(self.conv2(x)), 2)
            x = torch.flatten(x, 1)
            x = self.dropout(F.relu(self.fc1(x)))
            return self.fc2(x)

    model = CNN(dropout_rate=float(params.dropout_rate)).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=float(params.lr))

    def train_epoch(loader):
        """Run one optimisation pass; return (mean batch loss, accuracy)."""
        model.train()
        total_loss = 0
        correct = 0
        for data, target in loader:
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            correct += (output.argmax(dim=1) == target).sum().item()
        return total_loss / len(loader), correct / len(loader.dataset)

    def evaluate(loader):
        """Score the model without gradients; return (mean batch loss, accuracy)."""
        model.eval()
        total_loss = 0
        correct = 0
        with torch.no_grad():
            for data, target in loader:
                data, target = data.to(device), target.to(device)
                output = model(data)
                total_loss += criterion(output, target).item()
                correct += (output.argmax(dim=1) == target).sum().item()
        return total_loss / len(loader), correct / len(loader.dataset)

    # train model
    for epoch in range(int(params.epochs)):
        train_loss, train_accuracy = train_epoch(train_loader)
        val_loss, val_accuracy = evaluate(val_loader)
        metrics = {
            'validation accuracy': val_accuracy,
            'validation loss': val_loss,
            'train loss': train_loss,
            'train accuracy': train_accuracy
        }
        # log metrics after each epoch
        print(f'Epoch {epoch} Train Loss: {round(train_loss, 6)}, Train Accuracy: {round(train_accuracy, 4)}, Validation Loss: {round(val_loss, 6)}, Validation Accuracy: {round(val_accuracy, 4)}')
        run = run.log_time_series_metrics(metrics, step=epoch)

    # log final metrics
    test_loss, accuracy = evaluate(test_loader)
    print(f'Run {params.run_id} Test Loss: {round(test_loss, 6)}, Test Accuracy: {accuracy}')
    run = run.log_metrics({'test accuracy': accuracy, 'test loss': test_loss})

    filename = 'model.pth'
    model_dir = f'models/{params.run_id}'
    model_filepath = os.path.join(model_dir, filename)
    os.makedirs(model_dir, exist_ok=True)
    # Only the state dict is saved, so the CNN class is needed again to load the model
    torch.save(model.state_dict(), model_filepath)
    # If we wanted, we could only log the model if the test accuracy is above a certain threshold
    run = run.log_model_artifact(filepath=model_filepath)
    return {'message': 'Done training!'}
experiment id: f3c34b053548486e954e41f978d6f3d6
writing training script to s3://<REMOVED BUCKET NAME>/experiments/f3c34b053548486e954e41f978d6f3d6/function_artifacts/train.py
# Hyperparameter sets, one per run. Values are strings; the training function
# converts them with int()/float().
all_params = [
    {
        'lr': '0.001',
        'dropout_rate': '0.25',
        'batch_size': '256',
        'epochs': '8'
    },
    {
        'lr': '0.0003',
        'dropout_rate': '0.35',
        'batch_size': '256',
        'epochs': '16'
    },
    {
        'lr': '0.001',
        'dropout_rate': '0.05',
        'batch_size': '256',
        'epochs': '8'
    },
    {
        'lr': '0.001',
        'dropout_rate': '0.45',
        'batch_size': '256',
        'epochs': '8'
    }
]

# Create one run per hyperparameter set and start each without waiting for it.
for i, param in enumerate(all_params):
    run = experiment.create_run(
        params=[Param(key=k, value=v) for k, v in param.items()],
        name=f'pytorch-tracking-remote-run-{i}'
    )
    # These run asynchronously, in parallel, in separate pods in Kubernetes
    run.start(asynchronous=True)
    print(f'Run {run.name} started')
Started run with id eba7a02b3af947c28f016c1b9450570d in invocation {'invocation_id': 'ee7b09818b1446d49c503b5f0ac738d9'}
Run pytorch-tracking-remote-run-0 started
Started run with id ff0f2affde734e80be46b57f9f040939 in invocation {'invocation_id': '31941f79000840bb9adf61d576a5bf61'}
Run pytorch-tracking-remote-run-1 started
Started run with id ab32e715f645415abb178063799e34d3 in invocation {'invocation_id': '9de17c9a6a2d493bac5ddf2efc96045c'}
Run pytorch-tracking-remote-run-2 started
Started run with id 27a7c342814347a88352eedc49c35355 in invocation {'invocation_id': '0014b60d46bb4f0f85829eae9d5ece9c'}
Run pytorch-tracking-remote-run-3 started
Identify the pod name of a run with kubectl -n <namespace> get pods
Stream the logs with kubectl -n <namespace> logs -f <pod-name>
status
attribute of the invocation record that is created for it. The status code mapping is as follows:
NOT_STARTED = 0
PENDING = 1
IDLE = 2
RUNNING = 3
FINISHED = 4
ERROR = 5
PAUSED = 6
Print out the status of all of the run invocations.
from guidepad.workplan.invocation import WorkPlanInvocation
import time
# Invocation ids copied from the "Started run ... in invocation ..." messages
# printed when the runs were started.
invocation_ids = [
    'ee7b09818b1446d49c503b5f0ac738d9',
    '31941f79000840bb9adf61d576a5bf61',
    '9de17c9a6a2d493bac5ddf2efc96045c',
    '0014b60d46bb4f0f85829eae9d5ece9c'
]

# Print the numeric status code of each invocation.
for invocation_id in invocation_ids:
    invocation = WorkPlanInvocation.list_single({'_id': invocation_id})
    print(invocation._id, invocation.status)
ee7b09818b1446d49c503b5f0ac738d9 3
31941f79000840bb9adf61d576a5bf61 3
9de17c9a6a2d493bac5ddf2efc96045c 3
0014b60d46bb4f0f85829eae9d5ece9c 3
Wait for the runs to finish.
# Status codes taken from the status code mapping (FINISHED = 4, ERROR = 5).
STATUS_FINISHED = 4
STATUS_ERROR = 5
POLL_INTERVAL_SECONDS = 30

# Poll until every invocation has finished, or stop as soon as one has failed.
# Finished invocations are removed from `invocation_ids`, so the list holds
# only the unfinished ones when the loop ends.
no_errors = True
while no_errors and invocation_ids:
    # Query with a snapshot of the list because it is mutated inside the loop.
    for invocation in WorkPlanInvocation.list({'_id': {'$in': list(invocation_ids)}}):
        if invocation.status == STATUS_ERROR:
            no_errors = False
            print(f'Invocation {invocation._id} failed')
        elif invocation.status == STATUS_FINISHED:
            print(f'Invocation {invocation._id} succeeded')
            invocation_ids.remove(invocation._id)
    # Only wait when another polling round will actually happen.
    if no_errors and invocation_ids:
        time.sleep(POLL_INTERVAL_SECONDS)
Invocation 9de17c9a6a2d493bac5ddf2efc96045c succeeded
Invocation ee7b09818b1446d49c503b5f0ac738d9 succeeded
Invocation 0014b60d46bb4f0f85829eae9d5ece9c succeeded
Invocation 31941f79000840bb9adf61d576a5bf61 succeeded
# Retrieve the experiment
experiment = Experiment.list_single({'name': experiment_name})

# Retrieve the best two runs
best_runs = experiment.search_runs(metric='test accuracy', ascending=False, n=2)

# Print out the metrics for the best two runs
print('Metrics for the best two runs:')
print('\n'.join([str([(m.key, m.value) for m in run.metrics]) for run in best_runs]))

# print out the metrics for all the runs
print('')
for r in experiment.runs:
    print(r._id, r.name, [(m.key, m.value) for m in r.metrics], r.experiment_id)

# Retrieve the time series metrics for the best run
run_time_series_metrics_df = best_runs[0].get_time_series_dataframe()
print('\nTime series metrics for the best run:')
print(run_time_series_metrics_df)

# Print out the S3 path to the best model artifact
print('\nBest model artifact:')
best_model_artifact = best_runs[0].model_artifacts[0]
print('s3://' + best_model_artifact.storage_path)

# Retrieve the experiment dataframe
experiment_df = experiment.get_experiment_df()
print('\nExperiment dataframe:')
print(experiment_df)
Metrics for the best two runs:
[('test accuracy', 0.9897), ('test loss', 0.028761554773518584)]
[('test accuracy', 0.9892), ('test loss', 0.033548171258735236)]
eba7a02b3af947c28f016c1b9450570d pytorch-tracking-remote-run-0 [('test accuracy', 0.9892), ('test loss', 0.033548171258735236)] f3c34b053548486e954e41f978d6f3d6
ff0f2affde734e80be46b57f9f040939 pytorch-tracking-remote-run-1 [('test accuracy', 0.9865), ('test loss', 0.0374485233289306)] f3c34b053548486e954e41f978d6f3d6
ab32e715f645415abb178063799e34d3 pytorch-tracking-remote-run-2 [('test accuracy', 0.9897), ('test loss', 0.028761554773518584)] f3c34b053548486e954e41f978d6f3d6
27a7c342814347a88352eedc49c35355 pytorch-tracking-remote-run-3 [('test accuracy', 0.9858), ('test loss', 0.039154025849711616)] f3c34b053548486e954e41f978d6f3d6
Time series metrics for the best run:
step key value
0 0 validation accuracy 0.953500
1 0 validation loss 0.156264
2 0 train loss 0.510594
3 0 train accuracy 0.850241
4 1 validation accuracy 0.971500
5 1 validation loss 0.098873
6 1 train loss 0.114610
7 1 train accuracy 0.965889
8 2 validation accuracy 0.977667
9 2 validation loss 0.073801
10 2 train loss 0.074381
11 2 train accuracy 0.977630
12 3 validation accuracy 0.984167
13 3 validation loss 0.058047
14 3 train loss 0.058351
15 3 train accuracy 0.982111
16 4 validation accuracy 0.986500
17 4 validation loss 0.049536
18 4 train loss 0.046574
19 4 train accuracy 0.985537
20 5 validation accuracy 0.986333
21 5 validation loss 0.047834
22 5 train loss 0.040462
23 5 train accuracy 0.987074
24 6 validation accuracy 0.986167
25 6 validation loss 0.044351
26 6 train loss 0.034419
27 6 train accuracy 0.989241
28 7 validation accuracy 0.990833
29 7 validation loss 0.035157
30 7 train loss 0.031094
31 7 train accuracy 0.990333
Best model artifact:
s3://<REMOVED BUCKET NAME>/experiments/f3c34b053548486e954e41f978d6f3d6/runs/ab32e715f645415abb178063799e34d3/model_artifacts/model.pth
Experiment dataframe:
run_id lr dropout_rate batch_size epochs \
0 eba7a02b3af947c28f016c1b9450570d 0.001 0.25 256 8
1 ff0f2affde734e80be46b57f9f040939 0.0003 0.35 256 16
2 ab32e715f645415abb178063799e34d3 0.001 0.05 256 8
3 27a7c342814347a88352eedc49c35355 0.001 0.45 256 8
test accuracy test loss
0 0.9892 0.033548
1 0.9865 0.037449
2 0.9897 0.028762
3 0.9858 0.039154
# Rebuild the architecture, then load the trained weights from the artifact.
model = CNN()
with best_model_artifact.open() as f:
    # map_location='cpu' lets weights saved from a GPU run load on a CPU-only machine
    model.load_state_dict(torch.load(f, map_location='cpu'))
model.eval()

# only need one sample from the test set
transform = transforms.Compose([transforms.ToTensor()])
test_dataset = torchvision.datasets.MNIST(root='./data', train=False, download=True, transform=transform)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=1, shuffle=True)
data, target = next(iter(test_loader))

# Inference only: no gradients are needed
with torch.no_grad():
    output = model(data)
pred = output.argmax(dim=1, keepdim=True)
print(f'Prediction: {pred.item()}')
plt.imshow(data[0].squeeze(), cmap='gray')
plt.show()
Prediction: 9
# Label that the test loader returned for the sampled image, for comparison with the prediction.
print(target)
tensor([9])
Guidepad's ML Plugin
Jul 28, 2023 · 10 min read
Guidepad's Managed Embeddings Service (Part 1)
Aug 8, 2023 · 10 min read
Guidepad's Managed Embeddings Service (Part 2)
Aug 8, 2023 · 10 min read