Train an TensorFlow model with a SageMaker Training Job and track it using SageMaker Experiments
This notebook’s CI test result for us-west-2 is as follows. CI test results in other regions can be found at the end of the notebook.
This notebook shows how you can use the SageMaker SDK to track a Machine Learning experiment.
We introduce two concepts in this notebook -
Experiment: An experiment is a collection of runs. When you initialize a run in your training loop, you include the name of the experiment that the run belongs to. Experiment names must be unique within your AWS account.
Run: A run consists of all the inputs, parameters, configurations, and results for one iteration of model training. Initialize an experiment run for tracking a training job with Run().
In this notebook we train a Keras model using the MNIST dataset on a remote SageMaker instance using a training job.
[ ]:
import sys
[ ]:
# update boto3 and sagemaker to ensure latest SDK version
!{sys.executable} -m pip install --upgrade pip
!{sys.executable} -m pip install --upgrade boto3
!{sys.executable} -m pip install --upgrade sagemaker
!{sys.executable} -m pip install --upgrade tensorflow
[ ]:
import os
import boto3
import json
import sagemaker
from sagemaker.session import Session
from sagemaker import get_execution_role
from sagemaker.experiments.run import Run
from sagemaker.utils import unique_name_from_base
[ ]:
sagemaker_session = Session()
boto_sess = boto3.Session()
role = get_execution_role()
default_bucket = sagemaker_session.default_bucket()
sm = boto_sess.client("sagemaker")
region = boto_sess.region_name
Prepare the training script
Here we use a SageMaker Training job to train the model on a remote instance.
[ ]:
!mkdir -p script
[ ]:
%%writefile ./script/train.py
import os
os.system("pip install -U sagemaker")
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
import argparse
from sagemaker.session import Session
from sagemaker.experiments import load_run
import boto3
boto_session = boto3.session.Session(region_name=os.environ["REGION"])
sagemaker_session = Session(boto_session=boto_session)
s3 = boto3.client("s3")
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--epochs", type=int, default=1)
parser.add_argument("--batch_size", type=int, default=64)
parser.add_argument("--dropout", type=float, default=0.01)
return parser.parse_known_args()
class ExperimentCallback(keras.callbacks.Callback):
""" """
def __init__(self, run, model, x_test, y_test):
"""Save params in constructor"""
self.run = run
self.model = model
self.x_test = x_test
self.y_test = y_test
def on_epoch_end(self, epoch, logs=None):
""" """
keys = list(logs.keys())
for key in keys:
self.run.log_metric(name=key, value=logs[key], step=epoch)
print("{} -> {}".format(key, logs[key]))
def load_data():
num_classes = 10
input_shape = (28, 28, 1)
train_path = "input_train.npy"
test_path = "input_test.npy"
train_labels_path = "input_train_labels.npy"
test_labels_path = "input_test_labels.npy"
# Load the data and split it between train and test sets
s3.download_file(
f"sagemaker-example-files-prod-{os.environ['REGION']}", "datasets/image/MNIST/numpy/input_train.npy", train_path
)
s3.download_file(
f"sagemaker-example-files-prod-{os.environ['REGION']}", "datasets/image/MNIST/numpy/input_test.npy", test_path
)
s3.download_file(
f"sagemaker-example-files-prod-{os.environ['REGION']}",
"datasets/image/MNIST/numpy/input_train_labels.npy",
train_labels_path,
)
s3.download_file(
f"sagemaker-example-files-prod-{os.environ['REGION']}",
"datasets/image/MNIST/numpy/input_test_labels.npy",
test_labels_path,
)
x_train = np.load(train_path)
x_test = np.load(test_path)
y_train = np.load(train_labels_path)
y_test = np.load(test_labels_path)
# Reshape the arrays
x_train = np.reshape(x_train, (60000, 28, 28))
x_test = np.reshape(x_test, (10000, 28, 28))
y_train = np.reshape(y_train, (60000,))
y_test = np.reshape(y_test, (10000,))
# Scale images to the [0, 1] range
x_train = x_train.astype("float32") / 255
x_test = x_test.astype("float32") / 255
# Make sure images have shape (28, 28, 1)
x_train = np.expand_dims(x_train, -1)
x_test = np.expand_dims(x_test, -1)
print("x_train shape:", x_train.shape)
print(x_train.shape[0], "train samples")
print(x_test.shape[0], "test samples")
# convert class vectors to binary class matrices
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
return x_train, x_test, y_train, y_test
def main():
""" """
args, _ = parse_args()
print("Args are : ", args)
num_classes = 10
input_shape = (28, 28, 1)
x_train, x_test, y_train, y_test = load_data()
model = keras.Sequential(
[
keras.Input(shape=input_shape),
layers.Conv2D(32, kernel_size=(3, 3), activation="relu"),
layers.MaxPooling2D(pool_size=(2, 2)),
layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
layers.MaxPooling2D(pool_size=(2, 2)),
layers.Flatten(),
layers.Dropout(args.dropout),
layers.Dense(num_classes, activation="softmax"),
]
)
model.summary()
batch_size = args.batch_size
epochs = args.epochs
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
###
# `load_run` will use the run defined when calling the estimator
###
with load_run(sagemaker_session=sagemaker_session) as run:
model.fit(
x_train,
y_train,
batch_size=batch_size,
epochs=epochs,
validation_split=0.1,
callbacks=[ExperimentCallback(run, model, x_test, y_test)],
)
score = model.evaluate(x_test, y_test, verbose=0)
print("Test loss:", score[0])
print("Test accuracy:", score[1])
run.log_metric(name="Final Test Loss", value=score[0])
run.log_metric(name="Final Test Accuracy", value=score[1])
model.save("/opt/ml/model")
if __name__ == "__main__":
main()
Create an Experiment and launch a training job
[ ]:
from sagemaker.tensorflow.estimator import TensorFlow
from sagemaker.experiments.run import Run
exp_name = "tensorflow-script-mode-experiment"
batch_size = 256
epochs = 5
dropout = 0.1
with Run(
experiment_name=exp_name,
sagemaker_session=sagemaker_session,
) as run:
run.log_parameter("batch_size", batch_size)
run.log_parameter("epochs", epochs)
run.log_parameter("dropout", dropout)
est = TensorFlow(
entry_point="./script/train.py",
role=role,
model_dir=False,
hyperparameters={"epochs": epochs, "batch_size": batch_size, "dropout": dropout},
framework_version="2.8",
py_version="py39",
instance_type="ml.m5.xlarge",
instance_count=1,
keep_alive_period_in_seconds=3600,
environment={"REGION": region},
)
est.fit()
[ ]:
est.model_data
Register the trained model in the Model Registry
This is an optional step users can take if they want to keep track of their models in a central model catalog.
[ ]:
from sagemaker.tensorflow.model import TensorFlowModel
model = TensorFlowModel(model_data=est.model_data, role=role, framework_version="2.8")
[ ]:
model.register(
model_package_group_name="tensorflow-script-mode-model",
content_types=["text/csv"],
inference_instances=["ml.m5.xlarge"],
transform_instances=["ml.m5.xlarge"],
response_types=["text/csv"],
approval_status="PendingManualApproval",
)
Notebook CI Test Results
This notebook was tested in multiple regions. The test results are as follows, except for us-west-2 which is shown at the top of the notebook.