{ "cells": [ { "cell_type": "markdown", "id": "aba29661-2e0d-4154-bda7-3734e0db929b", "metadata": {}, "source": [ "# SageMaker HPO with MLflow" ] }, { "cell_type": "markdown", "id": "78515a73", "metadata": {}, "source": [ "This notebook's CI test result for us-west-2 is as follows. CI test results in other regions can be found at the end of the notebook.\n", "\n", "![This us-west-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-west-2/sagemaker-mlflow|sagemaker_hpo_mlflow.ipynb)" ] }, { "cell_type": "markdown", "id": "ad2f8898-a6d1-4efb-83f9-e73d48401a10", "metadata": {}, "source": [ "Train a PyTorch model using HPO in SageMaker and track with MLflow using nested runs" ] }, { "cell_type": "markdown", "id": "ac89ce60-66b1-48e8-a0aa-d2dc7be4c7a0", "metadata": {}, "source": [ "## Setup environment" ] }, { "cell_type": "markdown", "id": "f77548cf-8c27-4a48-96e5-15b240d2d3fb", "metadata": {}, "source": [ "Install necessary libraries" ] }, { "cell_type": "code", "execution_count": null, "id": "676cb6d2-be92-4682-9f23-7c1feb8a619d", "metadata": {}, "outputs": [], "source": [ "!pip install torchvision mlflow==2.13.2 sagemaker-mlflow==0.1.0" ] }, { "cell_type": "markdown", "id": "3454d086-3642-4a72-a9fd-f97935e28de7", "metadata": {}, "source": [ "Import necessary libraries" ] }, { "cell_type": "code", "execution_count": null, "id": "c0308c7a-f89d-4f82-845a-6168ade2cea3", "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "from torchvision import transforms\n", "from torchvision.datasets import MNIST\n", "\n", "import mlflow\n", "import sagemaker\n", "from sagemaker import get_execution_role\n", "from sagemaker.pytorch import PyTorch\n", "from sagemaker.tuner import (\n", " CategoricalParameter,\n", " ContinuousParameter,\n", " HyperparameterTuner,\n", " IntegerParameter,\n", ")" ] }, { "cell_type": "markdown", "id": "ce6690c7-fdcb-486c-b4e7-4cb0b29ad63e", "metadata": {}, "source": [ "Declare some variables used later" ] }, { "cell_type": "code", "execution_count": null, "id": "3fa015b1-13b9-41b7-b42c-bf227efdac0a", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Define session, role, and region so we can\n", "# perform any SageMaker tasks we need\n", "sagemaker_session = sagemaker.Session()\n", "role = get_execution_role()\n", "region = sagemaker_session.boto_region_name\n", "bucket = sagemaker_session.default_bucket()\n", "\n", "# S3 prefix for the training dataset to be uploaded to\n", "prefix = \"DEMO-pytorch-mnist\"\n", "\n", "# MLflow (replace these values with your own)\n", "tracking_server_arn = \"your tracking server arn\"\n", "experiment_name = \"MNIST\"" ] }, { "cell_type": "code", "execution_count": null, "id": "e551dbac", "metadata": {}, "outputs": [], "source": [ "!mkdir -p training_code" ] }, { "cell_type": "markdown", "id": "0afcafe4-2462-4831-94f5-5bc71371a8a0", "metadata": {}, "source": [ "## Get some training data" ] }, { "cell_type": "markdown", "id": "fc751432-d2a5-44e4-b1e2-ac0454538f67", "metadata": {}, "source": [ "Download MNIST data" ] }, { "cell_type": "code", "execution_count": null, "id": "6d54a0c8-ec3c-47b7-8f4a-c3a2d45d4236", "metadata": {}, "outputs": [], "source": [ "local_dir = \"data\"\n", "MNIST.mirrors = [\n", " f\"https://sagemaker-example-files-prod-{region}.s3.amazonaws.com/datasets/image/MNIST/\"\n", "]\n", "MNIST(\n", " local_dir,\n", " download=True,\n", " transform=transforms.Compose(\n", " [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]\n", " ),\n", ")" ] }, { "cell_type": "markdown", "id": "ab046942-ef87-4b0b-a0b4-02f194dc370a", "metadata": {}, "source": [ "Upload data to S3" ] }, { "cell_type": "code", "execution_count": null, "id": "5b4c0b01-8532-4184-918e-35904f25fc14", "metadata": {}, "outputs": [], "source": [ "train_input = sagemaker_session.upload_data(path=\"data\", bucket=bucket, key_prefix=prefix)" ] }, { "cell_type": "markdown", "id": "592db81e-089c-4158-a6a0-16ae2a6b94af", "metadata": {}, "source": [ "### Write your training script" ] }, { "cell_type": "code", "execution_count": null, "id": "6f6b33ed-753e-4b71-b02f-8bdbd414aae1", "metadata": {}, "outputs": [], "source": [ "%%writefile training_code/mnist.py\n", "\n", "import argparse\n", "import json\n", "import logging\n", "import os\n", "import sys\n", "\n", "import mlflow\n", "import torch\n", "import torch.distributed as dist\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "import torch.optim as optim\n", "import torch.utils.data\n", "import torch.utils.data.distributed\n", "from torchinfo import summary\n", "from torchvision import datasets, transforms\n", "\n", "logger = logging.getLogger(__name__)\n", "logger.setLevel(logging.DEBUG)\n", "logger.addHandler(logging.StreamHandler(sys.stdout))\n", "\n", "# Set MLFlow specifics\n", "parent_run_id = os.environ.get('MLFLOW_PARENT_RUN_ID', None)\n", "mlflow_experiment_name = os.environ.get('MLFLOW_EXPERIMENT_NAME', None)\n", "\n", "# Based on https://github.com/pytorch/examples/blob/master/mnist/main.py\n", "class Net(nn.Module):\n", " def __init__(self):\n", " super(Net, self).__init__()\n", " self.conv1 = nn.Conv2d(1, 10, kernel_size=5)\n", " self.conv2 = nn.Conv2d(10, 20, kernel_size=5)\n", " self.conv2_drop = nn.Dropout2d()\n", " self.fc1 = nn.Linear(320, 50)\n", " self.fc2 = nn.Linear(50, 10)\n", "\n", " def forward(self, x):\n", " x = F.relu(F.max_pool2d(self.conv1(x), 2))\n", " x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))\n", " x = x.view(-1, 320)\n", " x = F.relu(self.fc1(x))\n", " x = F.dropout(x, training=self.training)\n", " x = self.fc2(x)\n", " return F.log_softmax(x, dim=1)\n", "\n", "\n", "def _get_train_data_loader(batch_size, training_dir, is_distributed, **kwargs):\n", " logger.info('Get train data loader')\n", " dataset = datasets.MNIST(\n", " training_dir,\n", " train=True,\n", " transform=transforms.Compose(\n", " [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]\n", " ),\n", " )\n", " train_sampler = (\n", " torch.utils.data.distributed.DistributedSampler(dataset)\n", " if is_distributed\n", " else None\n", " )\n", " return torch.utils.data.DataLoader(\n", " dataset,\n", " batch_size=batch_size,\n", " shuffle=train_sampler is None,\n", " sampler=train_sampler,\n", " **kwargs,\n", " )\n", "\n", "\n", "def _get_test_data_loader(test_batch_size, training_dir, **kwargs):\n", " logger.info('Get test data loader')\n", " return torch.utils.data.DataLoader(\n", " datasets.MNIST(\n", " training_dir,\n", " train=False,\n", " transform=transforms.Compose(\n", " [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]\n", " ),\n", " ),\n", " batch_size=test_batch_size,\n", " shuffle=True,\n", " **kwargs,\n", " )\n", "\n", "\n", "def _average_gradients(model):\n", " # Gradient averaging.\n", " size = float(dist.get_world_size())\n", " for param in model.parameters():\n", " dist.all_reduce(param.grad.data, op=dist.reduce_op.SUM, group=0)\n", " param.grad.data /= size\n", "\n", "\n", "def train(args):\n", " is_distributed = len(args.hosts) > 1 and args.backend is not None\n", " logger.debug('Distributed training - {}'.format(is_distributed))\n", " use_cuda = args.num_gpus > 0\n", " logger.debug('Number of gpus available - {}'.format(args.num_gpus))\n", " kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}\n", " device = torch.device('cuda' if use_cuda else 'cpu')\n", "\n", " region = os.getenv('AWS_REGION')\n", "\n", " # if there's a parent_run_id run as nested MLflow_run\n", " nested = False\n", "\n", " if parent_run_id:\n", " nested = True\n", "\n", " if is_distributed:\n", " # Initialize the distributed environment.\n", " world_size = len(args.hosts)\n", " os.environ['WORLD_SIZE'] = str(world_size)\n", " host_rank = args.hosts.index(args.current_host)\n", " dist.init_process_group(\n", " backend=args.backend, rank=host_rank, world_size=world_size\n", " )\n", " logger.info(\n", " \"Initialized the distributed environment: '{}' backend on {} nodes. \".format(\n", " args.backend, dist.get_world_size()\n", " )\n", " + \"Current host rank is {}. Number of gpus: {}\".format(\n", " dist.get_rank(), args.num_gpus\n", " )\n", " )\n", "\n", " # set the seed for generating random numbers\n", " torch.manual_seed(args.seed)\n", " if use_cuda:\n", " torch.cuda.manual_seed(args.seed)\n", "\n", " train_loader = _get_train_data_loader(\n", " args.batch_size, args.data_dir, is_distributed, **kwargs\n", " )\n", " test_loader = _get_test_data_loader(args.test_batch_size, args.data_dir, **kwargs)\n", "\n", " logger.debug(\n", " \"Processes {}/{} ({:.0f}%) of train data\".format(\n", " len(train_loader.sampler),\n", " len(train_loader.dataset),\n", " 100.0 * len(train_loader.sampler) / len(train_loader.dataset),\n", " )\n", " )\n", "\n", " logger.debug(\n", " \"Processes {}/{} ({:.0f}%) of test data\".format(\n", " len(test_loader.sampler),\n", " len(test_loader.dataset),\n", " 100.0 * len(test_loader.sampler) / len(test_loader.dataset),\n", " )\n", " )\n", "\n", " model = Net().to(device)\n", " if is_distributed and use_cuda:\n", " # multi-machine multi-gpu case\n", " model = torch.nn.parallel.DistributedDataParallel(model)\n", " else:\n", " # single-machine multi-gpu case or single-machine or multi-machine cpu case\n", " model = torch.nn.DataParallel(model)\n", "\n", " optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)\n", " with mlflow.start_run(nested=nested):\n", " params = {\n", " k: o\n", " for k, o in vars(args).items()\n", " }\n", " sm_training_env = json.loads(os.environ['SM_TRAINING_ENV'])\n", " job_name = sm_training_env['job_name']\n", " job_uri = f'https://{region}.console.aws.amazon.com/sagemaker/home?region={region}#/jobs/{job_name}'\n", " mlflow.log_params(\n", " {**params, 'sagemaker_job_name': job_name, 'sagemaker_job_uri': job_uri}\n", " )\n", "\n", " # Log model summary.\n", " with open('model_summary.txt', 'w') as f:\n", " f.write(str(summary(model)))\n", " mlflow.log_artifact('model_summary.txt')\n", "\n", " for epoch in range(1, args.epochs + 1):\n", " model.train()\n", " for batch_idx, (data, target) in enumerate(train_loader, 1):\n", " data, target = data.to(device), target.to(device)\n", " optimizer.zero_grad()\n", " output = model(data)\n", " loss = F.nll_loss(output, target)\n", " loss.backward()\n", " if is_distributed and not use_cuda:\n", " # average gradients manually for multi-machine cpu case only\n", " _average_gradients(model)\n", " optimizer.step()\n", " if batch_idx % args.log_interval == 0:\n", " logger.info(\n", " 'Train Epoch: {} [{}/{} ({:.0f}%)] Loss: {:.6f}'.format(\n", " epoch,\n", " batch_idx * len(data),\n", " len(train_loader.sampler),\n", " 100.0 * batch_idx / len(train_loader),\n", " loss.item(),\n", " )\n", " )\n", " mlflow.log_metric(\n", " 'loss',\n", " loss.item(),\n", " step=(batch_idx // args.log_interval),\n", " )\n", "\n", " test(model, test_loader, device)\n", " save_model(model, args.model_dir)\n", "\n", "\n", "def test(model, test_loader, device):\n", " model.eval()\n", " test_loss = 0\n", " correct = 0\n", " with torch.no_grad():\n", " for data, target in test_loader:\n", " data, target = data.to(device), target.to(device)\n", " output = model(data)\n", " test_loss += F.nll_loss(\n", " output, target, size_average=False\n", " ).item() # sum up batch loss\n", " pred = output.max(1, keepdim=True)[\n", " 1\n", " ] # get the index of the max log-probability\n", " correct += pred.eq(target.view_as(pred)).sum().item()\n", "\n", " test_loss /= len(test_loader.dataset)\n", " logger.info(\n", " 'Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\\n'.format(\n", " test_loss,\n", " correct,\n", " len(test_loader.dataset),\n", " 100.0 * correct / len(test_loader.dataset),\n", " )\n", " )\n", " mlflow.log_metrics(\n", " {\n", " 'test_average_loss': test_loss,\n", " 'test_accuracy': correct / len(test_loader.dataset),\n", " }\n", " )\n", "\n", "def model_fn(model_dir):\n", " device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", " model = torch.nn.DataParallel(Net())\n", " with open(os.path.join(model_dir, 'model.pth'), 'rb') as f:\n", " model.load_state_dict(torch.load(f))\n", " return model.to(device)\n", "\n", "\n", "def save_model(model, model_dir):\n", " logger.info('Saving the model.')\n", " path = os.path.join(model_dir, 'model.pth')\n", " # recommended way from http://pytorch.org/docs/master/notes/serialization.html\n", " torch.save(model.cpu().state_dict(), path)\n", "\n", "\n", "if __name__ == '__main__':\n", " parser = argparse.ArgumentParser()\n", "\n", " # Data and model checkpoints directories\n", " parser.add_argument(\n", " '--batch-size',\n", " type=int,\n", " default=64,\n", " metavar='N',\n", " help='input batch size for training (default: 64)',\n", " )\n", " parser.add_argument(\n", " '--test-batch-size',\n", " type=int,\n", " default=1000,\n", " metavar='N',\n", " help='input batch size for testing (default: 1000)',\n", " )\n", " parser.add_argument(\n", " '--epochs',\n", " type=int,\n", " default=10,\n", " metavar='N',\n", " help='number of epochs to train (default: 10)',\n", " )\n", " parser.add_argument(\n", " '--lr',\n", " type=float,\n", " default=0.01,\n", " metavar='LR',\n", " help='learning rate (default: 0.01)',\n", " )\n", " parser.add_argument(\n", " '--momentum',\n", " type=float,\n", " default=0.5,\n", " metavar='M',\n", " help='SGD momentum (default: 0.5)',\n", " )\n", " parser.add_argument(\n", " '--seed', type=int, default=1, metavar='S', help='random seed (default: 1)'\n", " )\n", " parser.add_argument(\n", " '--log-interval',\n", " type=int,\n", " default=100,\n", " metavar='N',\n", " help='how many batches to wait before logging training status',\n", " )\n", " parser.add_argument(\n", " '--backend',\n", " type=str,\n", " default=None,\n", " help='backend for distributed training (tcp, gloo on cpu and gloo, nccl on gpu)',\n", " )\n", "\n", " # Container environment\n", " parser.add_argument(\n", " '--hosts', type=list, default=json.loads(os.environ['SM_HOSTS'])\n", " )\n", " parser.add_argument(\n", " '--current-host', type=str, default=os.environ['SM_CURRENT_HOST']\n", " )\n", " parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])\n", " parser.add_argument(\n", " '--data-dir', type=str, default=os.environ['SM_CHANNEL_TRAINING']\n", " )\n", " parser.add_argument('--num-gpus', type=int, default=os.environ['SM_NUM_GPUS'])\n", "\n", " args = parser.parse_args()\n", "\n", " mlflow.set_experiment(mlflow_experiment_name)\n", " if parent_run_id:\n", " with mlflow.start_run(run_id=parent_run_id):\n", " train(args)\n", " else:\n", " train(args)" ] }, { "cell_type": "markdown", "id": "6f46e27a-717e-4587-8ec0-71ee28f2ded6", "metadata": {}, "source": [ "Since we're using MLflow in our training script, let's make sure the container installs `mlflow` along with our MLflow AWS plugin before running our training script. We can do this by creating a `requirements.txt` file and putting it in the same directory as our training script." ] }, { "cell_type": "code", "execution_count": null, "id": "183f77d7-0189-4c06-a5ee-e491e17ac172", "metadata": {}, "outputs": [], "source": [ "%%writefile training_code/requirements.txt\n", "mlflow==2.13.2\n", "torchinfo\n", "sagemaker-mlflow==0.1.0" ] }, { "cell_type": "markdown", "id": "314a2fd2-4c5c-4aae-9ab5-419e0d7fe9f9", "metadata": {}, "source": [ "## SageMaker HPO and MLflow" ] }, { "cell_type": "code", "execution_count": null, "id": "f1fa6668-571c-49a7-ac9f-d6638f39393e", "metadata": { "tags": [] }, "outputs": [], "source": [ "hyperparameter_ranges = {\n", " \"lr\": ContinuousParameter(0.001, 0.1),\n", " \"batch-size\": CategoricalParameter([32, 64, 128, 256, 512]),\n", "}\n", "\n", "objective_metric_name = \"average test loss\"\n", "objective_type = \"Minimize\"\n", "metric_definitions = [{\"Name\": \"average test loss\", \"Regex\": \"Test set: Average loss: ([0-9\\\\.]+)\"}]" ] }, { "cell_type": "markdown", "id": "248e3cbc-6835-4d55-94b3-6a3dd873f4d2", "metadata": {}, "source": [ "Create a MLflow experiment called `MNIST`. We'll give this SageMaker HPO job a run name, `HPODemo`. Each training attempt will be its own child run under `HPODemo`." ] }, { "cell_type": "code", "execution_count": null, "id": "8b9129f0-c485-4e22-ab41-55dec0e4672b", "metadata": { "tags": [] }, "outputs": [], "source": [ "mlflow.set_tracking_uri(tracking_server_arn)\n", "experiment = mlflow.set_experiment(experiment_name)\n", "\n", "with mlflow.start_run(run_name=sagemaker.utils.name_from_base(\"HPODemo\")) as run:\n", " estimator = PyTorch(\n", " entry_point=\"mnist.py\",\n", " source_dir=\"training_code\",\n", " role=role,\n", " py_version=\"py39\",\n", " framework_version=\"1.13\",\n", " instance_count=1,\n", " instance_type=\"ml.c5.2xlarge\",\n", " hyperparameters={\"epochs\": 5, \"backend\": \"gloo\"},\n", " environment={\n", " \"MLFLOW_TRACKING_URI\": tracking_server_arn,\n", " \"MLFLOW_EXPERIMENT_NAME\": experiment.name,\n", " \"MLFLOW_PARENT_RUN_ID\": run.info.run_id,\n", " },\n", " )\n", "\n", " tuner = HyperparameterTuner(\n", " estimator,\n", " objective_metric_name,\n", " hyperparameter_ranges,\n", " metric_definitions,\n", " max_jobs=9,\n", " max_parallel_jobs=3,\n", " objective_type=objective_type,\n", " )\n", " tuner.fit({\"training\": train_input})" ] }, { "cell_type": "markdown", "id": "3ebae82c", "metadata": {}, "source": [ "## Notebook CI Test Results\n", "\n", "This notebook was tested in multiple regions. The test results are as follows, except for us-west-2 which is shown at the top of the notebook.\n", "\n", "\n", "![This us-east-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-east-1/sagemaker-mlflow|sagemaker_hpo_mlflow.ipynb)\n", "\n", "![This us-east-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-east-2/sagemaker-mlflow|sagemaker_hpo_mlflow.ipynb)\n", "\n", "![This us-west-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-west-1/sagemaker-mlflow|sagemaker_hpo_mlflow.ipynb)\n", "\n", "![This ca-central-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ca-central-1/sagemaker-mlflow|sagemaker_hpo_mlflow.ipynb)\n", "\n", "![This sa-east-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/sa-east-1/sagemaker-mlflow|sagemaker_hpo_mlflow.ipynb)\n", "\n", "![This eu-west-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-west-1/sagemaker-mlflow|sagemaker_hpo_mlflow.ipynb)\n", "\n", "![This eu-west-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-west-2/sagemaker-mlflow|sagemaker_hpo_mlflow.ipynb)\n", "\n", "![This eu-west-3 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-west-3/sagemaker-mlflow|sagemaker_hpo_mlflow.ipynb)\n", "\n", "![This eu-central-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-central-1/sagemaker-mlflow|sagemaker_hpo_mlflow.ipynb)\n", "\n", "![This eu-north-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-north-1/sagemaker-mlflow|sagemaker_hpo_mlflow.ipynb)\n", "\n", "![This ap-southeast-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-southeast-1/sagemaker-mlflow|sagemaker_hpo_mlflow.ipynb)\n", "\n", "![This ap-southeast-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-southeast-2/sagemaker-mlflow|sagemaker_hpo_mlflow.ipynb)\n", "\n", "![This ap-northeast-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-northeast-1/sagemaker-mlflow|sagemaker_hpo_mlflow.ipynb)\n", "\n", "![This ap-northeast-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-northeast-2/sagemaker-mlflow|sagemaker_hpo_mlflow.ipynb)\n", "\n", "![This ap-south-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-south-1/sagemaker-mlflow|sagemaker_hpo_mlflow.ipynb)" ] } ], "metadata": { "availableInstances": [ { "_defaultOrder": 0, "_isFastLaunch": true, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 4, "name": "ml.t3.medium", "vcpuNum": 2 }, { "_defaultOrder": 1, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 8, "name": "ml.t3.large", "vcpuNum": 2 }, { "_defaultOrder": 2, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 16, "name": "ml.t3.xlarge", "vcpuNum": 4 }, { "_defaultOrder": 3, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 32, "name": "ml.t3.2xlarge", "vcpuNum": 8 }, { "_defaultOrder": 4, "_isFastLaunch": true, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 8, "name": "ml.m5.large", "vcpuNum": 2 }, { "_defaultOrder": 5, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 16, "name": "ml.m5.xlarge", "vcpuNum": 4 }, { "_defaultOrder": 6, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 32, "name": "ml.m5.2xlarge", "vcpuNum": 8 }, { "_defaultOrder": 7, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 64, "name": "ml.m5.4xlarge", "vcpuNum": 16 }, { "_defaultOrder": 8, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 128, "name": "ml.m5.8xlarge", "vcpuNum": 32 }, { "_defaultOrder": 9, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 192, "name": "ml.m5.12xlarge", "vcpuNum": 48 }, { "_defaultOrder": 10, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 256, "name": "ml.m5.16xlarge", "vcpuNum": 64 }, { "_defaultOrder": 11, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 384, "name": "ml.m5.24xlarge", "vcpuNum": 96 }, { "_defaultOrder": 12, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 8, "name": "ml.m5d.large", "vcpuNum": 2 }, { "_defaultOrder": 13, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 16, "name": "ml.m5d.xlarge", "vcpuNum": 4 }, { "_defaultOrder": 14, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 32, "name": "ml.m5d.2xlarge", "vcpuNum": 8 }, { "_defaultOrder": 15, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 64, "name": "ml.m5d.4xlarge", "vcpuNum": 16 }, { "_defaultOrder": 16, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 128, "name": "ml.m5d.8xlarge", "vcpuNum": 32 }, { "_defaultOrder": 17, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 192, "name": "ml.m5d.12xlarge", "vcpuNum": 48 }, { "_defaultOrder": 18, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 256, "name": "ml.m5d.16xlarge", "vcpuNum": 64 }, { "_defaultOrder": 19, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 384, "name": "ml.m5d.24xlarge", "vcpuNum": 96 }, { "_defaultOrder": 20, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": true, "memoryGiB": 0, "name": "ml.geospatial.interactive", "supportedImageNames": [ "sagemaker-geospatial-v1-0" ], "vcpuNum": 0 }, { "_defaultOrder": 21, "_isFastLaunch": true, "category": "Compute optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 4, "name": "ml.c5.large", "vcpuNum": 2 }, { "_defaultOrder": 22, "_isFastLaunch": false, "category": "Compute optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 8, "name": "ml.c5.xlarge", "vcpuNum": 4 }, { "_defaultOrder": 23, "_isFastLaunch": false, "category": "Compute optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 16, "name": "ml.c5.2xlarge", "vcpuNum": 8 }, { "_defaultOrder": 24, "_isFastLaunch": false, "category": "Compute optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 32, "name": "ml.c5.4xlarge", "vcpuNum": 16 }, { "_defaultOrder": 25, "_isFastLaunch": false, "category": "Compute optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 72, "name": "ml.c5.9xlarge", "vcpuNum": 36 }, { "_defaultOrder": 26, "_isFastLaunch": false, "category": "Compute optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 96, "name": "ml.c5.12xlarge", "vcpuNum": 48 }, { "_defaultOrder": 27, "_isFastLaunch": false, "category": "Compute optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 144, "name": "ml.c5.18xlarge", "vcpuNum": 72 }, { "_defaultOrder": 28, "_isFastLaunch": false, "category": "Compute optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 192, "name": "ml.c5.24xlarge", "vcpuNum": 96 }, { "_defaultOrder": 29, "_isFastLaunch": true, "category": "Accelerated computing", "gpuNum": 1, "hideHardwareSpecs": false, "memoryGiB": 16, "name": "ml.g4dn.xlarge", "vcpuNum": 4 }, { "_defaultOrder": 30, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "hideHardwareSpecs": false, "memoryGiB": 32, "name": "ml.g4dn.2xlarge", "vcpuNum": 8 }, { "_defaultOrder": 31, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "hideHardwareSpecs": false, "memoryGiB": 64, "name": "ml.g4dn.4xlarge", "vcpuNum": 16 }, { "_defaultOrder": 32, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "hideHardwareSpecs": false, "memoryGiB": 128, "name": "ml.g4dn.8xlarge", "vcpuNum": 32 }, { "_defaultOrder": 33, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 4, "hideHardwareSpecs": false, "memoryGiB": 192, "name": "ml.g4dn.12xlarge", "vcpuNum": 48 }, { "_defaultOrder": 34, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "hideHardwareSpecs": false, "memoryGiB": 256, "name": "ml.g4dn.16xlarge", "vcpuNum": 64 }, { "_defaultOrder": 35, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "hideHardwareSpecs": false, "memoryGiB": 61, "name": "ml.p3.2xlarge", "vcpuNum": 8 }, { "_defaultOrder": 36, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 4, "hideHardwareSpecs": false, "memoryGiB": 244, "name": "ml.p3.8xlarge", "vcpuNum": 32 }, { "_defaultOrder": 37, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 8, "hideHardwareSpecs": false, "memoryGiB": 488, "name": "ml.p3.16xlarge", "vcpuNum": 64 }, { "_defaultOrder": 38, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 8, "hideHardwareSpecs": false, "memoryGiB": 768, "name": "ml.p3dn.24xlarge", "vcpuNum": 96 }, { "_defaultOrder": 39, "_isFastLaunch": false, "category": "Memory Optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 16, "name": "ml.r5.large", "vcpuNum": 2 }, { "_defaultOrder": 40, "_isFastLaunch": false, "category": "Memory Optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 32, "name": "ml.r5.xlarge", "vcpuNum": 4 }, { "_defaultOrder": 41, "_isFastLaunch": false, "category": "Memory Optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 64, "name": "ml.r5.2xlarge", "vcpuNum": 8 }, { "_defaultOrder": 42, "_isFastLaunch": false, "category": "Memory Optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 128, "name": "ml.r5.4xlarge", "vcpuNum": 16 }, { "_defaultOrder": 43, "_isFastLaunch": false, "category": "Memory Optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 256, "name": "ml.r5.8xlarge", "vcpuNum": 32 }, { "_defaultOrder": 44, "_isFastLaunch": false, "category": "Memory Optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 384, "name": "ml.r5.12xlarge", "vcpuNum": 48 }, { "_defaultOrder": 45, "_isFastLaunch": false, "category": "Memory Optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 512, "name": "ml.r5.16xlarge", "vcpuNum": 64 }, { "_defaultOrder": 46, "_isFastLaunch": false, "category": "Memory Optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 768, "name": "ml.r5.24xlarge", "vcpuNum": 96 }, { "_defaultOrder": 47, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "hideHardwareSpecs": false, "memoryGiB": 16, "name": "ml.g5.xlarge", "vcpuNum": 4 }, { "_defaultOrder": 48, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "hideHardwareSpecs": false, "memoryGiB": 32, "name": "ml.g5.2xlarge", "vcpuNum": 8 }, { "_defaultOrder": 49, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "hideHardwareSpecs": false, "memoryGiB": 64, "name": "ml.g5.4xlarge", "vcpuNum": 16 }, { "_defaultOrder": 50, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "hideHardwareSpecs": false, "memoryGiB": 128, "name": "ml.g5.8xlarge", "vcpuNum": 32 }, { "_defaultOrder": 51, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "hideHardwareSpecs": false, "memoryGiB": 256, "name": "ml.g5.16xlarge", "vcpuNum": 64 }, { "_defaultOrder": 52, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 4, "hideHardwareSpecs": false, "memoryGiB": 192, "name": "ml.g5.12xlarge", "vcpuNum": 48 }, { "_defaultOrder": 53, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 4, "hideHardwareSpecs": false, "memoryGiB": 384, "name": "ml.g5.24xlarge", "vcpuNum": 96 }, { "_defaultOrder": 54, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 8, "hideHardwareSpecs": false, "memoryGiB": 768, "name": "ml.g5.48xlarge", "vcpuNum": 192 }, { "_defaultOrder": 55, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 8, "hideHardwareSpecs": false, "memoryGiB": 1152, "name": "ml.p4d.24xlarge", "vcpuNum": 96 }, { "_defaultOrder": 56, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 8, "hideHardwareSpecs": false, "memoryGiB": 1152, "name": "ml.p4de.24xlarge", "vcpuNum": 96 }, { "_defaultOrder": 57, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 32, "name": "ml.trn1.2xlarge", "vcpuNum": 8 }, { "_defaultOrder": 58, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 512, "name": "ml.trn1.32xlarge", "vcpuNum": 128 }, { "_defaultOrder": 59, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 512, "name": "ml.trn1n.32xlarge", "vcpuNum": 128 } ], "instance_type": "ml.t3.medium", "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.14" } }, "nbformat": 4, "nbformat_minor": 5 }