Bulk Model Retraining

Retrains multiple models in a single execution using a Map state that processes up to three models concurrently. Supports scheduled monthly retraining runs across all production models.

{
  "Comment": "Batch retraining of multiple models concurrently using a Map state with MaxConcurrency 3. Supports scheduled monthly retraining runs across all production models.",
  "StartAt": "BatchRetrainModels",
  "States": {
    "BatchRetrainModels": {
      "Type": "Map",
      "Next": "BatchRetrainingComplete",
      "ItemsPath": "$.models",
      "MaxConcurrency": 3,
      "ItemProcessor": {
        "ProcessorConfig": { "Mode": "INLINE" },
        "StartAt": "BatchIngestData",
        "States": {
          "BatchIngestData": {
            "Type": "Task",
            "Resource": "${IngestTrainingDataFunctionArn}",
            "Next": "BatchPreprocessFeatures",
            "Retry": [
              {
                "ErrorEquals": ["States.ALL"],
                "MaxAttempts": 2,
                "IntervalSeconds": 3,
                "BackoffRate": 2
              }
            ]
          },
          "BatchPreprocessFeatures": {
            "Type": "Task",
            "Resource": "${PreprocessFeaturesFunctionArn}",
            "Next": "BatchLaunchTrainingJob",
            "Retry": [
              {
                "ErrorEquals": ["States.ALL"],
                "MaxAttempts": 2,
                "IntervalSeconds": 3,
                "BackoffRate": 2
              }
            ]
          },
          "BatchLaunchTrainingJob": {
            "Type": "Task",
            "Resource": "${LaunchTrainingJobFunctionArn}",
            "Next": "BatchEvaluateModel",
            "Retry": [
              {
                "ErrorEquals": ["States.ALL"],
                "MaxAttempts": 2,
                "IntervalSeconds": 5,
                "BackoffRate": 2
              }
            ]
          },
          "BatchEvaluateModel": {
            "Type": "Task",
            "Resource": "${EvaluateModelFunctionArn}",
            "Next": "BatchParallelRegisterAndDeploy",
            "Retry": [
              {
                "ErrorEquals": ["States.ALL"],
                "MaxAttempts": 2,
                "IntervalSeconds": 3,
                "BackoffRate": 2
              }
            ]
          },
          "BatchParallelRegisterAndDeploy": {
            "Type": "Parallel",
            "Next": "BatchNotifyStatus",
            "Branches": [
              {
                "StartAt": "BatchRegisterModel",
                "States": {
                  "BatchRegisterModel": {
                    "Type": "Task",
                    "Resource": "${RegisterModelFunctionArn}",
                    "End": true,
                    "Retry": [
                      {
                        "ErrorEquals": ["States.ALL"],
                        "MaxAttempts": 2,
                        "IntervalSeconds": 3,
                        "BackoffRate": 2
                      }
                    ]
                  }
                }
              },
              {
                "StartAt": "BatchDeployEndpoint",
                "States": {
                  "BatchDeployEndpoint": {
                    "Type": "Task",
                    "Resource": "${DeployModelEndpointFunctionArn}",
                    "End": true,
                    "Retry": [
                      {
                        "ErrorEquals": ["States.ALL"],
                        "MaxAttempts": 2,
                        "IntervalSeconds": 5,
                        "BackoffRate": 2
                      }
                    ]
                  }
                }
              }
            ]
          },
          "BatchNotifyStatus": {
            "Type": "Task",
            "Resource": "${NotifyTrainingStatusFunctionArn}",
            "End": true,
            "Retry": [
              {
                "ErrorEquals": ["States.ALL"],
                "MaxAttempts": 2,
                "IntervalSeconds": 2,
                "BackoffRate": 2
              }
            ]
          }
        }
      }
    },
    "BatchRetrainingComplete": {
      "Type": "Succeed"
    }
  }
}
JSON
Expand
100%

AI teams can use patterns like this to build reliable, compliant, and scalable automation for model retraining pipelines, and can test and refine these flows locally with Thrubit to reduce cloud cost and speed up iteration.

Free Trial