From 3abc6d3bdb852520db0cc699c98aa49707c5ed0e Mon Sep 17 00:00:00 2001 From: Yi Yan Date: Mon, 28 Apr 2025 10:26:25 +0800 Subject: [PATCH 1/5] fix: update deployment function --- src/emd/cfn/ecs/template.yaml | 8 +++++++ src/emd/cfn/shared/ecs_cluster.yaml | 34 +++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/src/emd/cfn/ecs/template.yaml b/src/emd/cfn/ecs/template.yaml index 975e5aca..d92b15f8 100644 --- a/src/emd/cfn/ecs/template.yaml +++ b/src/emd/cfn/ecs/template.yaml @@ -386,6 +386,14 @@ Resources: - Type: forward TargetGroupArn: !Ref ServiceTargetGroup + ForceAPIRouterDeployment: + Type: Custom::ForceAPIRouterDeployment + DependsOn: Service + Properties: + ServiceToken: !Ref LambdaDeploymentHelperArn + # Change-trigger property intended to ensure this resource is updated when needed. NOTE(review): the value is the stack name, not a timestamp, and is presumably constant for a given stack; confirm it actually triggers updates + Timestamp: !Ref "AWS::StackName" + Outputs: Model: Description: Model ID used to generate the response. diff --git a/src/emd/cfn/shared/ecs_cluster.yaml b/src/emd/cfn/shared/ecs_cluster.yaml index 76f226c7..e0927a57 100644 --- a/src/emd/cfn/shared/ecs_cluster.yaml +++ b/src/emd/cfn/shared/ecs_cluster.yaml @@ -140,6 +140,8 @@ Resources: Action: - ecs:PutClusterCapacityProviders - ecs:DescribeClusters + - ecs:UpdateService + - ecs:DescribeServices - logs:CreateLogGroup - logs:CreateLogStream - logs:PutLogEvents @@ -239,6 +241,38 @@ Resources: response_data = {'DnsName': dns_name} cfnresponse.send(event, context, cfnresponse.SUCCESS, response_data) + + def force_api_router_deployment(event, context): + """ + Forces a new deployment for the APIRouterService. + This will restart the service with the latest task definition. 
+ """ + ecs_client = boto3.client('ecs') + cluster_name = os.environ['ECS_CLUSTER_NAME'] + service_name = "EMD-API-Router" + + try: + # Check if the service exists + response = ecs_client.describe_services( + cluster=cluster_name, + services=[service_name] + ) + + if not response['services'] or response['services'][0]['status'] != 'ACTIVE': + raise Exception(f"Service {service_name} not found or not active in cluster {cluster_name}") + + # Force a new deployment + ecs_client.update_service( + cluster=cluster_name, + service=service_name, + forceNewDeployment=True + ) + + response_data = {'Message': f"Forced new deployment for {service_name}"} + cfnresponse.send(event, context, cfnresponse.SUCCESS, response_data) + except Exception as e: + print(f"Error forcing deployment: {str(e)}") + cfnresponse.send(event, context, cfnresponse.FAILED, {'Error': str(e)}) def handler(event, context): print(event) From a7ba8efb2ba7f89471c1596389fc54d68ac9d9ea Mon Sep 17 00:00:00 2001 From: Yi Yan Date: Mon, 28 Apr 2025 10:29:23 +0800 Subject: [PATCH 2/5] fix: code cleanup --- src/emd/cfn/shared/ecs_cluster.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/emd/cfn/shared/ecs_cluster.yaml b/src/emd/cfn/shared/ecs_cluster.yaml index e0927a57..ea2d2472 100644 --- a/src/emd/cfn/shared/ecs_cluster.yaml +++ b/src/emd/cfn/shared/ecs_cluster.yaml @@ -241,7 +241,7 @@ Resources: response_data = {'DnsName': dns_name} cfnresponse.send(event, context, cfnresponse.SUCCESS, response_data) - + def force_api_router_deployment(event, context): """ Forces a new deployment for the APIRouterService. 
@@ -250,24 +250,24 @@ Resources: ecs_client = boto3.client('ecs') cluster_name = os.environ['ECS_CLUSTER_NAME'] service_name = "EMD-API-Router" - + try: # Check if the service exists response = ecs_client.describe_services( cluster=cluster_name, services=[service_name] ) - + if not response['services'] or response['services'][0]['status'] != 'ACTIVE': raise Exception(f"Service {service_name} not found or not active in cluster {cluster_name}") - + # Force a new deployment ecs_client.update_service( cluster=cluster_name, service=service_name, forceNewDeployment=True ) - + response_data = {'Message': f"Forced new deployment for {service_name}"} cfnresponse.send(event, context, cfnresponse.SUCCESS, response_data) except Exception as e: From 36434ce46eeccce8118b0f4addd1c4ea6681b205 Mon Sep 17 00:00:00 2001 From: Yi Yan Date: Mon, 28 Apr 2025 11:12:20 +0800 Subject: [PATCH 3/5] fix: retry for updating capacity provider service --- src/emd/cfn/shared/ecs_cluster.yaml | 92 +++++++++++++++++++---------- 1 file changed, 61 insertions(+), 31 deletions(-) diff --git a/src/emd/cfn/shared/ecs_cluster.yaml b/src/emd/cfn/shared/ecs_cluster.yaml index ea2d2472..3ea8b9a9 100644 --- a/src/emd/cfn/shared/ecs_cluster.yaml +++ b/src/emd/cfn/shared/ecs_cluster.yaml @@ -164,40 +164,70 @@ Resources: ecs_client = boto3.client('ecs') cluster_name = os.environ['ECS_CLUSTER_NAME'] capacity_provider_name = event['ResourceProperties']['CapacityProvider'] + + def try_update_with_retry(): + # Simple retry mechanism - try twice with a delay + try: + return _do_update() + except Exception as e: + if 'UpdateInProgressException' in str(e): + print("Cluster busy, waiting 30 seconds before retry...") + import time + time.sleep(30) + return _do_update() # Try once more + else: + raise # Re-raise if it's not the specific error we're handling + + def _do_update(): + cluster_info = ecs_client.describe_clusters(clusters=[cluster_name])['clusters'][0] + current_capacity_providers = 
cluster_info.get('capacityProviders', []) + + if capacity_provider_name not in current_capacity_providers: + current_capacity_providers.append(capacity_provider_name) + + return ecs_client.put_cluster_capacity_providers( + cluster=cluster_name, + capacityProviders=current_capacity_providers, + defaultCapacityProviderStrategy=[ + { + 'capacityProvider': capacity_provider_name, + 'weight': 1, + 'base': 0 + } + ] + ) + try: if event['RequestType'] in ['Create', 'Update']: - cluster_info = ecs_client.describe_clusters(clusters=[cluster_name])['clusters'][0] - current_capacity_providers = cluster_info.get('capacityProviders', []) - - if capacity_provider_name not in current_capacity_providers: - current_capacity_providers.append(capacity_provider_name) - - ecs_client.put_cluster_capacity_providers( - cluster=cluster_name, - capacityProviders=current_capacity_providers, - defaultCapacityProviderStrategy=[ - { - 'capacityProvider': capacity_provider_name, - 'weight': 1, - 'base': 0 - } - ] - ) + try_update_with_retry() elif event['RequestType'] == 'Delete': - # Retrieve current capacity providers - cluster_info = ecs_client.describe_clusters(clusters=[cluster_name])['clusters'][0] - current_capacity_providers = cluster_info.get('capacityProviders', []) - - # Remove only the specific capacity provider - updated_capacity_providers = [ - cp for cp in current_capacity_providers if cp != capacity_provider_name - ] - - ecs_client.put_cluster_capacity_providers( - cluster=cluster_name, - capacityProviders=updated_capacity_providers, - defaultCapacityProviderStrategy=[] - ) + def _do_delete(): + # Retrieve current capacity providers + cluster_info = ecs_client.describe_clusters(clusters=[cluster_name])['clusters'][0] + current_capacity_providers = cluster_info.get('capacityProviders', []) + + # Remove only the specific capacity provider + updated_capacity_providers = [ + cp for cp in current_capacity_providers if cp != capacity_provider_name + ] + + return 
ecs_client.put_cluster_capacity_providers( + cluster=cluster_name, + capacityProviders=updated_capacity_providers, + defaultCapacityProviderStrategy=[] + ) + + # Simple retry for delete operation too + try: + _do_delete() + except Exception as e: + if 'UpdateInProgressException' in str(e): + print("Cluster busy during delete, waiting 30 seconds before retry...") + import time + time.sleep(30) + _do_delete() # Try once more + else: + raise cfnresponse.send(event, context, cfnresponse.SUCCESS, {}) except Exception as e: cfnresponse.send(event, context, cfnresponse.FAILED, {'Error': str(e)}) From 6a5e6571f9fe5d7a0f90fc4aea282c3f98536c47 Mon Sep 17 00:00:00 2001 From: Yi Yan Date: Mon, 28 Apr 2025 11:26:24 +0800 Subject: [PATCH 4/5] chore: update gin --- src/emd/cfn/shared/openai_router/go.mod | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/emd/cfn/shared/openai_router/go.mod b/src/emd/cfn/shared/openai_router/go.mod index a116885d..18359f6e 100644 --- a/src/emd/cfn/shared/openai_router/go.mod +++ b/src/emd/cfn/shared/openai_router/go.mod @@ -6,7 +6,7 @@ toolchain go1.24.2 require ( github.com/aws/aws-sdk-go v1.54.0 - github.com/gin-gonic/gin v1.8.1 + github.com/gin-gonic/gin v1.9.1 ) require ( From c42d13135d199f2ccdb932d277ebd62f716f8e11 Mon Sep 17 00:00:00 2001 From: Yi Yan Date: Mon, 28 Apr 2025 11:51:25 +0800 Subject: [PATCH 5/5] fix: removed timeout parameter --- src/emd/cfn/codepipeline/template.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/src/emd/cfn/codepipeline/template.yaml b/src/emd/cfn/codepipeline/template.yaml index 564a83db..11a0e53e 100644 --- a/src/emd/cfn/codepipeline/template.yaml +++ b/src/emd/cfn/codepipeline/template.yaml @@ -356,7 +356,6 @@ Resources: TemplateConfiguration: BuildOutput::parameters.json Capabilities: CAPABILITY_IAM,CAPABILITY_NAMED_IAM RoleArn: !GetAtt CloudFormationServiceRole.Arn - TimeoutInMinutes: 45 InputArtifacts: - Name: BuildOutput RunOrder: 1