Skip to content
This repository was archived by the owner on Sep 20, 2025. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion src/emd/cfn/codepipeline/template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -356,7 +356,6 @@ Resources:
TemplateConfiguration: BuildOutput::parameters.json
Capabilities: CAPABILITY_IAM,CAPABILITY_NAMED_IAM
RoleArn: !GetAtt CloudFormationServiceRole.Arn
TimeoutInMinutes: 45
InputArtifacts:
- Name: BuildOutput
RunOrder: 1
8 changes: 8 additions & 0 deletions src/emd/cfn/ecs/template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,14 @@ Resources:
- Type: forward
TargetGroupArn: !Ref ServiceTargetGroup

ForceAPIRouterDeployment:
Type: Custom::ForceAPIRouterDeployment
DependsOn: Service
Properties:
ServiceToken: !Ref LambdaDeploymentHelperArn
# Adding a timestamp parameter to ensure this resource is updated when needed
Timestamp: !Ref "AWS::StackName"

Outputs:
Model:
Description: Model ID used to generate the response.
Expand Down
126 changes: 95 additions & 31 deletions src/emd/cfn/shared/ecs_cluster.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,8 @@ Resources:
Action:
- ecs:PutClusterCapacityProviders
- ecs:DescribeClusters
- ecs:UpdateService
- ecs:DescribeServices
- logs:CreateLogGroup
- logs:CreateLogStream
- logs:PutLogEvents
Expand All @@ -162,40 +164,70 @@ Resources:
ecs_client = boto3.client('ecs')
cluster_name = os.environ['ECS_CLUSTER_NAME']
capacity_provider_name = event['ResourceProperties']['CapacityProvider']

def try_update_with_retry():
# Simple retry mechanism - try twice with a delay
try:
return _do_update()
except Exception as e:
if 'UpdateInProgressException' in str(e):
print("Cluster busy, waiting 30 seconds before retry...")
import time
time.sleep(30)
return _do_update() # Try once more
else:
raise # Re-raise if it's not the specific error we're handling

def _do_update():
cluster_info = ecs_client.describe_clusters(clusters=[cluster_name])['clusters'][0]
current_capacity_providers = cluster_info.get('capacityProviders', [])

if capacity_provider_name not in current_capacity_providers:
current_capacity_providers.append(capacity_provider_name)

return ecs_client.put_cluster_capacity_providers(
cluster=cluster_name,
capacityProviders=current_capacity_providers,
defaultCapacityProviderStrategy=[
{
'capacityProvider': capacity_provider_name,
'weight': 1,
'base': 0
}
]
)

try:
if event['RequestType'] in ['Create', 'Update']:
cluster_info = ecs_client.describe_clusters(clusters=[cluster_name])['clusters'][0]
current_capacity_providers = cluster_info.get('capacityProviders', [])

if capacity_provider_name not in current_capacity_providers:
current_capacity_providers.append(capacity_provider_name)

ecs_client.put_cluster_capacity_providers(
cluster=cluster_name,
capacityProviders=current_capacity_providers,
defaultCapacityProviderStrategy=[
{
'capacityProvider': capacity_provider_name,
'weight': 1,
'base': 0
}
]
)
try_update_with_retry()
elif event['RequestType'] == 'Delete':
# Retrieve current capacity providers
cluster_info = ecs_client.describe_clusters(clusters=[cluster_name])['clusters'][0]
current_capacity_providers = cluster_info.get('capacityProviders', [])

# Remove only the specific capacity provider
updated_capacity_providers = [
cp for cp in current_capacity_providers if cp != capacity_provider_name
]

ecs_client.put_cluster_capacity_providers(
cluster=cluster_name,
capacityProviders=updated_capacity_providers,
defaultCapacityProviderStrategy=[]
)
def _do_delete():
# Retrieve current capacity providers
cluster_info = ecs_client.describe_clusters(clusters=[cluster_name])['clusters'][0]
current_capacity_providers = cluster_info.get('capacityProviders', [])

# Remove only the specific capacity provider
updated_capacity_providers = [
cp for cp in current_capacity_providers if cp != capacity_provider_name
]

return ecs_client.put_cluster_capacity_providers(
cluster=cluster_name,
capacityProviders=updated_capacity_providers,
defaultCapacityProviderStrategy=[]
)

# Simple retry for delete operation too
try:
_do_delete()
except Exception as e:
if 'UpdateInProgressException' in str(e):
print("Cluster busy during delete, waiting 30 seconds before retry...")
import time
time.sleep(30)
_do_delete() # Try once more
else:
raise
cfnresponse.send(event, context, cfnresponse.SUCCESS, {})
except Exception as e:
cfnresponse.send(event, context, cfnresponse.FAILED, {'Error': str(e)})
Expand Down Expand Up @@ -240,6 +272,38 @@ Resources:
response_data = {'DnsName': dns_name}
cfnresponse.send(event, context, cfnresponse.SUCCESS, response_data)

def force_api_router_deployment(event, context):
"""
Forces a new deployment for the APIRouterService.
This will restart the service with the latest task definition.
"""
ecs_client = boto3.client('ecs')
cluster_name = os.environ['ECS_CLUSTER_NAME']
service_name = "EMD-API-Router"

try:
# Check if the service exists
response = ecs_client.describe_services(
cluster=cluster_name,
services=[service_name]
)

if not response['services'] or response['services'][0]['status'] != 'ACTIVE':
raise Exception(f"Service {service_name} not found or not active in cluster {cluster_name}")

# Force a new deployment
ecs_client.update_service(
cluster=cluster_name,
service=service_name,
forceNewDeployment=True
)

response_data = {'Message': f"Forced new deployment for {service_name}"}
cfnresponse.send(event, context, cfnresponse.SUCCESS, response_data)
except Exception as e:
print(f"Error forcing deployment: {str(e)}")
cfnresponse.send(event, context, cfnresponse.FAILED, {'Error': str(e)})

def handler(event, context):
print(event)
print(context)
Expand Down
2 changes: 1 addition & 1 deletion src/emd/cfn/shared/openai_router/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ toolchain go1.24.2

require (
github.com/aws/aws-sdk-go v1.54.0
github.com/gin-gonic/gin v1.8.1
github.com/gin-gonic/gin v1.9.1
)

require (
Expand Down