Skip to content

Commit 266832a

Browse files
authored
fix(ci): avoid testinfra failure due to loss of ssh connection (#1764)
* fix(ci): replace EC2 Instance Connect with cloud-init SSH key injection

  Tests often fail because of problems with SSH access to the instance. EC2 Instance Connect pushes a temporary SSH key, which is then available for only 60 seconds. Recently, errors have often occurred when the SSH key is sent to the instance, resulting in a timeout. We replace runtime SSH key injection via the EC2 Instance Connect API with a cloud-init configuration that adds the SSH public key during instance initialization. Note that we are still using EC2 Instance Connect to create the SSH key pair, but we are no longer using it to push the key to the instance.

* fix(ci): terminate only the ec2 instance of the matrix job

  For the moment, the first matrix job that finishes terminates all the EC2 instances running in the current workflow run. This is not what we want. This change terminates only the instance that is running the matrix job.

* fix(ci): init.sh completion check should not be blocking

  Add an optional timeout parameter to run_ssh_command() so that the init completion status is checked with a 5-second timeout.
1 parent bed7a3e commit 266832a

File tree

5 files changed

+39
-34
lines changed

5 files changed

+39
-34
lines changed

.github/workflows/ami-release-nix-single.yml

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,9 @@ jobs:
4949
trusted-public-keys = nix-postgres-artifacts:dGZlQOvKcNEjvT7QEAJbcV6b6uk7VF/hWMjhYleiaLI=% cache.nixos.org-1:6NCHdD59X431o0gWypbMrAURkbJ16ZPMQFGspcDShjY=
5050
5151
- name: Set PostgreSQL version environment variable
52-
run: echo "POSTGRES_MAJOR_VERSION=${{ github.event.inputs.postgres_version }}" >> $GITHUB_ENV
52+
run: |
53+
echo "POSTGRES_MAJOR_VERSION=${{ github.event.inputs.postgres_version }}" >> $GITHUB_ENV
54+
echo "EXECUTION_ID=${{ github.run_id }}-${{ matrix.postgres_version }}" >> $GITHUB_ENV
5355
5456
- name: Generate common-nix.vars.pkr.hcl
5557
run: |
@@ -65,7 +67,7 @@ jobs:
6567
run: |
6668
GIT_SHA=${{ steps.get_sha.outputs.sha }}
6769
nix run github:supabase/postgres/${GIT_SHA}#packer -- init amazon-arm64-nix.pkr.hcl
68-
nix run github:supabase/postgres/${GIT_SHA}#packer -- build -var "git-head-version=${GIT_SHA}" -var "packer-execution-id=${GITHUB_RUN_ID}" -var-file="development-arm.vars.pkr.hcl" -var-file="common-nix.vars.pkr.hcl" -var "ansible_arguments=-e postgresql_major=${POSTGRES_MAJOR_VERSION}" amazon-arm64-nix.pkr.hcl
70+
nix run github:supabase/postgres/${GIT_SHA}#packer -- build -var "git-head-version=${GIT_SHA}" -var "packer-execution-id=${EXECUTION_ID}" -var-file="development-arm.vars.pkr.hcl" -var-file="common-nix.vars.pkr.hcl" -var "ansible_arguments=-e postgresql_major=${POSTGRES_MAJOR_VERSION}" amazon-arm64-nix.pkr.hcl
6971
7072
- name: Build AMI stage 2
7173
env:
@@ -74,7 +76,7 @@ jobs:
7476
GIT_SHA=${{ steps.get_sha.outputs.sha }}
7577
nix run github:supabase/postgres/${GIT_SHA}#packer -- init stage2-nix-psql.pkr.hcl
7678
POSTGRES_MAJOR_VERSION=${{ env.POSTGRES_MAJOR_VERSION }}
77-
nix run github:supabase/postgres/${GIT_SHA}#packer -- build -var "git_sha=${GIT_SHA}" -var "git-head-version=${GIT_SHA}" -var "packer-execution-id=${GITHUB_RUN_ID}" -var "postgres_major_version=${POSTGRES_MAJOR_VERSION}" -var-file="development-arm.vars.pkr.hcl" -var-file="common-nix.vars.pkr.hcl" stage2-nix-psql.pkr.hcl
79+
nix run github:supabase/postgres/${GIT_SHA}#packer -- build -var "git_sha=${GIT_SHA}" -var "git-head-version=${GIT_SHA}" -var "packer-execution-id=${EXECUTION_ID}" -var "postgres_major_version=${POSTGRES_MAJOR_VERSION}" -var-file="development-arm.vars.pkr.hcl" -var-file="common-nix.vars.pkr.hcl" stage2-nix-psql.pkr.hcl
7880
7981
- name: Grab release version
8082
id: process_release_version
@@ -153,10 +155,10 @@ jobs:
153155
- name: Cleanup resources after build
154156
if: ${{ always() }}
155157
run: |
156-
aws ec2 describe-instances --filters "Name=tag:packerExecutionId,Values=${GITHUB_RUN_ID}" --query "Reservations[].Instances[].InstanceId" --output text | xargs -r aws ec2 terminate-instances --instance-ids
158+
aws ec2 describe-instances --filters "Name=tag:packerExecutionId,Values=${EXECUTION_ID}" --query "Reservations[].Instances[].InstanceId" --output text | xargs -r aws ec2 terminate-instances --instance-ids
157159
158160
- name: Cleanup resources on build cancellation
159161
if: ${{ cancelled() }}
160162
run: |
161-
aws ec2 describe-instances --filters "Name=tag:packerExecutionId,Values=${GITHUB_RUN_ID}" --query "Reservations[].Instances[].InstanceId" --output text | xargs -r aws ec2 terminate-instances --instance-ids
163+
aws ec2 describe-instances --filters "Name=tag:packerExecutionId,Values=${EXECUTION_ID}" --query "Reservations[].Instances[].InstanceId" --output text | xargs -r aws ec2 terminate-instances --instance-ids
162164

.github/workflows/ami-release-nix.yml

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,9 @@ jobs:
7777
fi
7878
7979
- name: Set PostgreSQL version environment variable
80-
run: echo "POSTGRES_MAJOR_VERSION=${{ matrix.postgres_version }}" >> $GITHUB_ENV
80+
run: |
81+
echo "POSTGRES_MAJOR_VERSION=${{ matrix.postgres_version }}" >> $GITHUB_ENV
82+
echo "EXECUTION_ID=${{ github.run_id }}-${{ matrix.postgres_version }}" >> $GITHUB_ENV
8183
8284
- name: Generate common-nix.vars.pkr.hcl
8385
run: |
@@ -94,7 +96,7 @@ jobs:
9496
GIT_SHA=${{github.sha}}
9597
nix run github:supabase/postgres/${GIT_SHA}#packer -- init amazon-arm64-nix.pkr.hcl
9698
# why is postgresql_major defined here instead of where the _three_ other postgresql_* variables are defined?
97-
nix run github:supabase/postgres/${GIT_SHA}#packer -- build -var "git-head-version=${GIT_SHA}" -var "packer-execution-id=${GITHUB_RUN_ID}" -var-file="development-arm.vars.pkr.hcl" -var-file="common-nix.vars.pkr.hcl" -var "ansible_arguments=-e postgresql_major=${POSTGRES_MAJOR_VERSION}" amazon-arm64-nix.pkr.hcl
99+
nix run github:supabase/postgres/${GIT_SHA}#packer -- build -var "git-head-version=${GIT_SHA}" -var "packer-execution-id=${EXECUTION_ID}" -var-file="development-arm.vars.pkr.hcl" -var-file="common-nix.vars.pkr.hcl" -var "ansible_arguments=-e postgresql_major=${POSTGRES_MAJOR_VERSION}" amazon-arm64-nix.pkr.hcl
98100
99101
- name: Build AMI stage 2
100102
env:
@@ -103,7 +105,7 @@ jobs:
103105
GIT_SHA=${{github.sha}}
104106
nix run github:supabase/postgres/${GIT_SHA}#packer -- init stage2-nix-psql.pkr.hcl
105107
POSTGRES_MAJOR_VERSION=${{ env.POSTGRES_MAJOR_VERSION }}
106-
nix run github:supabase/postgres/${GIT_SHA}#packer -- build -var "git_sha=${GIT_SHA}" -var "git-head-version=${GIT_SHA}" -var "packer-execution-id=${GITHUB_RUN_ID}" -var "postgres_major_version=${POSTGRES_MAJOR_VERSION}" -var-file="development-arm.vars.pkr.hcl" -var-file="common-nix.vars.pkr.hcl" stage2-nix-psql.pkr.hcl
108+
nix run github:supabase/postgres/${GIT_SHA}#packer -- build -var "git_sha=${GIT_SHA}" -var "git-head-version=${GIT_SHA}" -var "packer-execution-id=${EXECUTION_ID}" -var "postgres_major_version=${POSTGRES_MAJOR_VERSION}" -var-file="development-arm.vars.pkr.hcl" -var-file="common-nix.vars.pkr.hcl" stage2-nix-psql.pkr.hcl
107109
108110
- name: Grab release version
109111
id: process_release_version
@@ -182,9 +184,9 @@ jobs:
182184
- name: Cleanup resources after build
183185
if: ${{ always() }}
184186
run: |
185-
aws ec2 describe-instances --filters "Name=tag:packerExecutionId,Values=${GITHUB_RUN_ID}" --query "Reservations[].Instances[].InstanceId" --output text | xargs -r aws ec2 terminate-instances --instance-ids
187+
aws ec2 describe-instances --filters "Name=tag:packerExecutionId,Values=${EXECUTION_ID}" --query "Reservations[].Instances[].InstanceId" --output text | xargs -r aws ec2 terminate-instances --instance-ids
186188
187189
- name: Cleanup resources on build cancellation
188190
if: ${{ cancelled() }}
189191
run: |
190-
aws ec2 describe-instances --filters "Name=tag:packerExecutionId,Values=${GITHUB_RUN_ID}" --query "Reservations[].Instances[].InstanceId" --output text | xargs -r aws ec2 terminate-instances --instance-ids
192+
aws ec2 describe-instances --filters "Name=tag:packerExecutionId,Values=${EXECUTION_ID}" --query "Reservations[].Instances[].InstanceId" --output text | xargs -r aws ec2 terminate-instances --instance-ids

.github/workflows/qemu-image-build.yml

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,9 @@ jobs:
6565
sudo chmod 666 /dev/kvm
6666
6767
- name: Set PostgreSQL version environment variable
68-
run: echo "POSTGRES_MAJOR_VERSION=${{ matrix.postgres_version }}" >> $GITHUB_ENV
68+
run: |
69+
echo "POSTGRES_MAJOR_VERSION=${{ matrix.postgres_version }}" >> $GITHUB_ENV
70+
echo "EXECUTION_ID=${{ github.run_id }}-${{ matrix.postgres_version }}" >> $GITHUB_ENV
6971
7072
- name: Generate common-nix.vars.pkr.hcl
7173
run: |
@@ -155,9 +157,9 @@ jobs:
155157
- name: Cleanup resources after build
156158
if: ${{ always() }}
157159
run: |
158-
aws ec2 describe-instances --filters "Name=tag:packerExecutionId,Values=${GITHUB_RUN_ID}" --query "Reservations[].Instances[].InstanceId" --output text | xargs -r aws ec2 terminate-instances --instance-ids
160+
aws ec2 describe-instances --filters "Name=tag:packerExecutionId,Values=${EXECUTION_ID}" --query "Reservations[].Instances[].InstanceId" --output text | xargs -r aws ec2 terminate-instances --instance-ids
159161
160162
- name: Cleanup resources on build cancellation
161163
if: ${{ cancelled() }}
162164
run: |
163-
aws ec2 describe-instances --filters "Name=tag:packerExecutionId,Values=${GITHUB_RUN_ID}" --query "Reservations[].Instances[].InstanceId" --output text | xargs -r aws ec2 terminate-instances --instance-ids
165+
aws ec2 describe-instances --filters "Name=tag:packerExecutionId,Values=${EXECUTION_ID}" --query "Reservations[].Instances[].InstanceId" --output text | xargs -r aws ec2 terminate-instances --instance-ids

.github/workflows/testinfra-ami-build.yml

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,9 @@ jobs:
9696
run: echo "random_string=$(openssl rand -hex 8)" >> $GITHUB_OUTPUT
9797

9898
- name: Set PostgreSQL version environment variable
99-
run: echo "POSTGRES_MAJOR_VERSION=${{ matrix.postgres_version }}" >> $GITHUB_ENV
99+
run: |
100+
echo "POSTGRES_MAJOR_VERSION=${{ matrix.postgres_version }}" >> $GITHUB_ENV
101+
echo "EXECUTION_ID=${{ github.run_id }}-${{ matrix.postgres_version }}" >> $GITHUB_ENV
100102
101103
- name: Generate common-nix.vars.pkr.hcl
102104
run: |
@@ -110,13 +112,13 @@ jobs:
110112
run: |
111113
GIT_SHA=${{github.sha}}
112114
nix run github:supabase/postgres/${GIT_SHA}#packer -- init amazon-arm64-nix.pkr.hcl
113-
nix run github:supabase/postgres/${GIT_SHA}#packer -- build -var "git-head-version=${GIT_SHA}" -var "packer-execution-id=${GITHUB_RUN_ID}" -var-file="development-arm.vars.pkr.hcl" -var-file="common-nix.vars.pkr.hcl" -var "ansible_arguments=" -var "postgres-version=${{ steps.random.outputs.random_string }}" -var "region=ap-southeast-1" -var 'ami_regions=["ap-southeast-1"]' -var "force-deregister=true" -var "ansible_arguments=-e postgresql_major=${POSTGRES_MAJOR_VERSION}" amazon-arm64-nix.pkr.hcl
115+
nix run github:supabase/postgres/${GIT_SHA}#packer -- build -var "git-head-version=${GIT_SHA}" -var "packer-execution-id=${EXECUTION_ID}" -var-file="development-arm.vars.pkr.hcl" -var-file="common-nix.vars.pkr.hcl" -var "ansible_arguments=" -var "postgres-version=${{ steps.random.outputs.random_string }}" -var "region=ap-southeast-1" -var 'ami_regions=["ap-southeast-1"]' -var "force-deregister=true" -var "ansible_arguments=-e postgresql_major=${POSTGRES_MAJOR_VERSION}" amazon-arm64-nix.pkr.hcl
114116
115117
- name: Build AMI stage 2
116118
run: |
117119
GIT_SHA=${{github.sha}}
118120
nix run github:supabase/postgres/${GIT_SHA}#packer -- init stage2-nix-psql.pkr.hcl
119-
nix run github:supabase/postgres/${GIT_SHA}#packer -- build -var "git-head-version=${GIT_SHA}" -var "packer-execution-id=${GITHUB_RUN_ID}" -var "postgres_major_version=${POSTGRES_MAJOR_VERSION}" -var-file="development-arm.vars.pkr.hcl" -var-file="common-nix.vars.pkr.hcl" -var "postgres-version=${{ steps.random.outputs.random_string }}" -var "region=ap-southeast-1" -var 'ami_regions=["ap-southeast-1"]' -var "force-deregister=true" -var "git_sha=${GITHUB_SHA}" stage2-nix-psql.pkr.hcl
121+
nix run github:supabase/postgres/${GIT_SHA}#packer -- build -var "git-head-version=${GIT_SHA}" -var "packer-execution-id=${EXECUTION_ID}" -var "postgres_major_version=${POSTGRES_MAJOR_VERSION}" -var-file="development-arm.vars.pkr.hcl" -var-file="common-nix.vars.pkr.hcl" -var "postgres-version=${{ steps.random.outputs.random_string }}" -var "region=ap-southeast-1" -var 'ami_regions=["ap-southeast-1"]' -var "force-deregister=true" -var "git_sha=${GITHUB_SHA}" stage2-nix-psql.pkr.hcl
120122
121123
- name: Run tests
122124
timeout-minutes: 10
@@ -130,12 +132,12 @@ jobs:
130132
- name: Cleanup resources on build cancellation
131133
if: ${{ cancelled() }}
132134
run: |
133-
aws ec2 --region ap-southeast-1 describe-instances --filters "Name=tag:packerExecutionId,Values=${GITHUB_RUN_ID}" --query "Reservations[].Instances[].InstanceId" --output text | xargs -r aws ec2 terminate-instances --region ap-southeast-1 --instance-ids
135+
aws ec2 --region ap-southeast-1 describe-instances --filters "Name=tag:packerExecutionId,Values=${EXECUTION_ID}" --query "Reservations[].Instances[].InstanceId" --output text | xargs -r aws ec2 terminate-instances --region ap-southeast-1 --instance-ids
134136
135137
- name: Cleanup resources after build
136138
if: ${{ always() }}
137139
run: |
138-
aws ec2 --region ap-southeast-1 describe-instances --filters "Name=tag:testinfra-run-id,Values=${GITHUB_RUN_ID}" --query "Reservations[].Instances[].InstanceId" --output text | xargs -r aws ec2 terminate-instances --region ap-southeast-1 --instance-ids || true
140+
aws ec2 --region ap-southeast-1 describe-instances --filters "Name=tag:testinfra-run-id,Values=${EXECUTION_ID}" --query "Reservations[].Instances[].InstanceId" --output text | xargs -r aws ec2 terminate-instances --region ap-southeast-1 --instance-ids || true
139141
140142
- name: Cleanup AMIs
141143
if: always()

testinfra/test_ami_nix.py

Lines changed: 13 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,11 @@
99
from ec2instanceconnectcli.EC2InstanceConnectLogger import EC2InstanceConnectLogger
1010
from ec2instanceconnectcli.EC2InstanceConnectKey import EC2InstanceConnectKey
1111
from time import sleep
12-
import subprocess
1312
import paramiko
1413

15-
# if GITHUB_RUN_ID is not set, use a default value that includes the user and hostname
14+
# if EXECUTION_ID is not set, use a default value that includes the user and hostname
1615
RUN_ID = os.environ.get(
17-
"GITHUB_RUN_ID",
16+
"EXECUTION_ID",
1817
"unknown-ci-run-"
1918
+ os.environ.get("USER", "unknown-user")
2019
+ "@"
@@ -206,9 +205,9 @@ def get_ssh_connection(instance_ip, ssh_identity_file, max_retries=10):
206205
sleep(5)
207206

208207

209-
def run_ssh_command(ssh, command):
208+
def run_ssh_command(ssh, command, timeout=None):
210209
"""Run a command over the established SSH connection."""
211-
stdin, stdout, stderr = ssh.exec_command(command)
210+
stdin, stdout, stderr = ssh.exec_command(command, timeout=timeout)
212211
exit_code = stdout.channel.recv_exit_status()
213212
return {
214213
'succeeded': exit_code == 0,
@@ -233,6 +232,10 @@ def host():
233232
def gzip_then_base64_encode(s: str) -> str:
234233
return base64.b64encode(gzip.compress(s.encode())).decode()
235234

235+
# Create temporary SSH key pair
236+
ec2logger = EC2InstanceConnectLogger(debug=False)
237+
temp_key = EC2InstanceConnectKey(ec2logger.get_logger())
238+
236239
instance = list(
237240
ec2.create_instances(
238241
BlockDeviceMappings=[
@@ -279,6 +282,10 @@ def gzip_then_base64_encode(s: str) -> str:
279282
- 'bash init.sh "staging"'
280283
- 'touch /var/lib/init-complete'
281284
- 'rm -rf /tmp/*'
285+
users:
286+
- name: ubuntu
287+
ssh_authorized_keys:
288+
- {temp_key.get_pub_key()}
282289
""",
283290
TagSpecifications=[
284291
{
@@ -297,16 +304,6 @@ def gzip_then_base64_encode(s: str) -> str:
297304
# Increase wait time before starting health checks
298305
sleep(30) # Wait for 30 seconds to allow services to start
299306

300-
ec2logger = EC2InstanceConnectLogger(debug=False)
301-
temp_key = EC2InstanceConnectKey(ec2logger.get_logger())
302-
ec2ic = boto3.client("ec2-instance-connect", region_name="ap-southeast-1")
303-
response = ec2ic.send_ssh_public_key(
304-
InstanceId=instance.id,
305-
InstanceOSUser="ubuntu",
306-
SSHPublicKey=temp_key.get_pub_key(),
307-
)
308-
assert response["Success"]
309-
310307
# Wait for instance to have public IP
311308
while not instance.public_ip_address:
312309
logger.warning("waiting for ip to be available")
@@ -333,7 +330,7 @@ def gzip_then_base64_encode(s: str) -> str:
333330
attempt = 0
334331
while attempt < max_attempts:
335332
try:
336-
result = run_ssh_command(ssh, "test -f /var/lib/init-complete")
333+
result = run_ssh_command(ssh, "test -f /var/lib/init-complete", timeout=5)
337334
if result['succeeded']:
338335
logger.info("init.sh has completed")
339336
break

0 commit comments

Comments
 (0)