From 101ffe19649530fd6c1597b99feaefb9a3d2bcef Mon Sep 17 00:00:00 2001
From: notplants <@notplants>
Date: Tue, 16 Jun 2026 17:00:16 +0000
Subject: [PATCH] fix(db): make pg_upgrade migration idempotent & crash-safe

The postgres major-version migration in the db entrypoint was not safe to
re-run. If the container was killed mid-migration it could crash-loop
forever ("mkdir: cannot create directory .../old_data: File exists") or
silently initdb a fresh empty cluster over the live data once PG_VERSION
had been moved out of $PGDATA but before the in-progress marker was
written.

Replace the marker file with a state-driven guard keyed on the scratch
dirs: empty old_data/new_data means the run was interrupted before any
data moved, so discard and retry (idempotent); non-empty means data may
only live there, so stop for manual recovery.

Bump DB_ENTRYPOINT_VERSION v1->v2 so swarm picks up the new (immutable)
config.
---
 abra.sh                     |  2 +-
 entrypoint.postgres.sh.tmpl | 19 ++++++++++++-------
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/abra.sh b/abra.sh
index 8cac4ed..bb432ff 100644
--- a/abra.sh
+++ b/abra.sh
@@ -1,2 +1,2 @@
-export DB_ENTRYPOINT_VERSION=v1
+export DB_ENTRYPOINT_VERSION=v2
 export PG_BACKUP_VERSION=v2
diff --git a/entrypoint.postgres.sh.tmpl b/entrypoint.postgres.sh.tmpl
index fc69a6e..8174cda 100644
--- a/entrypoint.postgres.sh.tmpl
+++ b/entrypoint.postgres.sh.tmpl
@@ -2,16 +2,23 @@
 
 set -e
 
-MIGRATION_MARKER=$PGDATA/migration_in_progress
 OLDDATA=$PGDATA/old_data
 NEWDATA=$PGDATA/new_data
 
 echo "Running as $(id)"
 
-if [ -e $MIGRATION_MARKER ]; then
-    echo "FATAL: migration was started but did not complete in a previous run. manual recovery necessary"
-    exit 1
-fi
+# The migration uses $OLDDATA/$NEWDATA as scratch and removes them when it
+# finishes; a leftover *empty* one means a run was interrupted before any data
+# moved (data still intact at $PGDATA) so we clear it and retry, while a
+# *non-empty* one means data may live only there, so we stop for manual recovery.
+for scratch in $OLDDATA $NEWDATA; do
+    if [ -d "$scratch" ] && [ -n "$(ls -A "$scratch")" ]; then
+        echo "FATAL: $scratch exists and is not empty - a previous migration did not"
+        echo "complete and the data may only exist there. manual recovery necessary."
+        exit 1
+    fi
+done
+rm -rf $OLDDATA $NEWDATA
 
 if [ -f $PGDATA/PG_VERSION ]; then
     DATA_VERSION=$(cat $PGDATA/PG_VERSION)
@@ -27,7 +34,6 @@ if [ -f $PGDATA/PG_VERSION ]; then
         gosu postgres mkdir $OLDDATA $NEWDATA
         chmod 700 $OLDDATA $NEWDATA
         mv $PGDATA/* $OLDDATA/ || true
-        touch $MIGRATION_MARKER
         echo "running initdb"
         # abuse entrypoint script for initdb by making server error out
         gosu postgres bash -c "export PGDATA=$NEWDATA ; /usr/local/bin/docker-entrypoint.sh --invalid-arg || true"
@@ -38,7 +44,6 @@ if [ -f $PGDATA/PG_VERSION ]; then
         mv $NEWDATA/* $PGDATA
         rm -rf $OLDDATA
         rmdir $NEWDATA
-        rm $MIGRATION_MARKER
         echo "migration complete"
     fi
 fi