fix(db): bump DB_ENTRYPOINT_VERSION to v3 so the entrypoint config reloads

The install-user fix changed the entrypoint content; swarm configs are immutable, so the config name (which embeds DB_ENTRYPOINT_VERSION) must change for a redeploy to pick up the new script.
fix(db): run pg_upgrade as the old cluster's real install user
2026-06-16 18:04:05 +00:00 · 2026-06-16 17:59:26 +00:00 · 2026-06-16 17:00:16 +00:00 · 2026-06-15 17:37:14 +00:00 · 2026-06-11 22:52:37 +00:00 · 2026-06-05 02:03:34 +00:00
6 changed files with 113 additions and 26 deletions
--- a/.drone.yml
+++ b/.drone.yml
@ -18,6 +18,23 @@ steps:
      STACK_NAME: discourse
      LETS_ENCRYPT_ENV: production
      SECRET_DB_PASSWORD_VERSION: v1
+      DB_ENTRYPOINT_VERSION: v1
 trigger:
  branch:
    - main
+---
+kind: pipeline
+name: generate recipe catalogue
+steps:
+  - name: release a new version
+    image: plugins/downstream
+    settings:
+      server: https://build.coopcloud.tech
+      token:
+        from_secret: drone_abra-bot_token
+      fork: true
+      repositories:
+        - toolshed/auto-recipes-catalogue-json
+
+trigger:
+  event: tag
--- a/README.md
+++ b/README.md
@ -7,7 +7,7 @@ A platform for community discussion
 <!-- metadata -->
 * **Category**: Apps
 * **Status**: 
-* **Image**: [`bitnami/discourse`](https://hub.docker.com/r/bitname/discourse)
+* **Image**: [`bitnami/discourse`](https://hub.docker.com/r/bitnami/discourse)
 * **Healthcheck**: yes
 * **Backups**: no
 * **Email**: yes
--- a/abra.sh
+++ b/abra.sh
@ -1 +1,2 @@
-export DB_ENTRYPOINT_VERSION=v1
+export DB_ENTRYPOINT_VERSION=v3
+export PG_BACKUP_VERSION=v2
--- a/compose.yml
+++ b/compose.yml
@ -3,7 +3,7 @@ version: "3.8"

 services:
  app:
-    image: bitnami/discourse:3.1.1
+    image: bitnamilegacy/discourse:3.5.0
    networks:
      - proxy
      - internal
@ -43,16 +43,16 @@ services:
        #- "traefik.http.routers.${STACK_NAME}.middlewares=${STACK_NAME}-redirect"
        #- "traefik.http.middlewares.${STACK_NAME}-redirect.headers.SSLForceHost=true"
        #- "traefik.http.middlewares.${STACK_NAME}-redirect.headers.SSLHost=${DOMAIN}"
-        - "coop-cloud.${STACK_NAME}.version=0.6.0+3.1.1"
-    # healthcheck:
-    #   test: ["CMD", "curl", "-f", "http://localhost:3000"]
-    #   interval: 30s
-    #   timeout: 10s
-    #   retries: 10
-    #   start_period: 1m
+        - "coop-cloud.${STACK_NAME}.version=0.10.0+3.5.0"
+    healthcheck:
+      test: "ruby -e \"require 'uri'; require 'net/http'; uri = URI('http://localhost:3000/srv/status'); res = Net::HTTP.get_response(uri); if res.is_a?(Net::HTTPSuccess) then exit (0) else exit (1) end\""
+      interval: 30s
+      timeout: 10s
+      retries: 6
+      start_period: 20m

  db:
-    image: postgres:13
+    image: pgvector/pgvector:pg17
    networks:
      - internal
    secrets:
@ -63,6 +63,9 @@ services:
      - source: db_entrypoint
        target: /docker-entrypoint.sh
        mode: 0555
+      - source: pg_backup
+        target: /pg_backup.sh
+        mode: 0555
    entrypoint: /docker-entrypoint.sh
    environment:
      - POSTGRES_HOST_AUTH_METHOD=trust
@ -72,20 +75,21 @@ services:
    deploy:
      labels:
        backupbot.backup: "true"
-        backupbot.backup.pre-hook: "bash -c 'PGPASSWORD=$$(cat $${POSTGRES_PASSWORD_FILE}) pg_dump -U $${POSTGRES_USER} $${POSTGRES_DB} > /tmp/backup.sql'"
-        backupbot.backup.post-hook: "rm -rf /tmp/backup.sql"
-        backupbot.backup.path: "/tmp/backup.sql"
+        backupbot.backup.pre-hook: "/pg_backup.sh backup"
+        backupbot.backup.volumes.postgresql_data.path: "backup.sql"
+        backupbot.restore.post-hook: "/pg_backup.sh restore"

  redis:
-    image: redis:7.2-alpine
+    image: redis:8.8-alpine
    networks:
      - internal
    volumes:
      - 'redis_data:/data'

  sidekiq:
-    image: bitnami/discourse:3.1.1
+    image: bitnamilegacy/discourse:3.5.0
    networks:
+      - proxy
      - internal
    depends_on:
      - discourse
@ -131,3 +135,6 @@ configs:
    name: ${STACK_NAME}_db_entrypoint_${DB_ENTRYPOINT_VERSION}
    file: entrypoint.postgres.sh.tmpl
    template_driver: golang
+  pg_backup:
+    name: ${STACK_NAME}_pg_backup_${PG_BACKUP_VERSION}
+    file: pg_backup.sh
--- a/entrypoint.postgres.sh.tmpl
+++ b/entrypoint.postgres.sh.tmpl
@ -2,16 +2,23 @@

 set -e

-MIGRATION_MARKER=$PGDATA/migration_in_progress
 OLDDATA=$PGDATA/old_data
 NEWDATA=$PGDATA/new_data

 echo "Running as $(id)"

-if [ -e $MIGRATION_MARKER ]; then
-  echo "FATAL: migration was started but did not complete in a previous run. manual recovery necessary"
-  exit 1
-fi
+# The migration uses $OLDDATA/$NEWDATA as scratch and removes them when it
+# finishes; a leftover *empty* one means a run was interrupted before any data
+# moved (data still intact at $PGDATA) so we clear it and retry, while a
+# *non-empty* one means data may live only there, so we stop for manual recovery.
+for scratch in $OLDDATA $NEWDATA; do
+  if [ -d "$scratch" ] && [ -n "$(ls -A "$scratch")" ]; then
+    echo "FATAL: $scratch exists and is not empty - a previous migration did not"
+    echo "complete and the data may only exist there. manual recovery necessary."
+    exit 1
+  fi
+done
+rm -rf $OLDDATA $NEWDATA

 if [ -f $PGDATA/PG_VERSION ]; then
  DATA_VERSION=$(cat $PGDATA/PG_VERSION)
@ -23,22 +30,33 @@ if [ -f $PGDATA/PG_VERSION ]; then
    apt-get update && apt-get install -y --no-install-recommends \
      postgresql-$DATA_VERSION \
      && rm -rf /var/lib/apt/lists/*
+    # pg_upgrade must run as the old cluster's bootstrap superuser (the "install
+    # user", oid 10), and the new cluster must be initialised with that same
+    # user. It is not necessarily $POSTGRES_USER (e.g. clusters created with the
+    # default "postgres" superuser and a separate app role), so read it from the
+    # old cluster: briefly start it and ask, connecting as the app role we know.
+    PGBIN=/usr/lib/postgresql/$DATA_VERSION/bin
+    gosu postgres $PGBIN/pg_ctl -D $PGDATA -w \
+      -o "-c listen_addresses= -c unix_socket_directories=/tmp" start
+    INSTALL_USER=$(gosu postgres psql -h /tmp -U "$POSTGRES_USER" -d postgres -tAc \
+      "select rolname from pg_roles where oid = 10")
+    gosu postgres $PGBIN/pg_ctl -D $PGDATA -w stop
+    echo "old cluster install user: $INSTALL_USER"
    echo "shuffling around"
    gosu postgres mkdir $OLDDATA $NEWDATA
    chmod 700 $OLDDATA $NEWDATA
    mv $PGDATA/* $OLDDATA/ || true
-    touch $MIGRATION_MARKER
    echo "running initdb"
-    # abuse entrypoint script for initdb by making server error out
-    gosu postgres bash -c "export PGDATA=$NEWDATA ; /usr/local/bin/docker-entrypoint.sh --invalid-arg || true"
+    # abuse entrypoint script for initdb by making server error out; initialise
+    # the new cluster with the same superuser as the old one so pg_upgrade matches
+    gosu postgres bash -c "export PGDATA=$NEWDATA POSTGRES_USER=$INSTALL_USER ; /usr/local/bin/docker-entrypoint.sh --invalid-arg || true"
    echo "running pg_upgrade"
    cd /tmp
-    gosu postgres pg_upgrade --link -b /usr/lib/postgresql/$DATA_VERSION/bin -d $OLDDATA -D $NEWDATA -U $POSTGRES_USER
+    gosu postgres pg_upgrade --link -b /usr/lib/postgresql/$DATA_VERSION/bin -d $OLDDATA -D $NEWDATA -U $INSTALL_USER
    cp $OLDDATA/pg_hba.conf $NEWDATA/
    mv $NEWDATA/* $PGDATA
    rm -rf $OLDDATA
    rmdir $NEWDATA
-    rm $MIGRATION_MARKER
    echo "migration complete"
  fi
 fi
--- a/pg_backup.sh
+++ b/pg_backup.sh
@ -0,0 +1,44 @@
+#!/bin/bash
+
+# Postgres backup/restore hook for the discourse `db` service.
+
+set -e
+
+BACKUP_FILE='/var/lib/postgresql/data/backup.sql'
+export PGPASSWORD=$(cat "${POSTGRES_PASSWORD_FILE:-/run/secrets/db_password}")
+DB_USER="${POSTGRES_USER:-discourse}"
+DB_NAME="${POSTGRES_DB:-discourse}"
+
+function backup {
+  pg_dump -U "$DB_USER" "$DB_NAME" | gzip > "$BACKUP_FILE"
+}
+
+# Restore $DB_NAME from the gzipped dump at $BACKUP_FILE: lock out non-local
+# clients, drop and recreate the database, then replay the dump in a single
+# transaction. The original pg_hba.conf is put back on every exit path.
+function restore {
+  cd /var/lib/postgresql/data/
+
+  # Put the original pg_hba.conf back. Safe to call more than once: without a
+  # backup copy it does nothing, so a repeated call can never truncate
+  # pg_hba.conf through a failed `cat` of a file that is already gone.
+  restore_hba() {
+    if [ -f pg_hba.conf.bak ]; then
+      # redirecting into the existing file keeps its owner and mode
+      cat pg_hba.conf.bak > pg_hba.conf
+      rm -f pg_hba.conf.bak
+      su postgres -c 'pg_ctl reload'
+    fi
+  }
+  # Install the handlers before touching pg_hba.conf so that no failure in
+  # between can leave the lock-down in place. INT/TERM become a regular exit,
+  # which runs the EXIT handler once instead of letting the script carry on.
+  trap restore_hba EXIT
+  trap 'exit 130' INT
+  trap 'exit 143' TERM
+
+  # Block all non-local connections so the running discourse app + sidekiq cannot reconnect and
+  # interfere with the drop/recreate/reimport. Restored on exit.
+  # An existing .bak is the real original left by a restore that was killed
+  # before it could clean up, so it must not be overwritten.
+  [ -f pg_hba.conf.bak ] || cp pg_hba.conf pg_hba.conf.bak
+  echo 'local all all trust' > pg_hba.conf
+  su postgres -c 'pg_ctl reload'
+
+  # terminate any lingering local sessions before recreate
+  # see https://stackoverflow.com/questions/5108876/kill-a-postgresql-session-connection
+  psql -U "$DB_USER" -d postgres -c \
+    "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname='${DB_NAME}' AND pid<>pg_backend_pid();"
+
+  # drop database and then recreate it; IF EXISTS keeps the restore re-runnable
+  # after an earlier attempt that failed between the drop and the recreate
+  psql -U "$DB_USER" -d postgres -c "DROP DATABASE IF EXISTS ${DB_NAME} WITH (FORCE);"
+  createdb -U "$DB_USER" "$DB_NAME"
+
+  # reimport data in one transaction, stopping at the first SQL error
+  gunzip -c "$BACKUP_FILE" | psql -U "$DB_USER" -d "$DB_NAME" -1 -v ON_ERROR_STOP=1 -f -
+}
+
+$@
Author	SHA1	Message	Date
notplants	bd5f181737	fix(db): bump DB_ENTRYPOINT_VERSION to v3 so the entrypoint config reloads The install-user fix changed the entrypoint content; swarm configs are immutable, so the config name (which embeds DB_ENTRYPOINT_VERSION) must change for a redeploy to pick up the new script.	2026-06-16 18:04:05 +00:00
notplants	57f5ee2531	fix(db): run pg_upgrade as the old cluster's real install user pg_upgrade must run as the old cluster's bootstrap superuser (oid 10), and the new cluster must be initialised with that same user, otherwise it fails the "database user is the install user" consistency check. The install user is not necessarily $POSTGRES_USER: clusters created with the default "postgres" superuser plus a separate app role (e.g. discourse) are common. Detect it from the old cluster by briefly starting it and reading pg_roles (oid = 10) as the known app role, then use it for both the new cluster's initdb and the pg_upgrade -U argument.	2026-06-16 17:59:26 +00:00
notplants	101ffe1964	fix(db): make pg_upgrade migration idempotent & crash-safe The postgres major-version migration in the db entrypoint was not safe to re-run. If the container was killed mid-migration it could crash-loop forever ("mkdir: cannot create directory .../old_data: File exists") or silently initdb a fresh empty cluster over the live data once PG_VERSION had been moved out of $PGDATA but before the in-progress marker was written. Replace the marker file with a state-driven guard keyed on the scratch dirs: empty old_data/new_data means the run was interrupted before any data moved, so discard and retry (idempotent); non-empty means data may only live there, so stop for manual recovery. Bump DB_ENTRYPOINT_VERSION v1->v2 so swarm picks up the new (immutable) config.	2026-06-16 17:00:16 +00:00
notplants	433ce12dbc	Merge pull request 'chore: upgrade to 0.10.0+3.5.0' (#2 ) from upgrade-0.8.0+3.5.0 into main Reviewed-on: https://git.autonomic.zone/recipe-maintainers/discourse/pulls/2	2026-06-15 17:37:14 +00:00
autonomic-bot	b7d8a244d7	chore: upgrade to 0.10.0+3.5.0 (redis 8.0->8.8-alpine)	2026-06-11 22:52:37 +00:00
autonomic-bot	7ae7b0f76e	chore: upgrade to 0.9.0+3.5.0	2026-06-05 02:03:34 +00:00
notplants	b0f9ae743a	fix(db): switch postgres image to pgvector/pgvector:pg17 + bump PG_BACKUP_VERSION Some checks failed continuous-integration/drone/pr Build is failing Details continuous-integration/drone/tag Build is passing Details	2026-06-02 20:07:06 +00:00
notplants	5091fd999e	improved comments Some checks failed continuous-integration/drone/pr Build is failing Details	2026-06-02 19:10:27 +00:00
notplants	ec7bbdf786	fix(backup): add pg_backup.sh + proper backup/restore hooks, 20m start_period	2026-06-02 19:10:27 +00:00
notplants	0f873433ba	chore: upgrade to 0.8.0+3.5.0	2026-06-02 19:10:27 +00:00
decentral1se	7d53d4ec39	Merge pull request 'Update README.md corrected url to bitnami/discourse' (#12 ) from jeppebundsgaard/discourse:main into main Some checks failed continuous-integration/drone/push Build is failing Details Reviewed-on: #12 Reviewed-by: decentral1se <decentral1se@noreply.git.coopcloud.tech>	2025-12-28 09:32:18 +00:00
jeppebundsgaard	ee2381c3b7	Update README.md Some checks failed continuous-integration/drone/pr Build is failing Details	2025-12-28 02:04:01 +00:00
Cassowary	63d3801060	Update .drone.yml All checks were successful continuous-integration/drone/push Build is passing Details	2025-01-08 10:09:12 -08:00
3wc	fee61883ed	Fix CI, add auto recipe catalogue generation All checks were successful continuous-integration/drone/push Build is passing Details	2024-10-02 15:44:50 -04:00
3wc	eb96de947b	chore: publish 0.7.0+3.3.1 release Some checks failed continuous-integration/drone/push Build is failing Details	2024-10-02 15:42:31 -04:00
knoflook	ddda5da6bc	chore: publish new release Some checks failed continuous-integration/drone/push Build is failing Details	2023-10-20 15:54:13 +02:00
3wc	304468b8f4	chore: publish 0.6.2+3.1.1 release Some checks failed continuous-integration/drone/push Build is failing Details	2023-10-19 11:04:19 +01:00
3wc	0ccf1d7a6c	Fix healthcheck, sidekiq on proxy network Some checks failed continuous-integration/drone/push Build is failing Details	2023-10-19 11:03:42 +01:00
3wc	1049c27c35	chore: publish 0.6.1+3.1.1 release Some checks failed continuous-integration/drone/push Build is failing Details	2023-10-08 18:41:20 +01:00
3wc	03dc80d073	Add healthcheck for app container Some checks failed continuous-integration/drone/push Build is failing Details	2023-10-08 18:40:49 +01:00