diff --git a/internal/db/database.go b/internal/db/database.go index b7e8dc1..24ed1be 100644 --- a/internal/db/database.go +++ b/internal/db/database.go @@ -4,13 +4,14 @@ import ( "context" "database/sql" _ "embed" // Required for go:embed + "errors" "fmt" "log/slog" "os" "path/filepath" "time" - _ "github.com/mattn/go-sqlite3" // SQLite driver + "github.com/mattn/go-sqlite3" // SQLite driver ) //go:embed schema.sql @@ -23,6 +24,8 @@ type DBConfig struct { MaxIdleConns int // Maximum number of idle connections ConnMaxLifetime time.Duration // Maximum lifetime of connections ConnMaxIdleTime time.Duration // Maximum idle time for connections + MaxRetries int // Maximum number of connection retry attempts + RetryDelay time.Duration // Delay between retry attempts } // DefaultDBConfig returns a DBConfig with sensible defaults for SQLite. @@ -33,17 +36,86 @@ func DefaultDBConfig(dsn string) *DBConfig { MaxIdleConns: 10, // Keep some connections idle for reuse ConnMaxLifetime: 30 * time.Minute, // Rotate connections every 30 minutes ConnMaxIdleTime: 5 * time.Minute, // Close idle connections after 5 minutes + MaxRetries: 3, // Retry connection attempts up to 3 times + RetryDelay: time.Second, // Wait 1 second between retry attempts } } -// NewDB initializes and returns a new database connection pool and runs migrations. -func NewDB(ctx context.Context, logger *slog.Logger, config *DBConfig) (*sql.DB, error) { +// isRetryableError checks if an error is worth retrying. +// This checks SQLite error codes for transient locking/busy conditions. +func isRetryableError(err error) bool { + if err == nil { + return false + } + + // Check if it's a sqlite3.Error + var sqliteErr sqlite3.Error + if errors.As(err, &sqliteErr) { + switch sqliteErr.Code { + case sqlite3.ErrBusy, // SQLITE_BUSY (5) - database is locked by another process + sqlite3.ErrLocked: // SQLITE_LOCKED (6) - database is locked within same connection + return true + } + + // Check extended error codes for more specific busy conditions + switch sqliteErr.ExtendedCode { + case sqlite3.ErrBusyRecovery, // SQLITE_BUSY_RECOVERY (261) - WAL recovery in progress + sqlite3.ErrBusySnapshot: // SQLITE_BUSY_SNAPSHOT (517) - snapshot conflict + return true + } + } + + // No fallback string matching - if it's not a proper SQLite error with + // the right error codes, we don't retry + return false +} + +// retryOperation executes an operation with retry logic for transient SQLite errors. +func retryOperation(ctx context.Context, logger *slog.Logger, config *DBConfig, operationName string, operation func() error) error { + for attempt := 0; attempt <= config.MaxRetries; attempt++ { + if attempt > 0 { + logger.Info("retrying operation", + slog.String("operation", operationName), + slog.Int("attempt", attempt), + slog.Int("max_retries", config.MaxRetries)) + + // Wait before retrying + select { + case <-ctx.Done(): + return fmt.Errorf("context canceled during %s retry: %w", operationName, ctx.Err()) + case <-time.After(config.RetryDelay): + } + } + + err := operation() + if err != nil { + if isRetryableError(err) && attempt < config.MaxRetries { + logger.Warn("retryable error during operation", + slog.String("operation", operationName), + slog.Any("error", err), + slog.Int("attempt", attempt)) + continue + } + return fmt.Errorf("failed to %s: %w", operationName, err) + } + + // Operation successful + return nil + } + + // This should never be reached due to the loop condition, but for safety + return fmt.Errorf("unexpected retry loop exit for %s", operationName) +} + +// openAndConfigureDB opens the database connection and configures the connection pool. +func openAndConfigureDB(config *DBConfig) (*sql.DB, error) { // Ensure the directory for the SQLite file exists dbDir := filepath.Dir(config.DSN) if err := os.MkdirAll(dbDir, 0755); err != nil { return nil, fmt.Errorf("failed to create database directory %s: %w", dbDir, err) } + // Open database connection (no retry needed - sql.Open rarely fails with transient errors) db, err := sql.Open("sqlite3", config.DSN+"?_foreign_keys=on") // Enable foreign key constraints if err != nil { return nil, fmt.Errorf("failed to open database: %w", err) @@ -55,9 +127,23 @@ func NewDB(ctx context.Context, logger *slog.Logger, config *DBConfig) (*sql.DB, db.SetConnMaxLifetime(config.ConnMaxLifetime) db.SetConnMaxIdleTime(config.ConnMaxIdleTime) - if err = db.PingContext(ctx); err != nil { + return db, nil +} + +// NewDB initializes and returns a new database connection pool and runs migrations. +func NewDB(ctx context.Context, logger *slog.Logger, config *DBConfig) (*sql.DB, error) { + db, err := openAndConfigureDB(config) + if err != nil { + return nil, err + } + + // Test the actual connection with retry logic + err = retryOperation(ctx, logger, config, "database connection", func() error { + return db.PingContext(ctx) + }) + if err != nil { db.Close() // Clean up connection on failure - return nil, fmt.Errorf("failed to ping database: %w", err) + return nil, err } logger.Info("database connection established", @@ -65,12 +151,16 @@ func NewDB(ctx context.Context, logger *slog.Logger, config *DBConfig) (*sql.DB, slog.Int("max_open_conns", config.MaxOpenConns), slog.Int("max_idle_conns", config.MaxIdleConns)) - // Execute schema. - if _, err := db.ExecContext(ctx, ddl); err != nil { + // Execute schema with retry logic + err = retryOperation(ctx, logger, config, "schema execution", func() error { + _, err := db.ExecContext(ctx, ddl) + return err + }) + if err != nil { db.Close() // Clean up connection on failure - return nil, fmt.Errorf("failed to execute DDL: %w", err) + return nil, err } - logger.Info("database schema applied") + logger.Info("database schema applied") return db, nil }