#!/usr/bin/env bash
#
# Test suite for pg_scribe validation ordering bugs
#
# This test suite verifies that pg_scribe validates all prerequisites
# BEFORE making any state changes, to prevent corrupted/orphaned chains.
#
# Tests cover:
#   1. cmd_new_chain with non-existent replication slot
#   2. cmd_new_chain --start with non-existent replication slot
#   3. cmd_start reading the slot name from chain metadata
#   4. cmd_new_chain with an unknown compression method
#   5. cmd_new_chain preserving the slot name in metadata
#
# Requirements: a reachable PostgreSQL server with wal_level=logical, the
# psql client, `timeout`, and an executable pg_scribe at ../scripts/pg_scribe
# relative to this file. PGUSER may be set to override the default role.
#

set -euo pipefail

# Colors for test output (expanded by printf '%b')
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[0;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m' # No Color

# Test configuration
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
PG_SCRIBE="$SCRIPT_DIR/scripts/pg_scribe"
TEST_DIR="/tmp/pg_scribe_validation_test_$$"
TEST_DB_PREFIX="pg_scribe_val_$$"
PGUSER="${PGUSER:-postgres}"

# Test counters (maintained by run_test, one increment per test function)
TESTS_RUN=0
TESTS_PASSED=0
TESTS_FAILED=0

# Cleanup tracking
DATABASES_TO_CLEANUP=()
SLOTS_TO_CLEANUP=() # entries are "dbname:slot"

#
# Logging functions
#
# These only print. Pass/fail accounting is done by run_test from each test's
# return status, so a test that emits several failure lines counts once.
#

log_test() {
    printf '%b\n' "${BLUE}TEST:${NC} $*"
}

log_pass() {
    printf '%b\n' "${GREEN}PASS:${NC} $*"
}

log_fail() {
    printf '%b\n' "${RED}FAIL:${NC} $*"
}

log_info() {
    printf '%b\n' "${YELLOW}INFO:${NC} $*"
}

#
# Helper functions
#

# Run psql against database $1 in tuples-only/unaligned/quiet mode; remaining
# arguments are passed through to psql.
run_psql() {
    local dbname="$1"
    shift
    psql -U "$PGUSER" -d "$dbname" -tAq "$@"
}

# Execute SQL $2 in database $1. Best effort: stderr is discarded and the
# function always succeeds.
query_db() {
    local dbname="$1"
    local query="$2"
    run_psql "$dbname" -c "$query" 2>/dev/null || true
}

# (Re)create database $1 and register it for cleanup.
# Returns non-zero if CREATE DATABASE fails.
create_test_db() {
    local dbname="$1"

    log_info "Creating test database: $dbname"

    # Drop if exists
    psql -U "$PGUSER" -d postgres -c "DROP DATABASE IF EXISTS $dbname;" &>/dev/null || true

    # Register before creating; cleanup uses DROP ... IF EXISTS, so a failed
    # create is harmless there.
    DATABASES_TO_CLEANUP+=("$dbname")

    # Create database
    psql -U "$PGUSER" -d postgres -c "CREATE DATABASE $dbname;" &>/dev/null
}

# Terminate other connections to database $1 and drop it (best effort).
# shellcheck disable=SC2317  # Function called from cleanup trap handler
drop_test_db() {
    local dbname="$1"

    log_info "Dropping test database: $dbname"

    # Terminate connections
    psql -U "$PGUSER" -d postgres -c "
        SELECT pg_terminate_backend(pid)
        FROM pg_stat_activity
        WHERE datname = '$dbname'
          AND pid <> pg_backend_pid();
    " &>/dev/null || true

    # Drop database
    psql -U "$PGUSER" -d postgres -c "DROP DATABASE IF EXISTS $dbname;" &>/dev/null || true
}

# Create table $2 (serial primary key, name, created_at) in database $1.
create_table_with_pk() {
    local dbname="$1"
    local table="$2"

    query_db "$dbname" "
        CREATE TABLE $table (
            id SERIAL PRIMARY KEY,
            name TEXT,
            created_at TIMESTAMP DEFAULT now()
        );
    "
}

# Initialize a backup directory (creates replication slot and initial backups).
# Arguments: $1 database, $2 backup directory, $3 slot name.
# Returns the exit status of `pg_scribe --init`.
init_backup_system() {
    local dbname="$1"
    local backup_dir="$2"
    local slot="$3"

    mkdir -p "$backup_dir" || return 1

    # Register first so a partially completed --init still gets its slot
    # dropped; the cleanup query is a no-op when the slot does not exist.
    SLOTS_TO_CLEANUP+=("$dbname:$slot")

    "$PG_SCRIBE" --init -d "$dbname" -f "$backup_dir" -S "$slot" -U "$PGUSER" &>/dev/null
}

# Print the number of chain-* directories directly under backup directory $1.
# Prints 0 when the directory is missing or unreadable.
count_chains() {
    local backup_dir="$1"
    local count

    # With pipefail a failing find makes the assignment fail; wc still
    # reports what it saw, so the status is deliberately ignored.
    count=$(find "$backup_dir" -maxdepth 1 -type d -name 'chain-*' 2>/dev/null | wc -l) || true

    # Arithmetic expansion strips the padding some wc implementations emit.
    echo "$((count))"
}

# Check if a chain directory was created
chain_dir_exists() {
    local backup_dir="$1"
    [[ $(count_chains "$backup_dir") -gt 0 ]]
}

# Build the common fixture for a test: database, one table, optional seed
# data, and an initialized backup directory.
# Arguments: $1 database, $2 backup directory, $3 slot, $4 table name,
#            $5 optional SQL executed after the table is created.
# Returns non-zero (after logging a failure) when the database cannot be
# created or pg_scribe --init fails, so a test never evaluates its
# assertions against a fixture that does not exist.
prepare_fixture() {
    local dbname="$1"
    local backup_dir="$2"
    local slot="$3"
    local table="$4"
    local seed_sql="${5:-}"

    if ! create_test_db "$dbname"; then
        log_fail "Setup failed: could not create database $dbname"
        return 1
    fi

    create_table_with_pk "$dbname" "$table"
    if [[ -n "$seed_sql" ]]; then
        query_db "$dbname" "$seed_sql"
    fi

    if ! init_backup_system "$dbname" "$backup_dir" "$slot"; then
        log_fail "Setup failed: pg_scribe --init failed for $dbname"
        return 1
    fi
}

# Run test function $1 and record the outcome from its return status.
# Calling it as an `if` condition keeps errexit from aborting the whole
# suite when a command inside the test fails.
run_test() {
    local test_fn="$1"

    TESTS_RUN=$((TESTS_RUN + 1))
    if "$test_fn"; then
        TESTS_PASSED=$((TESTS_PASSED + 1))
    else
        TESTS_FAILED=$((TESTS_FAILED + 1))
    fi
}

#
# Test cases
#
# Each test returns 0 after log_pass and 1 after log_fail.
#

test_new_chain_validates_slot_exists() {
    log_test "cmd_new_chain should validate slot exists BEFORE creating chain"

    local dbname="${TEST_DB_PREFIX}_slot1"
    local backup_dir="$TEST_DIR/slot1"
    local real_slot="test_slot_real"
    local fake_slot="nonexistent_slot"

    # Setup: Initialize with real slot
    prepare_fixture "$dbname" "$backup_dir" "$real_slot" "users" \
        "INSERT INTO users (name) VALUES ('Alice');" || return 1

    # Count chains before (should be 1 from init)
    local chains_before
    chains_before=$(count_chains "$backup_dir")

    # Try to create new chain with non-existent slot
    local exit_code=0
    local output
    output=$("$PG_SCRIBE" --new-chain -d "$dbname" -f "$backup_dir" -S "$fake_slot" -U "$PGUSER" 2>&1) || exit_code=$?

    # Should fail with slot error (exit code 3)
    if [[ $exit_code -ne 3 ]]; then
        log_fail "Expected exit code 3 (slot error), got $exit_code"
        echo "Output: $output"
        return 1
    fi

    # CRITICAL: Should NOT have created a new chain directory
    local chains_after
    chains_after=$(count_chains "$backup_dir")

    if [[ $chains_after -ne $chains_before ]]; then
        log_fail "CRITICAL: Created orphaned chain before validating slot!"
        log_fail "Chains before: $chains_before, after: $chains_after"
        return 1
    fi

    log_pass "Validates slot exists before creating chain"
    return 0
}

test_new_chain_start_validates_slot_exists() {
    log_test "cmd_new_chain --start should validate slot BEFORE creating chain"

    local dbname="${TEST_DB_PREFIX}_slot2"
    local backup_dir="$TEST_DIR/slot2"
    local real_slot="test_slot_real2"
    local fake_slot="nonexistent_slot2"

    # Setup: Initialize with real slot
    prepare_fixture "$dbname" "$backup_dir" "$real_slot" "products" || return 1

    # Count chains before
    local chains_before
    chains_before=$(count_chains "$backup_dir")

    # Try to create new chain with --start and wrong slot
    # Use timeout to prevent hanging if it tries to exec pg_recvlogical
    local exit_code=0
    local output
    output=$(timeout 10s "$PG_SCRIBE" --new-chain --start -d "$dbname" -f "$backup_dir" -S "$fake_slot" -U "$PGUSER" 2>&1) || exit_code=$?

    # Check what happened
    local chains_after
    chains_after=$(count_chains "$backup_dir")

    # If command timed out (exit 124), it means it started pg_recvlogical
    # which means it created the chain first - THIS IS THE BUG
    if [[ $exit_code -eq 124 ]]; then
        log_fail "CRITICAL: Command hung (tried to start pg_recvlogical)"
        log_fail "This means it created the chain BEFORE validating the slot!"
        log_fail "Chains before: $chains_before, after: $chains_after"

        # Kill any lingering pg_recvlogical
        pkill -f "pg_recvlogical.*$backup_dir" 2>/dev/null || true
        rm -f "$backup_dir/.pg_scribe.pid" 2>/dev/null || true

        return 1
    fi

    # Should fail with slot error (exit code 3)
    if [[ $exit_code -ne 3 ]]; then
        log_fail "Expected exit code 3 (slot error), got $exit_code"
        echo "Output: $output"

        # Check if orphaned chain was created
        if [[ $chains_after -ne $chains_before ]]; then
            log_fail "Also created orphaned chain!"
        fi

        return 1
    fi

    # CRITICAL: Should NOT have created a new chain directory
    if [[ $chains_after -ne $chains_before ]]; then
        log_fail "CRITICAL: Created orphaned chain before validating slot!"
        log_fail "This is the bug that hit the user!"
        log_fail "Chains before: $chains_before, after: $chains_after"
        return 1
    fi

    log_pass "Validates slot exists before creating chain (with --start)"
    return 0
}

test_start_reads_slot_from_metadata() {
    log_test "cmd_start should read slot from metadata (not accept -S parameter)"

    local dbname="${TEST_DB_PREFIX}_meta"
    local backup_dir="$TEST_DIR/meta"
    local real_slot="test_slot_meta"
    local pidfile="$backup_dir/.pg_scribe.pid"

    # Setup: Initialize with real slot
    prepare_fixture "$dbname" "$backup_dir" "$real_slot" "orders" || return 1

    # Start should work without -S flag (reads from metadata)
    # Use timeout in case something goes wrong
    local exit_code=0
    local output
    output=$(timeout 5s "$PG_SCRIBE" --start -d "$dbname" -f "$backup_dir" -U "$PGUSER" 2>&1) || exit_code=$?

    # Should have started successfully (exit 124 = timeout = streaming started)
    if [[ $exit_code -ne 124 ]]; then
        log_fail "Expected streaming to start (timeout), got exit code $exit_code"
        echo "Output: $output"
        return 1
    fi

    # Should have created pidfile
    if [[ ! -f "$pidfile" ]]; then
        log_fail "Pidfile not created"
        return 1
    fi

    # Clean up streaming process
    local pid
    pid=$(cat "$pidfile" 2>/dev/null) || pid=""
    if [[ -n "$pid" ]]; then
        kill -TERM "$pid" 2>/dev/null || true
    fi
    rm -f "$pidfile"

    # Verify it logged the correct slot from metadata.
    # A here-string is used instead of `echo | grep -q`: under pipefail the
    # early exit of grep -q can SIGPIPE the writer and fail the pipeline.
    if ! grep -qF -- "$real_slot" <<<"$output"; then
        log_fail "Output should show slot from metadata: $real_slot"
        echo "Output: $output"
        return 1
    fi

    log_pass "Reads slot from metadata correctly"
    return 0
}

test_new_chain_validates_compression_tool() {
    log_test "cmd_new_chain should validate compression tool exists BEFORE backup"

    local dbname="${TEST_DB_PREFIX}_compress"
    local backup_dir="$TEST_DIR/compress"
    local slot="test_slot_compress"

    # Setup
    prepare_fixture "$dbname" "$backup_dir" "$slot" "data_table" || return 1

    # Count chains before
    local chains_before
    chains_before=$(count_chains "$backup_dir")

    # Try to use a fake compression method
    local exit_code=0
    local output
    # shellcheck disable=SC2034  # output is captured only to keep the run quiet
    output=$("$PG_SCRIBE" --new-chain -d "$dbname" -f "$backup_dir" -Z totally_fake_compression -U "$PGUSER" 2>&1) || exit_code=$?

    # Should fail with validation error (exit code 5) or backup error (exit code 4)
    # The important thing is it should NOT create a chain directory first
    if [[ $exit_code -eq 0 ]]; then
        log_fail "Should have failed with invalid compression method"
        return 1
    fi

    # CRITICAL: Should NOT have created a new chain directory
    local chains_after
    chains_after=$(count_chains "$backup_dir")

    if [[ $chains_after -ne $chains_before ]]; then
        log_fail "CRITICAL: Created orphaned chain before validating compression!"
        log_fail "Chains before: $chains_before, after: $chains_after"
        return 1
    fi

    log_pass "Validates compression method before creating chain"
    return 0
}

test_new_chain_metadata_slot_consistency() {
    log_test "cmd_new_chain should preserve slot name in metadata"

    local dbname="${TEST_DB_PREFIX}_consistency"
    local backup_dir="$TEST_DIR/consistency"
    local slot="test_slot_consistency"

    # Setup
    prepare_fixture "$dbname" "$backup_dir" "$slot" "items" || return 1

    # Create a new chain (should succeed)
    sleep 1 # Ensure different timestamp
    if ! "$PG_SCRIBE" --new-chain -d "$dbname" -f "$backup_dir" -S "$slot" -U "$PGUSER" &>/dev/null; then
        log_fail "New chain creation failed"
        return 1
    fi

    # Get latest chain
    local latest_chain
    latest_chain=$(find "$backup_dir" -maxdepth 1 -type d -name 'chain-*' 2>/dev/null | sort | tail -1)

    # Verify metadata has correct slot name
    local metadata_slot
    metadata_slot=$(grep '"replication_slot"' "$latest_chain/metadata.json" | cut -d'"' -f4)

    if [[ "$metadata_slot" != "$slot" ]]; then
        log_fail "Metadata slot mismatch: expected '$slot', got '$metadata_slot'"
        return 1
    fi

    log_pass "Metadata preserves slot name correctly"
    return 0
}

#
# Cleanup
#

# Send SIGTERM to every process recorded in a .pg_scribe.pid file under
# TEST_DIR and wait up to ~2 seconds for each to exit, so the replication
# slots are released before cleanup tries to drop them.
# shellcheck disable=SC2317  # Function called via trap handler
stop_streaming_processes() {
    [[ -d "$TEST_DIR" ]] || return 0

    local pidfile pid
    while IFS= read -r -d '' pidfile; do
        pid=$(cat "$pidfile" 2>/dev/null) || continue
        # Ignore empty or garbage pidfiles rather than handing them to kill
        [[ "$pid" =~ ^[0-9]+$ ]] || continue
        kill -TERM "$pid" 2>/dev/null || continue

        for _ in {1..10}; do
            kill -0 "$pid" 2>/dev/null || break
            sleep 0.2
        done
    done < <(find "$TEST_DIR" -name '.pg_scribe.pid' -print0 2>/dev/null)
}

# Release everything the tests created. Order matters: streaming processes
# are stopped first because an active replication slot cannot be dropped,
# then slots, then databases, then the scratch directory.
# shellcheck disable=SC2317  # Function called via trap handler
cleanup() {
    log_info "Cleaning up test resources..."

    # Stop any lingering streaming processes
    stop_streaming_processes

    local entry dbname slot

    # Drop replication slots. The length guard avoids an "unbound variable"
    # error from expanding an empty array under `set -u` on bash < 4.4.
    if ((${#SLOTS_TO_CLEANUP[@]} > 0)); then
        for entry in "${SLOTS_TO_CLEANUP[@]}"; do
            dbname="${entry%%:*}"
            slot="${entry#*:}"

            psql -U "$PGUSER" -d "$dbname" -c "
                SELECT pg_drop_replication_slot('$slot')
                FROM pg_replication_slots
                WHERE slot_name = '$slot';
            " &>/dev/null || true
        done
    fi

    # Drop databases
    if ((${#DATABASES_TO_CLEANUP[@]} > 0)); then
        for dbname in "${DATABASES_TO_CLEANUP[@]}"; do
            drop_test_db "$dbname"
        done
    fi

    # Remove test directory
    if [[ -d "$TEST_DIR" ]]; then
        rm -rf -- "${TEST_DIR:?}"
    fi

    log_info "Cleanup complete"
}

#
# Main test runner
#

main() {
    echo "========================================"
    echo "pg_scribe Validation Ordering Tests"
    echo "========================================"
    echo ""
    echo "These tests verify that pg_scribe validates"
    echo "all prerequisites BEFORE making state changes"
    echo ""

    # Verify pg_scribe exists
    if [[ ! -x "$PG_SCRIBE" ]]; then
        echo "ERROR: pg_scribe not found or not executable: $PG_SCRIBE"
        exit 1
    fi

    # Verify PostgreSQL is running
    if ! psql -U "$PGUSER" -d postgres -c "SELECT 1;" &>/dev/null; then
        echo "ERROR: Cannot connect to PostgreSQL"
        exit 1
    fi

    # Verify wal_level is logical
    local wal_level
    wal_level=$(psql -U "$PGUSER" -d postgres -tAq -c "SHOW wal_level;")
    if [[ "$wal_level" != "logical" ]]; then
        echo "ERROR: wal_level must be 'logical', currently: $wal_level"
        echo "Update ~/.pgenv/pgsql/data/postgresql.conf and restart PostgreSQL"
        exit 1
    fi

    # Create test directory
    mkdir -p "$TEST_DIR"

    # Set up cleanup trap. Only EXIT runs cleanup; INT/TERM just exit (with
    # the conventional 128+signal status), which in turn fires the EXIT trap.
    # Trapping the signals with cleanup directly would resume the test run
    # after the handler returned and then clean up a second time.
    trap cleanup EXIT
    trap 'exit 130' INT
    trap 'exit 143' TERM

    echo "Running tests..."
    echo ""

    # Run all tests (run_test shields the suite from set -e and tallies results)
    run_test test_new_chain_validates_slot_exists
    run_test test_new_chain_start_validates_slot_exists
    run_test test_start_reads_slot_from_metadata
    run_test test_new_chain_validates_compression_tool
    run_test test_new_chain_metadata_slot_consistency

    # Summary
    echo ""
    echo "========================================"
    echo "Test Results"
    echo "========================================"
    echo "Tests run: $TESTS_RUN"
    printf '%b\n' "Tests passed: ${GREEN}$TESTS_PASSED${NC}"
    printf '%b\n' "Tests failed: ${RED}$TESTS_FAILED${NC}"
    echo ""

    if [[ $TESTS_FAILED -eq 0 ]]; then
        printf '%b\n' "${GREEN}All tests passed!${NC}"
        exit 0
    else
        printf '%b\n' "${RED}Some tests failed!${NC}"
        echo ""
        echo "This is EXPECTED before fixing the bugs."
        echo "These failures demonstrate the validation ordering problems."
        exit 1
    fi
}

main "$@"