3 # pg_scribe - Incremental SQL backup system for PostgreSQL
5 # This script provides a unified CLI for managing PostgreSQL backups
6 # using logical replication and plain SQL format.
16 EXIT_CONNECTION_ERROR=2
19 EXIT_VALIDATION_ERROR=5
23 DEFAULT_SLOT="pg_scribe"
25 DEFAULT_HOST="localhost"
26 DEFAULT_STATUS_INTERVAL=10
27 DEFAULT_FSYNC_INTERVAL=10
32 HOST="${PGHOST:-$DEFAULT_HOST}"
33 PORT="${PGPORT:-$DEFAULT_PORT}"
34 USERNAME="${PGUSER:-${USER:-}}"
37 STATUS_INTERVAL="$DEFAULT_STATUS_INTERVAL"
38 FSYNC_INTERVAL="$DEFAULT_FSYNC_INTERVAL"
51 # Color output support
52 if [[ "${PG_COLOR:-auto}" == "always" ]] || [[ "${PG_COLOR:-auto}" == "auto" && -t 2 ]]; then
68 # Logging functions (output to stderr)
70 echo -e "${BLUE}INFO:${RESET} $*" >&2
74 echo -e "${GREEN}SUCCESS:${RESET} $*" >&2
78 echo -e "${YELLOW}WARNING:${RESET} $*" >&2
82 echo -e "${RED}ERROR:${RESET} $*" >&2
86 echo -e "${BOLD}==>${RESET} $*" >&2
92 pg_scribe - Incremental SQL backup system for PostgreSQL
95 pg_scribe --init [OPTIONS]
96 pg_scribe --start [OPTIONS]
97 pg_scribe --rotate-diff [OPTIONS]
98 pg_scribe --new-chain [OPTIONS]
99 pg_scribe --restore [OPTIONS]
100 pg_scribe --status [OPTIONS]
104 Actions (exactly one required):
105 --init Initialize backup system (create first chain)
106 --start Start streaming incremental backups
107 --stop Stop active streaming process
108 --rotate-diff Rotate differential file within active chain
109 --new-chain Create new chain with fresh base backup
110 --restore Restore from backups
111 --status Check replication slot and chain inventory
112 -V, --version Print version and exit
113 -?, --help Show this help and exit
116 -d, --dbname=DBNAME Database name (can be connection string)
117 -h, --host=HOSTNAME Database server host (default: $DEFAULT_HOST)
118 -p, --port=PORT Database server port (default: $DEFAULT_PORT)
119 -U, --username=NAME Database user (default: \$PGUSER or \$USER)
120 -w, --no-password Never prompt for password
121 -W, --password Force password prompt
124 -v, --verbose Enable verbose mode
127 -f, --backup-dir=DIR Backup output directory (required)
128 -S, --slot=SLOTNAME Replication slot name (default: $DEFAULT_SLOT)
129 --if-not-exists Do not error if backup directory already initialized
130 --force Skip validation and force initialization
133 -f, --backup-dir=DIR Backup directory containing chains (required)
134 -s, --status-interval=SECS Status update interval (default: $DEFAULT_STATUS_INTERVAL)
135 -F, --fsync-interval=SECS Fsync interval (default: $DEFAULT_FSYNC_INTERVAL, 0 to disable)
136 Note: Replication slot is read from chain metadata
139 -f, --backup-dir=DIR Backup directory containing active process (required)
141 Options for --rotate-diff:
142 -f, --backup-dir=DIR Backup directory containing active chain (required)
144 Options for --new-chain:
145 -d, --dbname=DBNAME Database name (required)
146 -f, --backup-dir=DIR Backup directory for new chain (required)
147 -S, --slot=SLOTNAME Replication slot name (default: $DEFAULT_SLOT)
148 -Z, --compress=METHOD Compression: gzip, lz4, zstd, or none (default: none)
149 --start Stop old streaming and start streaming to new chain
151 Options for --restore:
152 -f, --backup-dir=DIR Backup directory containing chains (required)
153 -d, --dbname=DBNAME Target database name (required)
154 -C, --create Create target database
155 --base-backup=ID Specific chain ID to restore (default: latest)
156 --include-active Include active.sql (risky - may have incomplete data)
157 --no-sync-sequences Skip sequence synchronization
159 Options for --status:
160 -S, --slot=SLOTNAME Replication slot name (default: $DEFAULT_SLOT)
161 -f, --backup-dir=DIR Backup directory to analyze (optional)
166 2 Database connection error
167 3 Replication slot error
168 4 Backup/restore error
169 5 Invalid arguments or validation failure
171 Environment Variables:
172 PGHOST, PGPORT, PGDATABASE, PGUSER, PGPASSWORD, PG_COLOR
174 Report bugs to: https://github.com/your-repo/pg_scribe/issues
178 # Parse command line arguments
180 if [[ $# -eq 0 ]]; then
182 exit "$EXIT_VALIDATION_ERROR"
185 while [[ $# -gt 0 ]]; do
188 [[ -n "$ACTION" ]] && { log_error "Multiple action flags specified"; exit "$EXIT_VALIDATION_ERROR"; }
193 # Can be either an action or a modifier for --new-chain
194 if [[ -z "$ACTION" ]]; then
196 elif [[ "$ACTION" == "new-chain" ]]; then
199 log_error "Multiple action flags specified"
200 exit "$EXIT_VALIDATION_ERROR"
205 [[ -n "$ACTION" ]] && { log_error "Multiple action flags specified"; exit "$EXIT_VALIDATION_ERROR"; }
210 [[ -n "$ACTION" ]] && { log_error "Multiple action flags specified"; exit "$EXIT_VALIDATION_ERROR"; }
215 if [[ -z "$ACTION" ]]; then
217 elif [[ "$ACTION" == "start" ]]; then
221 log_error "Multiple action flags specified"
222 exit "$EXIT_VALIDATION_ERROR"
227 [[ -n "$ACTION" ]] && { log_error "Multiple action flags specified"; exit "$EXIT_VALIDATION_ERROR"; }
232 [[ -n "$ACTION" ]] && { log_error "Multiple action flags specified"; exit "$EXIT_VALIDATION_ERROR"; }
237 echo "pg_scribe $VERSION"
276 -f|--file|--backup-dir)
280 --file=*|--backup-dir=*)
292 -s|--status-interval)
297 STATUS_INTERVAL="${1#*=}"
305 FSYNC_INTERVAL="${1#*=}"
325 BASE_BACKUP="${1#*=}"
357 log_error "Unknown option: $1"
359 exit "$EXIT_VALIDATION_ERROR"
364 # Validate action was specified
365 if [[ -z "$ACTION" ]]; then
366 log_error "No action specified"
368 exit "$EXIT_VALIDATION_ERROR"
371 # Use PGDATABASE if dbname not specified
372 if [[ -z "$DBNAME" && -n "${PGDATABASE:-}" ]]; then
377 # Build psql connection string
381 [[ -n "$DBNAME" ]] && args+=(-d "$DBNAME")
382 [[ -n "$HOST" ]] && args+=(-h "$HOST")
383 [[ -n "$PORT" ]] && args+=(-p "$PORT")
384 [[ -n "$USERNAME" ]] && args+=(-U "$USERNAME")
385 [[ "$NO_PASSWORD" -eq 1 ]] && args+=(-w)
386 [[ "$FORCE_PASSWORD" -eq 1 ]] && args+=(-W)
388 printf '%s\n' "${args[@]}"
391 # Build pg_recvlogical connection string
392 build_pg_recvlogical_args() {
395 [[ -n "$DBNAME" ]] && args+=(-d "$DBNAME")
396 [[ -n "$HOST" ]] && args+=(-h "$HOST")
397 [[ -n "$PORT" ]] && args+=(-p "$PORT")
398 [[ -n "$USERNAME" ]] && args+=(-U "$USERNAME")
399 [[ "$NO_PASSWORD" -eq 1 ]] && args+=(-w)
400 [[ "$FORCE_PASSWORD" -eq 1 ]] && args+=(-W)
402 printf '%s\n' "${args[@]}"
405 # Build pg_dumpall connection arguments (no -d flag)
406 build_pg_dumpall_args() {
409 [[ -n "$HOST" ]] && args+=(-h "$HOST")
410 [[ -n "$PORT" ]] && args+=(-p "$PORT")
411 [[ -n "$USERNAME" ]] && args+=(-U "$USERNAME")
412 [[ "$NO_PASSWORD" -eq 1 ]] && args+=(-w)
413 [[ "$FORCE_PASSWORD" -eq 1 ]] && args+=(-W)
415 printf '%s\n' "${args[@]}"
418 # Generate chain ID in ISO 8601 format (UTC, sortable)
420 date -u +%Y%m%dT%H%M%SZ
423 # Get human-readable file size
427 # Echoes the file size in human-readable format (e.g., "1.2M", "5.4K")
430 du -h "$file_path" 2>/dev/null | cut -f1
433 # Test database connection
435 log_step "Testing database connection..."
438 mapfile -t psql_args < <(build_psql_args)
440 if ! psql "${psql_args[@]}" -c "SELECT version();" >/dev/null 2>&1; then
441 log_error "Failed to connect to database"
442 log_error "Connection details: host=$HOST port=$PORT dbname=$DBNAME user=$USERNAME"
443 exit "$EXIT_CONNECTION_ERROR"
446 if [[ "$VERBOSE" -eq 1 ]]; then
447 log_success "Connected to database"
451 # Execute SQL query and return result
455 mapfile -t psql_args < <(build_psql_args)
456 psql "${psql_args[@]}" -t -A -c "$sql" 2>&1
459 # Execute SQL query silently (return exit code only)
463 mapfile -t psql_args < <(build_psql_args)
464 psql "${psql_args[@]}" -t -A -c "$sql" >/dev/null 2>&1
467 # Take a globals backup (roles, tablespaces, etc.)
469 # $1 - chain directory path
471 # Echoes the path to the created globals backup file
472 # Exits script on failure
473 take_globals_backup() {
475 local globals_backup_file="$chain_dir/globals.sql"
477 log_info "Taking globals backup..."
479 # Build pg_dumpall connection arguments
481 mapfile -t dumpall_args < <(build_pg_dumpall_args)
483 # Add globals-only flag and output file
484 dumpall_args+=(--globals-only)
485 dumpall_args+=(--file="$globals_backup_file")
487 if pg_dumpall "${dumpall_args[@]}"; then
489 globals_size=$(get_file_size "$globals_backup_file")
490 log_success "Globals backup completed ($globals_size)"
491 echo "$globals_backup_file"
493 log_error "Globals backup failed"
494 # Clean up partial file
495 rm -f "$globals_backup_file" 2>/dev/null || true
496 exit "$EXIT_BACKUP_ERROR"
500 # Validate required arguments for a command
501 # Arguments: command_name arg_name:description [arg_name:description ...]
502 # Example: validate_required_args "init" "DBNAME:database" "BACKUP_DIR:backup directory"
503 validate_required_args() {
504 local command_name="$1"
507 local validation_failed=0
509 for arg_spec in "$@"; do
510 local arg_name="${arg_spec%%:*}"
511 local arg_description="${arg_spec#*:}"
513 # Use indirect variable reference to check if argument is set
514 if [[ -z "${!arg_name}" ]]; then
515 log_error "--${command_name} requires ${arg_description}"
520 if [[ "$validation_failed" -eq 1 ]]; then
521 exit "$EXIT_VALIDATION_ERROR"
525 # Check replication slot existence
528 # $2 - should_exist: 1 if slot should exist, 0 if slot should NOT exist
529 # Exits with appropriate error code if expectation is not met
530 check_replication_slot() {
532 local should_exist="$2"
535 slot_exists=$(query_db "SELECT count(*) FROM pg_replication_slots WHERE slot_name = '$slot_name';")
537 if [[ "$should_exist" -eq 0 ]]; then
538 # Slot should NOT exist
539 if [[ "$slot_exists" -gt 0 ]]; then
540 log_error "Replication slot '$slot_name' already exists"
542 log_error "A replication slot with this name already exists in the database."
543 log_error "This may indicate:"
544 log_error " - A previous initialization that was not cleaned up"
545 log_error " - Another pg_scribe instance using the same slot name"
547 log_error "To resolve:"
548 log_error " - Use a different slot name with -S/--slot option"
549 log_error " - Or drop the existing slot (if safe):"
550 log_error " psql -d $DBNAME -c \"SELECT pg_drop_replication_slot('$slot_name');\""
551 exit "$EXIT_SLOT_ERROR"
555 if [[ "$slot_exists" -eq 0 ]]; then
556 log_error "Replication slot '$slot_name' does not exist"
558 log_error "You must initialize the backup system first:"
559 log_error " pg_scribe --init -d $DBNAME -f <backup_dir> -S $slot_name"
561 log_error "Or verify the slot name is correct with:"
562 log_error " psql -d $DBNAME -c \"SELECT slot_name FROM pg_replication_slots;\""
563 exit "$EXIT_SLOT_ERROR"
565 log_success "Replication slot '$slot_name' found"
570 # --init command implementation
573 log_step "Initializing pg_scribe backup system"
575 # Validate required arguments
576 validate_required_args "init" "DBNAME:-d/--dbname" "BACKUP_DIR:-f/--file (backup directory)"
578 # Cleanup tracking for failure handling
579 local CREATED_SLOT=""
580 local CREATED_FILES=()
582 # Cleanup function for handling failures
583 # shellcheck disable=SC2317 # Function called via trap handler
584 cleanup_on_failure() {
587 # Only cleanup on actual failure, not on successful exit
588 if [[ $exit_code -ne 0 && $exit_code -ne $EXIT_WARNING ]]; then
589 log_info "Cleaning up after failed initialization..."
591 # Drop replication slot if we created it
592 if [[ -n "$CREATED_SLOT" ]]; then
593 log_info "Dropping replication slot '$CREATED_SLOT'..."
594 query_db "SELECT pg_drop_replication_slot('$CREATED_SLOT');" 2>/dev/null || true
597 # Remove files we created
598 for file in "${CREATED_FILES[@]}"; do
599 if [[ -f "$file" ]]; then
600 log_info "Removing partial file: $file"
601 rm -f "$file" 2>/dev/null || true
605 log_info "Cleanup complete"
609 # Set up cleanup trap
610 trap cleanup_on_failure EXIT INT TERM
612 # Test connection first
615 # Phase 1: Validation
616 log_step "Phase 1: Validation"
618 local validation_failed=0
622 log_info "Checking wal_level configuration..."
624 wal_level=$(query_db "SHOW wal_level;")
625 if [[ "$wal_level" != "logical" ]]; then
626 log_error "CRITICAL: wal_level is '$wal_level', must be 'logical'"
627 log_error " Fix: Add 'wal_level = logical' to postgresql.conf and restart PostgreSQL"
630 if [[ "$VERBOSE" -eq 1 ]]; then
631 log_success "wal_level = logical"
635 # Check max_replication_slots
636 log_info "Checking max_replication_slots configuration..."
638 max_slots=$(query_db "SHOW max_replication_slots;")
639 if [[ "$max_slots" -lt 1 ]]; then
640 log_error "CRITICAL: max_replication_slots is $max_slots, must be >= 1"
641 log_error " Fix: Add 'max_replication_slots = 10' to postgresql.conf and restart PostgreSQL"
644 if [[ "$VERBOSE" -eq 1 ]]; then
645 log_success "max_replication_slots = $max_slots"
649 # Check max_wal_senders
650 log_info "Checking max_wal_senders configuration..."
652 max_senders=$(query_db "SHOW max_wal_senders;")
653 if [[ "$max_senders" -lt 1 ]]; then
654 log_error "CRITICAL: max_wal_senders is $max_senders, must be >= 1"
655 log_error " Fix: Add 'max_wal_senders = 10' to postgresql.conf and restart PostgreSQL"
658 if [[ "$VERBOSE" -eq 1 ]]; then
659 log_success "max_wal_senders = $max_senders"
663 # Check replica identity on all tables
664 log_info "Checking replica identity for all tables..."
666 bad_tables=$(query_db "
667 SELECT n.nspname || '.' || c.relname
669 JOIN pg_namespace n ON n.oid = c.relnamespace
670 WHERE c.relkind = 'r'
671 AND n.nspname NOT IN ('pg_catalog', 'information_schema')
672 AND c.relreplident IN ('d', 'n')
674 SELECT 1 FROM pg_index i
675 WHERE i.indrelid = c.oid AND i.indisprimary
677 ORDER BY n.nspname, c.relname;
680 if [[ -n "$bad_tables" ]]; then
681 log_error "CRITICAL: The following tables lack adequate replica identity:"
682 while IFS= read -r table; do
683 log_error " - $table"
684 done <<< "$bad_tables"
685 log_error " Fix: Add a primary key or set replica identity:"
686 log_error " ALTER TABLE <table> ADD PRIMARY KEY (id);"
687 log_error " -- OR --"
688 log_error " ALTER TABLE <table> REPLICA IDENTITY FULL;"
691 if [[ "$VERBOSE" -eq 1 ]]; then
692 log_success "All tables have adequate replica identity"
696 # Warning: Check for unlogged tables
697 log_info "Checking for unlogged tables..."
698 local unlogged_tables
699 unlogged_tables=$(query_db "
700 SELECT n.nspname || '.' || c.relname
702 JOIN pg_namespace n ON n.oid = c.relnamespace
703 WHERE c.relkind = 'r'
704 AND c.relpersistence = 'u'
705 AND n.nspname NOT IN ('pg_catalog', 'information_schema')
706 ORDER BY n.nspname, c.relname;
709 if [[ -n "$unlogged_tables" ]]; then
710 log_warning "The following unlogged tables will NOT be backed up:"
711 while IFS= read -r table; do
712 log_warning " - $table"
713 done <<< "$unlogged_tables"
717 # Warning: Check for large objects
718 log_info "Checking for large objects..."
719 local large_object_count
720 large_object_count=$(query_db "SELECT count(*) FROM pg_largeobject_metadata;")
722 if [[ "$large_object_count" -gt 0 ]]; then
723 log_warning "Database contains $large_object_count large objects"
724 log_warning "Large objects are NOT incrementally backed up (only in full backups)"
725 log_warning "Consider using BYTEA columns instead for incremental backup support"
729 # Check if validation failed
730 if [[ "$validation_failed" -eq 1 ]]; then
731 if [[ "$FORCE" -eq 1 ]]; then
732 log_warning "Validation failed but --force specified, continuing anyway..."
734 log_error "Validation failed. Fix the CRITICAL issues above and try again."
735 log_error "Or use --force to skip validation (NOT recommended)."
736 exit "$EXIT_VALIDATION_ERROR"
739 log_success "All validation checks passed"
743 log_step "Phase 2: Setup"
745 # Create backup directory
746 log_info "Checking backup directory..."
747 if [[ ! -d "$BACKUP_DIR" ]]; then
748 if ! mkdir -p "$BACKUP_DIR"; then
749 log_error "Failed to create backup directory: $BACKUP_DIR"
750 exit "$EXIT_BACKUP_ERROR"
752 log_success "Created backup directory: $BACKUP_DIR"
754 # Directory exists - check if already initialized (has chains)
755 local existing_chains
756 existing_chains=$(find "$BACKUP_DIR" -maxdepth 1 -type d -name 'chain-*' 2>/dev/null | head -1)
758 if [[ -n "$existing_chains" ]]; then
759 if [[ "$IF_NOT_EXISTS" -eq 1 ]]; then
760 log_info "Backup directory already initialized (--if-not-exists specified)"
761 log_info "Skipping initialization"
764 log_error "Backup directory already initialized: $BACKUP_DIR"
765 log_error "Found existing chain(s)"
767 log_error "This directory has already been initialized with pg_scribe."
768 log_error "To create a new chain, use: pg_scribe --new-chain"
770 log_error "If you want to re-initialize from scratch:"
771 log_error " 1. Stop any running backup processes"
772 log_error " 2. Drop the replication slot (or verify it's safe to reuse)"
773 log_error " 3. Remove or rename the existing backup directory"
774 exit "$EXIT_VALIDATION_ERROR"
778 log_info "Using existing directory: $BACKUP_DIR"
781 # Generate chain ID and create chain directory
783 chain_id=$(get_chain_id)
784 local chain_dir="$BACKUP_DIR/chain-$chain_id"
786 log_info "Creating initial chain: $chain_id"
787 if ! mkdir -p "$chain_dir"; then
788 log_error "Failed to create chain directory: $chain_dir"
789 exit "$EXIT_BACKUP_ERROR"
792 # Create wal2sql extension
793 log_info "Creating wal2sql extension..."
794 if query_db_silent "CREATE EXTENSION IF NOT EXISTS wal2sql;"; then
795 log_success "wal2sql extension created (or already exists)"
797 log_error "Failed to create wal2sql extension"
798 log_error "Ensure wal2sql.so is installed in PostgreSQL's lib directory"
799 log_error "Run: cd wal2sql && make && make install"
800 exit "$EXIT_GENERAL_ERROR"
803 # Create replication slot with snapshot export
804 log_info "Creating logical replication slot '$SLOT'..."
806 # Check if slot already exists
807 check_replication_slot "$SLOT" 0
809 # Create slot using SQL
810 # Note: For POC, we create the slot and take the base backup sequentially
811 # The slot will preserve WAL from its creation LSN forward, ensuring no changes are lost
813 if ! slot_result=$(query_db "SELECT slot_name, lsn FROM pg_create_logical_replication_slot('$SLOT', 'wal2sql');"); then
814 log_error "Failed to create replication slot"
815 log_error "$slot_result"
816 exit "$EXIT_SLOT_ERROR"
819 CREATED_SLOT="$SLOT" # Track for cleanup
820 log_success "Replication slot '$SLOT' created"
822 # Take base backup immediately after slot creation
823 # The slot preserves WAL from its creation point, so all changes will be captured
824 local base_backup_file="$chain_dir/base.sql"
825 CREATED_FILES+=("$base_backup_file") # Track for cleanup
826 log_info "Taking base backup..."
829 mapfile -t psql_args < <(build_psql_args)
830 if pg_dump "${psql_args[@]}" --file="$base_backup_file"; then
832 base_size=$(get_file_size "$base_backup_file")
833 log_success "Base backup completed ($base_size)"
835 log_error "Base backup failed"
836 exit "$EXIT_BACKUP_ERROR"
839 # Take globals backup
840 local globals_backup_file
841 globals_backup_file=$(take_globals_backup "$chain_dir")
842 CREATED_FILES+=("$globals_backup_file") # Track for cleanup
844 # Generate metadata file
845 log_info "Generating metadata file..."
846 local metadata_file="$chain_dir/metadata.json"
847 CREATED_FILES+=("$metadata_file") # Track for cleanup
849 pg_version=$(query_db "SELECT version();")
851 cat > "$metadata_file" <<EOF
853 "chain_id": "$chain_id",
854 "created": "$(date -u +"%Y-%m-%dT%H:%M:%SZ")",
855 "pg_scribe_version": "$VERSION",
856 "database": "$DBNAME",
857 "replication_slot": "$SLOT",
858 "postgresql_version": "$pg_version",
859 "encoding": "$(query_db "SELECT pg_encoding_to_char(encoding) FROM pg_database WHERE datname = '$DBNAME';")",
860 "collation": "$(query_db "SELECT datcollate FROM pg_database WHERE datname = '$DBNAME';")"
864 log_success "Metadata file created"
866 # Disable cleanup trap on successful completion
871 log_step "Initialization Complete"
872 log_success "Initial chain created: $chain_id"
873 log_success "Location: $chain_dir"
874 log_success "Replication slot: $SLOT"
875 log_info "Next steps:"
876 log_info " 1. Start streaming incremental backups:"
877 log_info " pg_scribe --start -d $DBNAME -f $BACKUP_DIR"
878 log_info " 2. Monitor replication slot health:"
879 log_info " pg_scribe --status -d $DBNAME -S $SLOT -f $BACKUP_DIR"
881 if [[ "$has_warnings" -eq 1 ]]; then
889 # --start command implementation
892 log_step "Starting incremental backup collection"
894 # Validate required arguments
895 validate_required_args "start" "DBNAME:-d/--dbname" "BACKUP_DIR:-f/--file (backup directory)"
897 # Verify backup directory exists
898 if [[ ! -d "$BACKUP_DIR" ]]; then
899 log_error "Backup directory does not exist: $BACKUP_DIR"
900 log_error "Run --init first to initialize the backup system"
901 exit "$EXIT_BACKUP_ERROR"
905 log_step "Finding latest chain..."
907 latest_chain=$(find "$BACKUP_DIR" -maxdepth 1 -type d -name 'chain-*' 2>/dev/null | sort | tail -1)
909 if [[ -z "$latest_chain" ]]; then
910 log_error "No chains found in backup directory: $BACKUP_DIR"
911 log_error "Run --init first to create the initial chain"
912 exit "$EXIT_BACKUP_ERROR"
916 chain_id=$(basename "$latest_chain" | sed 's/^chain-//')
917 log_success "Found latest chain: $chain_id"
919 # Read replication slot from chain metadata
920 local metadata_file="$latest_chain/metadata.json"
921 if [[ ! -f "$metadata_file" ]]; then
922 log_error "Chain metadata not found: $metadata_file"
923 log_error "Chain may be corrupted. Create a new chain with:"
924 log_error " pg_scribe --new-chain -d $DBNAME -f $BACKUP_DIR"
925 exit "$EXIT_BACKUP_ERROR"
928 SLOT=$(grep '"replication_slot"' "$metadata_file" | cut -d'"' -f4)
929 if [[ -z "$SLOT" ]]; then
930 log_error "Chain metadata missing replication_slot field"
931 log_error "Chain may be corrupted. Create a new chain with:"
932 log_error " pg_scribe --new-chain -d $DBNAME -f $BACKUP_DIR"
933 exit "$EXIT_BACKUP_ERROR"
936 log_info "Using replication slot from metadata: $SLOT"
938 # Check for existing streaming process
939 local pidfile="$BACKUP_DIR/.pg_scribe.pid"
940 if [[ -f "$pidfile" ]]; then
942 existing_pid=$(cat "$pidfile")
944 # Check if process is still running
945 if kill -0 "$existing_pid" 2>/dev/null; then
946 log_error "Already streaming to $BACKUP_DIR (PID $existing_pid)"
947 log_error "Stop the existing process first or use a different backup directory"
948 exit "$EXIT_GENERAL_ERROR"
950 log_info "Removing stale pidfile (process $existing_pid not running)"
958 # Verify replication slot exists
959 log_step "Verifying replication slot '$SLOT'..."
960 check_replication_slot "$SLOT" 1
962 # Determine output file
963 local output_file="$latest_chain/active.sql"
965 # Build pg_recvlogical arguments
966 local pg_recv_args=()
967 mapfile -t pg_recv_args < <(build_pg_recvlogical_args)
969 # Add required arguments
970 pg_recv_args+=(--slot="$SLOT")
971 pg_recv_args+=(--start)
972 pg_recv_args+=(--file="$output_file")
975 pg_recv_args+=(--option=include_transaction=on)
977 # Add status interval
978 pg_recv_args+=(--status-interval="$STATUS_INTERVAL")
980 # Add fsync interval (0 means disabled)
981 if [[ "$FSYNC_INTERVAL" -gt 0 ]]; then
982 pg_recv_args+=(--fsync-interval="$FSYNC_INTERVAL")
984 # For fsync-interval=0, we skip the parameter to avoid pg_recvlogical errors
985 log_info "Fsync disabled (fsync-interval=0)"
988 # Display configuration
989 log_step "Configuration"
990 log_info "Database: $DBNAME"
991 log_info "Replication slot: $SLOT"
992 log_info "Chain: $chain_id"
993 log_info "Output file: $output_file"
994 log_info "Status interval: ${STATUS_INTERVAL}s"
995 if [[ "$FSYNC_INTERVAL" -gt 0 ]]; then
996 log_info "Fsync interval: ${FSYNC_INTERVAL}s"
998 log_info "Fsync: disabled"
1002 # Write pidfile before exec (PID stays same after exec)
1003 echo $$ > "$pidfile"
1005 # Start streaming - replace this process with pg_recvlogical
1006 log_step "Starting streaming replication..."
1007 log_info "Press Ctrl+C to stop"
1008 log_info "Send SIGHUP to rotate output file"
1011 # Replace this process with pg_recvlogical
1012 # This eliminates signal forwarding issues and prevents orphaned processes
1013 # The PID stays the same, making cleanup in tests more reliable
1014 exec pg_recvlogical "${pg_recv_args[@]}"
1018 # --stop command implementation
1021 log_step "Stopping active streaming process"
1023 # Validate required arguments
1024 validate_required_args "stop" "BACKUP_DIR:-f/--file (backup directory)"
1026 # Verify backup directory exists
1027 if [[ ! -d "$BACKUP_DIR" ]]; then
1028 log_error "Backup directory does not exist: $BACKUP_DIR"
1029 exit "$EXIT_BACKUP_ERROR"
1032 # Find and validate pidfile
1033 local pidfile="$BACKUP_DIR/.pg_scribe.pid"
1034 if [[ ! -f "$pidfile" ]]; then
1035 log_error "No active streaming process found"
1036 log_error "Pidfile not found: $pidfile"
1037 exit "$EXIT_GENERAL_ERROR"
1041 pid=$(cat "$pidfile")
1043 # Verify process is running
1044 if ! kill -0 "$pid" 2>/dev/null; then
1045 log_warning "Stale pidfile (process $pid not running)"
1046 log_info "Removing stale pidfile"
1048 exit "$EXIT_SUCCESS"
1051 # Verify process is pg_recvlogical
1053 proc_name=$(ps -p "$pid" -o comm= 2>/dev/null || echo "")
1054 if [[ "$proc_name" != "pg_recvlogical" ]]; then
1055 log_error "PID $pid is not pg_recvlogical (found: $proc_name)"
1056 log_error "Not stopping non-pg_recvlogical process"
1057 exit "$EXIT_GENERAL_ERROR"
1060 log_info "Found pg_recvlogical process (PID $pid)"
1064 active_file=$(find "$BACKUP_DIR"/chain-*/active.sql 2>/dev/null | head -1)
1066 if [[ -n "$active_file" ]]; then
1068 chain_dir=$(dirname "$active_file")
1070 chain_id=$(basename "$chain_dir" | sed 's/^chain-//')
1071 log_info "Active chain: $chain_id"
1074 # Send SIGTERM to gracefully stop the process
1075 log_info "Sending SIGTERM to process $pid..."
1078 # Wait for process to stop (with timeout)
1081 log_info "Waiting for process to stop..."
1083 while kill -0 "$pid" 2>/dev/null && [[ $waited -lt $timeout ]]; do
1085 waited=$((waited + 1))
1088 # Check if process stopped
1089 if kill -0 "$pid" 2>/dev/null; then
1090 log_warning "Process did not stop gracefully, sending SIGKILL..."
1091 kill -KILL "$pid" 2>/dev/null || true
1098 log_success "Streaming process stopped"
1099 exit "$EXIT_SUCCESS"
1103 # --rotate-diff command implementation
1106 log_step "Rotating differential file"
1108 # Validate required arguments
1109 validate_required_args "rotate-diff" "BACKUP_DIR:-f/--file (backup directory)"
1111 # Verify backup directory exists
1112 if [[ ! -d "$BACKUP_DIR" ]]; then
1113 log_error "Backup directory does not exist: $BACKUP_DIR"
1114 exit "$EXIT_BACKUP_ERROR"
1117 # Find and validate pidfile
1118 local pidfile="$BACKUP_DIR/.pg_scribe.pid"
1119 if [[ ! -f "$pidfile" ]]; then
1120 log_error "No active streaming process found"
1121 log_error "Pidfile not found: $pidfile"
1122 exit "$EXIT_GENERAL_ERROR"
1126 pid=$(cat "$pidfile")
1128 # Verify process is running
1129 if ! kill -0 "$pid" 2>/dev/null; then
1130 log_error "Stale pidfile (process $pid not running)"
1131 log_error "Remove $pidfile and start streaming with --start"
1132 exit "$EXIT_GENERAL_ERROR"
1135 # Verify process is pg_recvlogical
1137 proc_name=$(ps -p "$pid" -o comm= 2>/dev/null || echo "")
1138 if [[ "$proc_name" != "pg_recvlogical" ]]; then
1139 log_error "PID $pid is not pg_recvlogical (found: $proc_name)"
1140 exit "$EXIT_GENERAL_ERROR"
1143 log_success "Found active streaming process (PID $pid)"
1145 # Find active.sql file
1147 active_file=$(find "$BACKUP_DIR"/chain-*/active.sql 2>/dev/null | head -1)
1149 if [[ -z "$active_file" ]]; then
1150 log_error "No active.sql found in any chain"
1151 exit "$EXIT_BACKUP_ERROR"
1155 chain_dir=$(dirname "$active_file")
1157 chain_id=$(basename "$chain_dir" | sed 's/^chain-//')
1159 log_success "Found active chain: $chain_id"
1161 # Generate differential timestamp
1162 local diff_timestamp
1163 diff_timestamp=$(get_chain_id)
1164 local sealed_file="$chain_dir/diff-$diff_timestamp.sql"
1166 # Get file size before rotation
1168 active_size=$(get_file_size "$active_file")
1171 log_info "Rotating active.sql to diff-$diff_timestamp.sql"
1173 # Rename active → diff (pg_recvlogical still has file open)
1174 if ! mv "$active_file" "$sealed_file"; then
1175 log_error "Failed to rename active.sql"
1176 exit "$EXIT_BACKUP_ERROR"
1179 # Send SIGHUP to trigger file rotation
1180 log_info "Sending SIGHUP to pg_recvlogical..."
1183 # Wait for new active.sql to appear
1186 log_info "Waiting for new active.sql..."
1188 while [[ $waited -lt $timeout ]]; do
1189 if [[ -f "$chain_dir/active.sql" ]]; then
1190 # Wait a moment to ensure it's being written
1192 if [[ -s "$chain_dir/active.sql" ]] || [[ -f "$chain_dir/active.sql" ]]; then
1193 log_success "Rotated differential: diff-$diff_timestamp.sql ($active_size)"
1194 log_success "New differential started"
1195 exit "$EXIT_SUCCESS"
1199 waited=$((waited + 1))
1202 log_error "Timeout waiting for new active.sql"
1203 log_error "The rotation may have failed - check pg_recvlogical process"
1204 exit "$EXIT_GENERAL_ERROR"
1208 # --new-chain command implementation
1211 log_step "Creating new chain"
1213 # Phase 1: Validation (no state changes)
1214 validate_required_args "new-chain" "DBNAME:-d/--dbname" "BACKUP_DIR:-f/--file (backup directory)"
1218 if [[ ! -d "$BACKUP_DIR" ]]; then
1219 log_error "Backup directory does not exist: $BACKUP_DIR"
1220 log_error "Run --init first to initialize the backup system"
1221 exit "$EXIT_BACKUP_ERROR"
1224 # Validate replication slot exists before creating chain
1225 log_step "Validating replication slot '$SLOT'..."
1226 check_replication_slot "$SLOT" 1
1228 # Validate compression method if specified
1229 if [[ -n "$COMPRESS" && "$COMPRESS" != "none" ]]; then
1230 local compress_type="${COMPRESS%%:*}"
1231 case "$compress_type" in
1233 if ! command -v gzip >/dev/null 2>&1; then
1234 log_error "Compression method 'gzip' requires gzip command"
1235 log_error "Install gzip or use a different compression method"
1236 exit "$EXIT_VALIDATION_ERROR"
1240 if ! command -v lz4 >/dev/null 2>&1; then
1241 log_error "Compression method 'lz4' requires lz4 command"
1242 log_error "Install lz4 or use a different compression method"
1243 exit "$EXIT_VALIDATION_ERROR"
1247 if ! command -v zstd >/dev/null 2>&1; then
1248 log_error "Compression method 'zstd' requires zstd command"
1249 log_error "Install zstd or use a different compression method"
1250 exit "$EXIT_VALIDATION_ERROR"
1254 log_error "Unknown compression method: $compress_type"
1255 log_error "Supported methods: gzip, lz4, zstd, none"
1256 exit "$EXIT_VALIDATION_ERROR"
1261 # Phase 2: Create new chain (validation complete)
1263 # Generate new chain ID and create directory
1265 new_chain_id=$(get_chain_id)
1266 local new_chain_dir="$BACKUP_DIR/chain-$new_chain_id"
1268 log_info "Creating new chain: $new_chain_id"
1269 if ! mkdir -p "$new_chain_dir"; then
1270 log_error "Failed to create chain directory: $new_chain_dir"
1271 exit "$EXIT_BACKUP_ERROR"
1274 # Take new base backup
1275 log_step "Taking base backup"
1276 local base_backup_file="$new_chain_dir/base.sql"
1279 mapfile -t psql_args < <(build_psql_args)
1281 # Check if compression is requested
1282 local pg_dump_args=("${psql_args[@]}")
1283 if [[ -n "$COMPRESS" ]]; then
1284 log_info "Compression: $COMPRESS"
1285 pg_dump_args+=(--compress="$COMPRESS")
1287 # Add appropriate file extension for compression
1288 local compress_type="${COMPRESS%%:*}"
1289 case "$compress_type" in
1291 base_backup_file="${base_backup_file}.gz"
1294 base_backup_file="${base_backup_file}.lz4"
1297 base_backup_file="${base_backup_file}.zst"
1300 # No compression, no extension
1303 log_error "Unknown compression method: $compress_type"
1304 log_error "Supported methods: gzip, lz4, zstd, none"
1305 exit "$EXIT_VALIDATION_ERROR"
1310 pg_dump_args+=(--file="$base_backup_file")
1312 if pg_dump "${pg_dump_args[@]}"; then
1314 base_size=$(get_file_size "$base_backup_file")
1315 log_success "Base backup completed ($base_size)"
1317 log_error "Base backup failed"
1318 # Clean up partial files
1319 rm -rf "$new_chain_dir" 2>/dev/null || true
1320 exit "$EXIT_BACKUP_ERROR"
1323 # Take globals backup
1324 local globals_backup_file
1325 globals_backup_file=$(take_globals_backup "$new_chain_dir")
1327 # Generate metadata file
1328 log_info "Generating metadata file..."
1329 local metadata_file="$new_chain_dir/metadata.json"
1331 pg_version=$(query_db "SELECT version();")
1333 cat > "$metadata_file" <<EOF
1335 "chain_id": "$new_chain_id",
1336 "created": "$(date -u +"%Y-%m-%dT%H:%M:%SZ")",
1337 "pg_scribe_version": "$VERSION",
1338 "database": "$DBNAME",
1339 "replication_slot": "$SLOT",
1340 "postgresql_version": "$pg_version",
1341 "encoding": "$(query_db "SELECT pg_encoding_to_char(encoding) FROM pg_database WHERE datname = '$DBNAME';")",
1342 "collation": "$(query_db "SELECT datcollate FROM pg_database WHERE datname = '$DBNAME';")"
1346 log_success "Metadata file created"
1348 # Check if streaming is active
1349 local pidfile="$BACKUP_DIR/.pg_scribe.pid"
1350 local streaming_active=0
1353 if [[ -f "$pidfile" ]]; then
1354 old_pid=$(cat "$pidfile")
1355 if kill -0 "$old_pid" 2>/dev/null; then
1360 # Handle automatic transition if --start was specified
1361 if [[ "$AUTO_START" -eq 1 ]]; then
1363 log_step "Automatic Transition (--start specified)"
1365 # Stop old streaming process if active
1366 if [[ $streaming_active -eq 1 ]]; then
1367 # Identify which chain the old process was streaming to
1368 local old_chain_file=""
1369 local old_chain_dir=""
1370 local old_chain_id=""
1372 local pg_recv_cmdline
1373 pg_recv_cmdline=$(ps -p "$old_pid" -o args= 2>/dev/null || echo "")
1375 if [[ -n "$pg_recv_cmdline" ]]; then
1376 # Extract --file=... from command line
1377 old_chain_file=$(echo "$pg_recv_cmdline" | grep -oP '\-\-file=\K[^ ]+' || echo "")
1379 if [[ -n "$old_chain_file" ]] && [[ -f "$old_chain_file" ]]; then
1380 old_chain_dir=$(dirname "$old_chain_file")
1381 old_chain_id=$(basename "$old_chain_dir" | sed 's/^chain-//')
1382 log_info "Old chain was streaming to: $old_chain_id"
1386 log_info "Stopping old streaming process (PID $old_pid)..."
1387 kill -TERM "$old_pid"
1389 # Wait for process to stop (with timeout)
1392 log_info "Waiting for process to stop..."
1394 while kill -0 "$old_pid" 2>/dev/null && [[ $waited -lt $timeout ]]; do
1396 waited=$((waited + 1))
1399 # Check if process stopped
1400 if kill -0 "$old_pid" 2>/dev/null; then
1401 log_warning "Process did not stop gracefully, sending SIGKILL..."
1402 kill -KILL "$old_pid" 2>/dev/null || true
1406 log_success "Old streaming process stopped"
1408 # Seal the old chain's active.sql if it exists
1409 if [[ -n "$old_chain_dir" ]] && [[ -f "$old_chain_dir/active.sql" ]]; then
1410 log_step "Sealing old chain's active.sql"
1412 local diff_timestamp
1413 diff_timestamp=$(get_chain_id)
1414 local sealed_file="$old_chain_dir/diff-$diff_timestamp.sql"
1416 log_info "Sealing active.sql to diff-$diff_timestamp.sql"
1417 if mv "$old_chain_dir/active.sql" "$sealed_file"; then
1419 sealed_size=$(get_file_size "$sealed_file")
1420 log_success "Sealed old chain's differential: diff-$diff_timestamp.sql ($sealed_size)"
1422 log_warning "Failed to seal old chain's active.sql"
1427 # Now start streaming to new chain
1428 log_step "Starting streaming to new chain: $new_chain_id"
1430 # Find latest chain (should be the one we just created)
1432 latest_chain=$(find "$BACKUP_DIR" -maxdepth 1 -type d -name 'chain-*' 2>/dev/null | sort | tail -1)
1434 # Determine output file
1435 local output_file="$latest_chain/active.sql"
1437 # Build pg_recvlogical arguments
1438 local pg_recv_args=()
1439 mapfile -t pg_recv_args < <(build_pg_recvlogical_args)
1441 # Add required arguments
1442 pg_recv_args+=(--slot="$SLOT")
1443 pg_recv_args+=(--start)
1444 pg_recv_args+=(--file="$output_file")
1446 # Add plugin options
1447 pg_recv_args+=(--option=include_transaction=on)
1449 # Add status interval
1450 pg_recv_args+=(--status-interval="$STATUS_INTERVAL")
1452 # Add fsync interval (0 means disabled)
1453 if [[ "$FSYNC_INTERVAL" -gt 0 ]]; then
1454 pg_recv_args+=(--fsync-interval="$FSYNC_INTERVAL")
1457 # Write pidfile before exec (PID stays same after exec)
1458 local pidfile="$BACKUP_DIR/.pg_scribe.pid"
1459 echo $$ > "$pidfile"
1461 log_info "Output file: $output_file"
1462 log_info "Press Ctrl+C to stop"
1465 # Replace this process with pg_recvlogical
1466 exec pg_recvlogical "${pg_recv_args[@]}"
1469 # Final summary (only reached if --start was NOT specified)
1471 log_step "New Chain Complete"
1472 log_success "Chain created: $new_chain_id"
1473 log_success "Location: $new_chain_dir"
1474 log_success "Base backup: $(basename "$base_backup_file")"
1476 if [[ $streaming_active -eq 1 ]]; then
1478 log_info "Active streaming process detected (PID $old_pid)"
1480 log_info "To transition to the new chain:"
1481 log_info " 1. Stop the current streaming process:"
1482 log_info " kill -TERM $old_pid"
1483 log_info " # Or: pg_scribe --stop -f $BACKUP_DIR"
1484 log_info " 2. Start streaming to the new chain:"
1485 log_info " pg_scribe --start -d $DBNAME -f $BACKUP_DIR"
1487 log_info "Or use --rotate-diff to seal a differential before transitioning"
1490 log_info "No active streaming process detected"
1492 log_info "Start streaming to the new chain:"
1493 log_info " pg_scribe --start -d $DBNAME -f $BACKUP_DIR"
1496 exit "$EXIT_SUCCESS"
1500 # --restore command implementation
1503 log_step "Restoring database from backup"
1505 # Validate required arguments
1506 validate_required_args "restore" "DBNAME:-d/--dbname (target database)" "BACKUP_DIR:-f/--file (backup directory)"
1508 # Verify backup directory exists
1509 if [[ ! -d "$BACKUP_DIR" ]]; then
1510 log_error "Backup directory does not exist: $BACKUP_DIR"
1511 exit "$EXIT_BACKUP_ERROR"
1514 # Determine target chain
1515 log_step "Locating chain"
1519 if [[ -n "$BASE_BACKUP" ]]; then
1520 # BASE_BACKUP can be a chain ID or a specific chain directory
1521 if [[ -d "$BACKUP_DIR/chain-$BASE_BACKUP" ]]; then
1522 chain_dir="$BACKUP_DIR/chain-$BASE_BACKUP"
1523 chain_id="$BASE_BACKUP"
1524 log_info "Using specified chain: $chain_id"
1525 elif [[ -d "$BASE_BACKUP" ]] && [[ "$(basename "$BASE_BACKUP")" =~ ^chain- ]]; then
1526 chain_dir="$BASE_BACKUP"
1527 chain_id=$(basename "$chain_dir" | sed 's/^chain-//')
1528 log_info "Using specified chain directory: $chain_id"
1530 log_error "Chain not found: $BASE_BACKUP"
1531 exit "$EXIT_BACKUP_ERROR"
1535 chain_dir=$(find "$BACKUP_DIR" -maxdepth 1 -type d -name 'chain-*' 2>/dev/null | sort | tail -1)
1537 if [[ -z "$chain_dir" ]]; then
1538 log_error "No chains found in backup directory: $BACKUP_DIR"
1539 log_error "Run --init first to create a chain"
1540 exit "$EXIT_BACKUP_ERROR"
1543 chain_id=$(basename "$chain_dir" | sed 's/^chain-//')
1544 log_success "Found latest chain: $chain_id"
1547 # Verify chain structure
1548 local base_backup_path="$chain_dir/base.sql"
1549 local globals_backup_path="$chain_dir/globals.sql"
1551 # Check for compressed base backup
1552 if [[ ! -f "$base_backup_path" ]]; then
1553 # Try compressed variants
1554 if [[ -f "$base_backup_path.gz" ]]; then
1555 base_backup_path="$base_backup_path.gz"
1556 elif [[ -f "$base_backup_path.lz4" ]]; then
1557 base_backup_path="$base_backup_path.lz4"
1558 elif [[ -f "$base_backup_path.zst" ]]; then
1559 base_backup_path="$base_backup_path.zst"
1561 log_error "Base backup not found in chain: $chain_id"
1562 exit "$EXIT_BACKUP_ERROR"
1566 log_info "Base backup: $(basename "$base_backup_path")"
1568 # Find all sealed differentials (sorted by timestamp)
1569 local differential_files=()
1570 mapfile -t differential_files < <(find "$chain_dir" -maxdepth 1 -name 'diff-*.sql' 2>/dev/null | sort)
1572 if [[ ${#differential_files[@]} -gt 0 ]]; then
1573 log_info "Found ${#differential_files[@]} sealed differential(s)"
1575 log_info "No sealed differentials found (will restore base backup only)"
1578 # Create target database if requested
1579 if [[ "$CREATE_DB" -eq 1 ]]; then
1580 log_step "Creating target database"
1582 # Connect to postgres database (not target database) to create it
1583 local create_dbname="$DBNAME"
1586 # Test connection to postgres database
1589 # Check if database already exists
1591 db_exists=$(query_db "SELECT count(*) FROM pg_database WHERE datname = '$create_dbname';")
1593 if [[ "$db_exists" -gt 0 ]]; then
1594 log_error "Database '$create_dbname' already exists"
1595 log_error "Drop it first or omit --create flag to restore into existing database"
1596 exit "$EXIT_BACKUP_ERROR"
1600 if query_db_silent "CREATE DATABASE \"$create_dbname\";"; then
1601 log_success "Created database: $create_dbname"
1603 log_error "Failed to create database: $create_dbname"
1604 exit "$EXIT_BACKUP_ERROR"
1607 # Switch back to target database for subsequent operations
1608 DBNAME="$create_dbname"
1611 # Test connection to target database
1614 # Restore globals backup
1615 if [[ -f "$globals_backup_path" ]]; then
1616 log_step "Restoring globals (roles, tablespaces)"
1618 # Build connection args for psql
1619 # Note: globals must be restored to postgres database, not target database
1620 local save_dbname="$DBNAME"
1623 mapfile -t psql_args < <(build_psql_args)
1624 DBNAME="$save_dbname"
1626 if psql "${psql_args[@]}" -f "$globals_backup_path" >/dev/null 2>&1; then
1627 log_success "Globals restored successfully"
1629 log_warning "Globals restore had errors (may be expected if roles already exist)"
1632 log_warning "No globals backup found in chain (roles and tablespaces will not be restored)"
1635 # Restore base backup
1636 log_step "Restoring base backup"
1638 start_time=$(date +%s)
1641 mapfile -t psql_args < <(build_psql_args)
1643 # Handle compressed backups
1644 if [[ "$base_backup_path" == *.gz ]]; then
1645 log_info "Decompressing gzip backup..."
1646 if gunzip -c "$base_backup_path" | psql "${psql_args[@]}" >/dev/null 2>&1; then
1647 log_success "Base backup restored successfully"
1649 log_error "Base backup restore failed"
1650 exit "$EXIT_BACKUP_ERROR"
1652 elif [[ "$base_backup_path" == *.zst ]]; then
1653 log_info "Decompressing zstd backup..."
1654 if zstd -dc "$base_backup_path" | psql "${psql_args[@]}" >/dev/null 2>&1; then
1655 log_success "Base backup restored successfully"
1657 log_error "Base backup restore failed"
1658 exit "$EXIT_BACKUP_ERROR"
1660 elif [[ "$base_backup_path" == *.lz4 ]]; then
1661 log_info "Decompressing lz4 backup..."
1662 if lz4 -dc "$base_backup_path" | psql "${psql_args[@]}" >/dev/null 2>&1; then
1663 log_success "Base backup restored successfully"
1665 log_error "Base backup restore failed"
1666 exit "$EXIT_BACKUP_ERROR"
1669 # Uncompressed backup
1670 if psql "${psql_args[@]}" -f "$base_backup_path" >/dev/null 2>&1; then
1671 log_success "Base backup restored successfully"
1673 log_error "Base backup restore failed"
1674 exit "$EXIT_BACKUP_ERROR"
1678 # Apply sealed differentials
1679 if [[ ${#differential_files[@]} -gt 0 ]]; then
1680 log_step "Applying sealed differentials"
1683 for diff_file in "${differential_files[@]}"; do
1684 log_info "Applying: $(basename "$diff_file")"
1686 # Optimize for bulk restore: disable synchronous_commit to avoid fsync() on every COMMIT
1687 # This is safe during restore - we can always re-run if it fails
1688 if psql "${psql_args[@]}" -c "SET synchronous_commit = off;" -f "$diff_file" >/dev/null 2>&1; then
1689 if [[ "$VERBOSE" -eq 1 ]]; then
1690 log_success "Applied: $(basename "$diff_file")"
1692 diff_count=$((diff_count + 1))
1694 log_error "Failed to apply differential: $(basename "$diff_file")"
1695 log_error "Restore is incomplete"
1696 exit "$EXIT_BACKUP_ERROR"
1700 log_success "Applied $diff_count sealed differential(s)"
1703 # Apply active.sql if requested (WARNING: may be incomplete)
1704 if [[ "$INCLUDE_ACTIVE" -eq 1 ]] && [[ -f "$chain_dir/active.sql" ]]; then
1705 log_step "Applying active.sql (INCOMPLETE DATA WARNING)"
1706 log_warning "active.sql may contain incomplete transactions!"
1709 mapfile -t psql_args < <(build_psql_args)
1711 if psql "${psql_args[@]}" -f "$chain_dir/active.sql" >/dev/null 2>&1; then
1712 log_warning "Applied incomplete active.sql - verify data integrity!"
1714 log_error "Failed to apply active.sql"
1715 exit "$EXIT_BACKUP_ERROR"
1717 elif [[ "$INCLUDE_ACTIVE" -eq 1 ]]; then
1718 log_warning "No active.sql found in chain (--include-active was specified)"
1721 # Synchronize sequences
1722 if [[ "$NO_SYNC_SEQUENCES" -eq 0 ]]; then
1723 log_step "Synchronizing sequences"
1725 # Query all sequences and their associated tables
1727 seq_sync_sql=$(query_db "
1730 quote_literal(sn.nspname || '.' || s.relname) ||
1731 ', GREATEST((SELECT COALESCE(MAX(' ||
1732 quote_ident(a.attname) ||
1734 quote_ident(tn.nspname) || '.' || quote_ident(t.relname) ||
1737 JOIN pg_namespace sn ON sn.oid = s.relnamespace
1738 JOIN pg_depend d ON d.objid = s.oid AND d.deptype = 'a'
1739 JOIN pg_class t ON t.oid = d.refobjid
1740 JOIN pg_namespace tn ON tn.oid = t.relnamespace
1741 JOIN pg_attribute a ON a.attrelid = t.oid AND a.attnum = d.refobjsubid
1742 WHERE s.relkind = 'S'
1743 AND sn.nspname NOT IN ('pg_catalog', 'information_schema')
1744 ORDER BY sn.nspname, s.relname;
1747 if [[ -n "$seq_sync_sql" ]]; then
1749 while IFS= read -r sync_cmd; do
1750 if query_db_silent "$sync_cmd"; then
1751 seq_count=$((seq_count + 1))
1752 if [[ "$VERBOSE" -eq 1 ]]; then
1753 log_info "Synced sequence: $(echo "$sync_cmd" | grep -oP "'\K[^']+(?=')")"
1756 log_warning "Failed to sync sequence: $sync_cmd"
1758 done <<< "$seq_sync_sql"
1760 log_success "Synchronized $seq_count sequence(s)"
1762 log_info "No sequences found to synchronize"
1765 log_info "Skipping sequence synchronization (--no-sync-sequences specified)"
1768 # Calculate restore duration
1770 end_time=$(date +%s)
1771 local duration=$((end_time - start_time))
1774 log_step "Restore Statistics"
1776 # Count rows in all tables
1777 log_info "Counting rows in restored tables..."
1779 table_count=$(query_db "SELECT count(*) FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE c.relkind = 'r' AND n.nspname NOT IN ('pg_catalog', 'information_schema');" 2>/dev/null)
1782 total_rows=$(query_db "
1783 SELECT COALESCE(SUM(n_live_tup), 0)
1784 FROM pg_stat_user_tables;
1787 echo -e "${BOLD}Database:${RESET} $DBNAME" >&2
1788 echo -e "${BOLD}Chain:${RESET} $chain_id" >&2
1789 echo -e "${BOLD}Tables Restored:${RESET} $table_count" >&2
1790 echo -e "${BOLD}Total Rows:${RESET} $total_rows (approximate)" >&2
1791 echo -e "${BOLD}Duration:${RESET} ${duration}s" >&2
1792 echo -e "${BOLD}Base Backup:${RESET} $(basename "$base_backup_path")" >&2
1794 if [[ ${#differential_files[@]} -gt 0 ]]; then
1795 echo -e "${BOLD}Differentials Applied:${RESET} ${#differential_files[@]}" >&2
1798 if [[ "$INCLUDE_ACTIVE" -eq 1 ]] && [[ -f "$chain_dir/active.sql" ]]; then
1799 echo -e "${BOLD}Included active.sql:${RESET} ${YELLOW}YES (incomplete data)${RESET}" >&2
1802 # Final success message
1804 log_step "Restore Complete"
1805 log_success "Database successfully restored to: $DBNAME"
1806 log_info "Next steps:"
1807 log_info " 1. Verify data integrity:"
1808 log_info " psql -d $DBNAME -c 'SELECT COUNT(*) FROM <your_table>;'"
1809 log_info " 2. Run application smoke tests"
1810 log_info " 3. Switch application to restored database"
1812 exit "$EXIT_SUCCESS"
1816 # --status command implementation
1819 log_step "Checking pg_scribe backup system status"
1821 # Validate required arguments
1822 validate_required_args "status" "DBNAME:-d/--dbname"
1827 # Track warnings for exit code
1828 local has_warnings=0
1830 # Check replication slot status
1831 log_step "Replication Slot Status"
1833 # Verify replication slot exists
1834 check_replication_slot "$SLOT" 1
1836 # Query slot details
1838 slot_info=$(query_db "
1845 confirmed_flush_lsn,
1846 pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn) as restart_lag_bytes,
1847 pg_wal_lsn_diff(pg_current_wal_lsn(), confirmed_flush_lsn) as confirmed_lag_bytes,
1848 pg_current_wal_lsn() as current_lsn
1849 FROM pg_replication_slots
1850 WHERE slot_name = '$SLOT';
1854 IFS='|' read -r slot_name slot_type db_name active restart_lsn confirmed_flush_lsn restart_lag_bytes confirmed_lag_bytes current_lsn <<< "$slot_info"
1856 # Display slot information
1857 echo -e "${BOLD}Slot Name:${RESET} $slot_name" >&2
1858 echo -e "${BOLD}Slot Type:${RESET} $slot_type" >&2
1859 echo -e "${BOLD}Database:${RESET} $db_name" >&2
1861 if [[ "$active" == "t" ]]; then
1862 echo -e "${BOLD}Active:${RESET} ${GREEN}Yes${RESET}" >&2
1864 echo -e "${BOLD}Active:${RESET} ${YELLOW}No${RESET}" >&2
1865 log_warning "Replication slot is not active"
1869 echo -e "${BOLD}Current WAL LSN:${RESET} $current_lsn" >&2
1870 echo -e "${BOLD}Restart LSN:${RESET} $restart_lsn" >&2
1871 echo -e "${BOLD}Confirmed LSN:${RESET} $confirmed_flush_lsn" >&2
1873 # Format lag in human-readable sizes
1874 local restart_lag_mb=$((restart_lag_bytes / 1024 / 1024))
1875 local confirmed_lag_mb=$((confirmed_lag_bytes / 1024 / 1024))
1877 # Check lag thresholds (based on design doc)
1878 if [[ "$restart_lag_bytes" -gt 10737418240 ]]; then
1880 echo -e "${BOLD}Restart Lag:${RESET} ${RED}${restart_lag_mb} MB (CRITICAL!)${RESET}" >&2
1881 log_error "CRITICAL: Replication lag exceeds 10GB!"
1882 log_error " This may cause disk space issues or database shutdown"
1883 log_error " Consider dropping the slot if backup collection has stopped"
1885 elif [[ "$restart_lag_bytes" -gt 1073741824 ]]; then
1887 echo -e "${BOLD}Restart Lag:${RESET} ${YELLOW}${restart_lag_mb} MB (WARNING)${RESET}" >&2
1888 log_warning "Replication lag exceeds 1GB"
1889 log_warning " Ensure backup collection is running and healthy"
1892 echo -e "${BOLD}Restart Lag:${RESET} ${GREEN}${restart_lag_mb} MB${RESET}" >&2
1895 if [[ "$confirmed_lag_bytes" -gt 10737418240 ]]; then
1896 echo -e "${BOLD}Confirmed Lag:${RESET} ${RED}${confirmed_lag_mb} MB (CRITICAL!)${RESET}" >&2
1898 elif [[ "$confirmed_lag_bytes" -gt 1073741824 ]]; then
1899 echo -e "${BOLD}Confirmed Lag:${RESET} ${YELLOW}${confirmed_lag_mb} MB (WARNING)${RESET}" >&2
1902 echo -e "${BOLD}Confirmed Lag:${RESET} ${GREEN}${confirmed_lag_mb} MB${RESET}" >&2
1905 # Check slot age (if we can determine it)
1906 # Note: pg_replication_slots doesn't directly track creation time, but we can estimate from WAL
1909 # Analyze backup directory if provided
1910 if [[ -n "$BACKUP_DIR" ]]; then
1911 log_step "Chain Inventory"
1913 if [[ ! -d "$BACKUP_DIR" ]]; then
1914 log_warning "Backup directory does not exist: $BACKUP_DIR"
1919 mapfile -t chains < <(find "$BACKUP_DIR" -maxdepth 1 -type d -name 'chain-*' 2>/dev/null | sort)
1921 if [[ ${#chains[@]} -eq 0 ]]; then
1922 log_warning "No chains found in backup directory"
1923 log_warning " Run --init to create the initial chain"
1926 echo -e "${BOLD}Backup Directory:${RESET} $BACKUP_DIR" >&2
1929 # Determine which chain is active
1930 local pidfile="$BACKUP_DIR/.pg_scribe.pid"
1931 local active_chain_id=""
1934 if [[ -f "$pidfile" ]]; then
1936 pid=$(cat "$pidfile")
1937 if kill -0 "$pid" 2>/dev/null; then
1938 # Get which file the process is actually writing to from its command line
1939 local pg_recv_cmdline
1940 pg_recv_cmdline=$(ps -p "$pid" -o args= 2>/dev/null)
1942 # Extract --file=... from command line
1944 active_file=$(echo "$pg_recv_cmdline" | grep -oP '\-\-file=\K[^ ]+' || echo "")
1946 if [[ -n "$active_file" ]] && [[ -f "$active_file" ]]; then
1947 active_chain_id=$(basename "$(dirname "$active_file")" | sed 's/^chain-//')
1953 # Display each chain
1954 for chain_dir in "${chains[@]}"; do
1956 chain_id=$(basename "$chain_dir" | sed 's/^chain-//')
1959 local base_backup="$chain_dir/base.sql"
1962 # Check for compressed variants
1963 if [[ -f "$base_backup" ]]; then
1964 base_size=$(get_file_size "$base_backup")
1965 elif [[ -f "$base_backup.gz" ]]; then
1966 base_size=$(get_file_size "$base_backup.gz")
1967 elif [[ -f "$base_backup.lz4" ]]; then
1968 base_size=$(get_file_size "$base_backup.lz4")
1969 elif [[ -f "$base_backup.zst" ]]; then
1970 base_size=$(get_file_size "$base_backup.zst")
1974 diff_count=$(find "$chain_dir" -maxdepth 1 -name 'diff-*.sql' 2>/dev/null | wc -l)
1977 total_size=$(du -sh "$chain_dir" 2>/dev/null | cut -f1)
1979 # Check if this chain is active
1980 if [[ "$chain_id" == "$active_chain_id" ]]; then
1981 echo -e " ${GREEN}chain-$chain_id${RESET} ${BOLD}(ACTIVE - streaming)${RESET}" >&2
1982 echo -e " ${BOLD}PID:${RESET} $active_pid" >&2
1984 echo -e " chain-$chain_id" >&2
1987 echo -e " ${BOLD}Base backup:${RESET} $base_size" >&2
1988 echo -e " ${BOLD}Differentials:${RESET} $diff_count sealed" >&2
1989 echo -e " ${BOLD}Total size:${RESET} $total_size" >&2
1991 # Show last activity if active.sql exists
1992 if [[ -f "$chain_dir/active.sql" ]]; then
1994 last_mod=$(stat -c %y "$chain_dir/active.sql" 2>/dev/null | cut -d. -f1)
1996 age_seconds=$(( $(date +%s) - $(stat -c %Y "$chain_dir/active.sql" 2>/dev/null) ))
1997 local age_minutes=$((age_seconds / 60))
1999 echo -e " ${BOLD}Last activity:${RESET} $last_mod ($age_minutes minutes ago)" >&2
2001 # Warn if last activity is old (only for active chain)
2002 if [[ "$chain_id" == "$active_chain_id" ]] && [[ "$age_minutes" -gt 60 ]]; then
2003 log_warning "Active chain has no activity for ${age_minutes} minutes"
2004 log_warning " Verify that streaming is working correctly"
2012 # Calculate total backup directory size
2014 total_size=$(du -sh "$BACKUP_DIR" 2>/dev/null | cut -f1)
2015 echo -e "${BOLD}Total Backup Size:${RESET} $total_size" >&2
2020 # Overall health summary
2022 log_step "Health Summary"
2024 if [[ "$has_warnings" -eq 0 ]]; then
2025 log_success "System is healthy"
2027 log_info "Replication slot is active and lag is acceptable"
2028 if [[ -n "$BACKUP_DIR" ]]; then
2029 log_info "Backup directory appears healthy"
2031 exit "$EXIT_SUCCESS"
2033 log_warning "System has warnings - review messages above"
2035 log_info "Address any CRITICAL or WARNING issues promptly"
2036 log_info "See design doc for monitoring recommendations"
2037 exit "$EXIT_WARNING"
2068 log_error "Unknown action: $ACTION"
2069 exit "$EXIT_GENERAL_ERROR"
2074 # Run main with all arguments