3 # pg_scribe - Incremental SQL backup system for PostgreSQL
5 # This script provides a unified CLI for managing PostgreSQL backups
6 # using logical replication and plain SQL format.
16 EXIT_CONNECTION_ERROR=2
19 EXIT_VALIDATION_ERROR=5
23 DEFAULT_SLOT="pg_scribe"
25 DEFAULT_HOST="localhost"
26 DEFAULT_STATUS_INTERVAL=10
27 DEFAULT_FSYNC_INTERVAL=10
32 HOST="${PGHOST:-$DEFAULT_HOST}"
33 PORT="${PGPORT:-$DEFAULT_PORT}"
34 USERNAME="${PGUSER:-${USER:-}}"
37 STATUS_INTERVAL="$DEFAULT_STATUS_INTERVAL"
38 FSYNC_INTERVAL="$DEFAULT_FSYNC_INTERVAL"
48 # Color output support
49 if [[ "${PG_COLOR:-auto}" == "always" ]] || [[ "${PG_COLOR:-auto}" == "auto" && -t 2 ]]; then
65 # Logging functions (output to stderr)
67 echo -e "${BLUE}INFO:${RESET} $*" >&2
71 echo -e "${GREEN}SUCCESS:${RESET} $*" >&2
75 echo -e "${YELLOW}WARNING:${RESET} $*" >&2
79 echo -e "${RED}ERROR:${RESET} $*" >&2
83 echo -e "${BOLD}==>${RESET} $*" >&2
89 pg_scribe - Incremental SQL backup system for PostgreSQL
92 pg_scribe --init [OPTIONS]
93 pg_scribe --start [OPTIONS]
94 pg_scribe --full-backup [OPTIONS]
95 pg_scribe --restore [OPTIONS]
96 pg_scribe --status [OPTIONS]
100 Actions (exactly one required):
101 --init Initialize backup system
102 --start Start streaming incremental backups
103 --full-backup Take a full backup
104 --restore Restore from backups
105 --status Check replication slot status
106 -V, --version Print version and exit
107 -?, --help Show this help and exit
110 -d, --dbname=DBNAME Database name (can be connection string)
111 -h, --host=HOSTNAME Database server host (default: $DEFAULT_HOST)
112 -p, --port=PORT Database server port (default: $DEFAULT_PORT)
113 -U, --username=NAME Database user (default: \$PGUSER or \$USER)
114 -w, --no-password Never prompt for password
115 -W, --password Force password prompt
118 -v, --verbose Enable verbose mode
121 -f, --file=DIRECTORY Backup output directory (required)
122 -S, --slot=SLOTNAME Replication slot name (default: $DEFAULT_SLOT)
123 --force Skip validation and force initialization
126 -f, --file=FILENAME Output file (use '-' for stdout, required)
127 -S, --slot=SLOTNAME Replication slot name (default: $DEFAULT_SLOT)
128 -s, --status-interval=SECS Status update interval (default: $DEFAULT_STATUS_INTERVAL)
129 -F, --fsync-interval=SECS Fsync interval (default: $DEFAULT_FSYNC_INTERVAL, 0 to disable)
131 Options for --full-backup:
132 -f, --file=DIRECTORY Backup output directory (required)
133 -Z, --compress=METHOD Compression: gzip, lz4, zstd, or none (default: gzip)
135 Options for --restore:
136 -f, --file=DIRECTORY Backup input directory (required)
137 -d, --dbname=DBNAME Target database name (required)
138 -C, --create Create target database
139 --base-backup=FILE Specific base backup file (default: latest)
140 --no-sync-sequences Skip sequence synchronization
142 Options for --status:
143 -S, --slot=SLOTNAME Replication slot name (default: $DEFAULT_SLOT)
144 -f, --file=DIRECTORY Backup directory to analyze (optional)
149 2 Database connection error
150 3 Replication slot error
151 4 Backup/restore error
152 5 Invalid arguments or validation failure
154 Environment Variables:
155 PGHOST, PGPORT, PGDATABASE, PGUSER, PGPASSWORD, PG_COLOR
157 Report bugs to: https://github.com/your-repo/pg_scribe/issues
161 # Parse command line arguments
163 if [[ $# -eq 0 ]]; then
165 exit "$EXIT_VALIDATION_ERROR"
168 while [[ $# -gt 0 ]]; do
171 [[ -n "$ACTION" ]] && { log_error "Multiple action flags specified"; exit "$EXIT_VALIDATION_ERROR"; }
176 [[ -n "$ACTION" ]] && { log_error "Multiple action flags specified"; exit "$EXIT_VALIDATION_ERROR"; }
181 [[ -n "$ACTION" ]] && { log_error "Multiple action flags specified"; exit "$EXIT_VALIDATION_ERROR"; }
186 [[ -n "$ACTION" ]] && { log_error "Multiple action flags specified"; exit "$EXIT_VALIDATION_ERROR"; }
191 [[ -n "$ACTION" ]] && { log_error "Multiple action flags specified"; exit "$EXIT_VALIDATION_ERROR"; }
196 echo "pg_scribe $VERSION"
251 -s|--status-interval)
256 STATUS_INTERVAL="${1#*=}"
264 FSYNC_INTERVAL="${1#*=}"
284 BASE_BACKUP="${1#*=}"
308 log_error "Unknown option: $1"
310 exit "$EXIT_VALIDATION_ERROR"
315 # Validate action was specified
316 if [[ -z "$ACTION" ]]; then
317 log_error "No action specified"
319 exit "$EXIT_VALIDATION_ERROR"
322 # Use PGDATABASE if dbname not specified
323 if [[ -z "$DBNAME" && -n "${PGDATABASE:-}" ]]; then
328 # Build psql connection string
332 [[ -n "$DBNAME" ]] && args+=(-d "$DBNAME")
333 [[ -n "$HOST" ]] && args+=(-h "$HOST")
334 [[ -n "$PORT" ]] && args+=(-p "$PORT")
335 [[ -n "$USERNAME" ]] && args+=(-U "$USERNAME")
336 [[ "$NO_PASSWORD" -eq 1 ]] && args+=(-w)
337 [[ "$FORCE_PASSWORD" -eq 1 ]] && args+=(-W)
339 printf '%s\n' "${args[@]}"
342 # Build pg_recvlogical connection string
343 build_pg_recvlogical_args() {
346 [[ -n "$DBNAME" ]] && args+=(-d "$DBNAME")
347 [[ -n "$HOST" ]] && args+=(-h "$HOST")
348 [[ -n "$PORT" ]] && args+=(-p "$PORT")
349 [[ -n "$USERNAME" ]] && args+=(-U "$USERNAME")
350 [[ "$NO_PASSWORD" -eq 1 ]] && args+=(-w)
351 [[ "$FORCE_PASSWORD" -eq 1 ]] && args+=(-W)
353 printf '%s\n' "${args[@]}"
356 # Build pg_dumpall connection arguments (no -d flag)
357 build_pg_dumpall_args() {
360 [[ -n "$HOST" ]] && args+=(-h "$HOST")
361 [[ -n "$PORT" ]] && args+=(-p "$PORT")
362 [[ -n "$USERNAME" ]] && args+=(-U "$USERNAME")
363 [[ "$NO_PASSWORD" -eq 1 ]] && args+=(-w)
364 [[ "$FORCE_PASSWORD" -eq 1 ]] && args+=(-W)
366 printf '%s\n' "${args[@]}"
369 # Generate standardized backup timestamp
370 get_backup_timestamp() {
374 # Test database connection
376 log_step "Testing database connection..."
379 mapfile -t psql_args < <(build_psql_args)
381 if ! psql "${psql_args[@]}" -c "SELECT version();" >/dev/null 2>&1; then
382 log_error "Failed to connect to database"
383 log_error "Connection details: host=$HOST port=$PORT dbname=$DBNAME user=$USERNAME"
384 exit "$EXIT_CONNECTION_ERROR"
387 if [[ "$VERBOSE" -eq 1 ]]; then
388 log_success "Connected to database"
392 # Execute SQL query and return result
396 mapfile -t psql_args < <(build_psql_args)
397 psql "${psql_args[@]}" -t -A -c "$sql" 2>&1
400 # Execute SQL query silently (return exit code only)
404 mapfile -t psql_args < <(build_psql_args)
405 psql "${psql_args[@]}" -t -A -c "$sql" >/dev/null 2>&1
408 # Take a globals backup (roles, tablespaces, etc.)
410 # $1 - backup directory path
411 # $2 - optional: timestamp (if not provided, generates new one)
413 # Echoes the path to the created globals backup file
414 # Exits script on failure
415 take_globals_backup() {
416 local backup_dir="$1"
417 local timestamp="${2:-$(get_backup_timestamp)}"
418 local globals_backup_file="$backup_dir/globals-${timestamp}.sql"
420 log_info "Taking globals backup: $globals_backup_file"
422 # Build pg_dumpall connection arguments
424 mapfile -t dumpall_args < <(build_pg_dumpall_args)
426 # Add globals-only flag and output file
427 dumpall_args+=(--globals-only)
428 dumpall_args+=(--file="$globals_backup_file")
430 if pg_dumpall "${dumpall_args[@]}"; then
432 globals_size=$(du -h "$globals_backup_file" 2>/dev/null | cut -f1)
433 log_success "Globals backup completed: $globals_backup_file ($globals_size)"
434 echo "$globals_backup_file"
436 log_error "Globals backup failed"
437 # Clean up partial file
438 rm -f "$globals_backup_file" 2>/dev/null || true
439 exit "$EXIT_BACKUP_ERROR"
444 # --init command implementation
447 log_step "Initializing pg_scribe backup system"
449 # Validate required arguments
450 if [[ -z "$DBNAME" ]]; then
451 log_error "--init requires -d/--dbname"
452 exit "$EXIT_VALIDATION_ERROR"
455 if [[ -z "$FILE" ]]; then
456 log_error "--init requires -f/--file (backup directory)"
457 exit "$EXIT_VALIDATION_ERROR"
460 # Cleanup tracking for failure handling
461 local CREATED_SLOT=""
462 local CREATED_FILES=()
464 # Cleanup function for handling failures
465 # shellcheck disable=SC2317 # Function called via trap handler
466 cleanup_on_failure() {
469 # Only cleanup on actual failure, not on successful exit
470 if [[ $exit_code -ne 0 && $exit_code -ne $EXIT_WARNING ]]; then
471 log_info "Cleaning up after failed initialization..."
473 # Drop replication slot if we created it
474 if [[ -n "$CREATED_SLOT" ]]; then
475 log_info "Dropping replication slot '$CREATED_SLOT'..."
476 query_db "SELECT pg_drop_replication_slot('$CREATED_SLOT');" 2>/dev/null || true
479 # Remove files we created
480 for file in "${CREATED_FILES[@]}"; do
481 if [[ -f "$file" ]]; then
482 log_info "Removing partial file: $file"
483 rm -f "$file" 2>/dev/null || true
487 log_info "Cleanup complete"
491 # Set up cleanup trap
492 trap cleanup_on_failure EXIT INT TERM
494 # Test connection first
497 # Phase 1: Validation
498 log_step "Phase 1: Validation"
500 local validation_failed=0
504 log_info "Checking wal_level configuration..."
506 wal_level=$(query_db "SHOW wal_level;")
507 if [[ "$wal_level" != "logical" ]]; then
508 log_error "CRITICAL: wal_level is '$wal_level', must be 'logical'"
509 log_error " Fix: Add 'wal_level = logical' to postgresql.conf and restart PostgreSQL"
512 if [[ "$VERBOSE" -eq 1 ]]; then
513 log_success "wal_level = logical"
517 # Check max_replication_slots
518 log_info "Checking max_replication_slots configuration..."
520 max_slots=$(query_db "SHOW max_replication_slots;")
521 if [[ "$max_slots" -lt 1 ]]; then
522 log_error "CRITICAL: max_replication_slots is $max_slots, must be >= 1"
523 log_error " Fix: Add 'max_replication_slots = 10' to postgresql.conf and restart PostgreSQL"
526 if [[ "$VERBOSE" -eq 1 ]]; then
527 log_success "max_replication_slots = $max_slots"
531 # Check max_wal_senders
532 log_info "Checking max_wal_senders configuration..."
534 max_senders=$(query_db "SHOW max_wal_senders;")
535 if [[ "$max_senders" -lt 1 ]]; then
536 log_error "CRITICAL: max_wal_senders is $max_senders, must be >= 1"
537 log_error " Fix: Add 'max_wal_senders = 10' to postgresql.conf and restart PostgreSQL"
540 if [[ "$VERBOSE" -eq 1 ]]; then
541 log_success "max_wal_senders = $max_senders"
545 # Check replica identity on all tables
546 log_info "Checking replica identity for all tables..."
548 bad_tables=$(query_db "
549 SELECT n.nspname || '.' || c.relname
551 JOIN pg_namespace n ON n.oid = c.relnamespace
552 WHERE c.relkind = 'r'
553 AND n.nspname NOT IN ('pg_catalog', 'information_schema')
554 AND c.relreplident IN ('d', 'n')
556 SELECT 1 FROM pg_index i
557 WHERE i.indrelid = c.oid AND i.indisprimary
559 ORDER BY n.nspname, c.relname;
562 if [[ -n "$bad_tables" ]]; then
563 log_error "CRITICAL: The following tables lack adequate replica identity:"
564 while IFS= read -r table; do
565 log_error " - $table"
566 done <<< "$bad_tables"
567 log_error " Fix: Add a primary key or set replica identity:"
568 log_error " ALTER TABLE <table> ADD PRIMARY KEY (id);"
569 log_error " -- OR --"
570 log_error " ALTER TABLE <table> REPLICA IDENTITY FULL;"
573 if [[ "$VERBOSE" -eq 1 ]]; then
574 log_success "All tables have adequate replica identity"
578 # Warning: Check for unlogged tables
579 log_info "Checking for unlogged tables..."
580 local unlogged_tables
581 unlogged_tables=$(query_db "
582 SELECT n.nspname || '.' || c.relname
584 JOIN pg_namespace n ON n.oid = c.relnamespace
585 WHERE c.relkind = 'r'
586 AND c.relpersistence = 'u'
587 AND n.nspname NOT IN ('pg_catalog', 'information_schema')
588 ORDER BY n.nspname, c.relname;
591 if [[ -n "$unlogged_tables" ]]; then
592 log_warning "The following unlogged tables will NOT be backed up:"
593 while IFS= read -r table; do
594 log_warning " - $table"
595 done <<< "$unlogged_tables"
599 # Warning: Check for large objects
600 log_info "Checking for large objects..."
601 local large_object_count
602 large_object_count=$(query_db "SELECT count(*) FROM pg_largeobject_metadata;")
604 if [[ "$large_object_count" -gt 0 ]]; then
605 log_warning "Database contains $large_object_count large objects"
606 log_warning "Large objects are NOT incrementally backed up (only in full backups)"
607 log_warning "Consider using BYTEA columns instead for incremental backup support"
611 # Check if validation failed
612 if [[ "$validation_failed" -eq 1 ]]; then
613 if [[ "$FORCE" -eq 1 ]]; then
614 log_warning "Validation failed but --force specified, continuing anyway..."
616 log_error "Validation failed. Fix the CRITICAL issues above and try again."
617 log_error "Or use --force to skip validation (NOT recommended)."
618 exit "$EXIT_VALIDATION_ERROR"
621 log_success "All validation checks passed"
625 log_step "Phase 2: Setup"
627 # Create backup directory
628 log_info "Checking backup directory..."
629 if [[ ! -d "$FILE" ]]; then
630 if ! mkdir -p "$FILE"; then
631 log_error "Failed to create backup directory: $FILE"
632 exit "$EXIT_BACKUP_ERROR"
634 log_success "Created backup directory: $FILE"
636 # Directory exists - check if already initialized
637 if [[ -f "$FILE/pg_scribe_metadata.txt" ]]; then
638 log_error "Backup directory already initialized: $FILE"
639 log_error "Metadata file exists: $FILE/pg_scribe_metadata.txt"
641 log_error "This directory has already been initialized with pg_scribe."
642 log_error "To take an additional full backup, use: pg_scribe --full-backup"
644 log_error "If you want to re-initialize from scratch:"
645 log_error " 1. Stop any running backup processes"
646 log_error " 2. Drop the replication slot (or verify it's safe to reuse)"
647 log_error " 3. Remove or rename the existing backup directory"
648 exit "$EXIT_VALIDATION_ERROR"
651 # Directory exists but not initialized - check if empty
652 if [[ -n "$(ls -A "$FILE" 2>/dev/null)" ]]; then
653 log_error "Backup directory is not empty: $FILE"
654 log_error "The backup directory must be empty for initialization."
655 log_error "Found existing files:"
656 # shellcheck disable=SC2012 # ls used for user-friendly display, not processing
657 ls -lh "$FILE" | head -10 >&2
658 exit "$EXIT_VALIDATION_ERROR"
661 log_info "Using existing empty directory: $FILE"
664 # Create wal2sql extension
665 log_info "Creating wal2sql extension..."
666 if query_db_silent "CREATE EXTENSION IF NOT EXISTS wal2sql;"; then
667 log_success "wal2sql extension created (or already exists)"
669 log_error "Failed to create wal2sql extension"
670 log_error "Ensure wal2sql.so is installed in PostgreSQL's lib directory"
671 log_error "Run: cd wal2sql && make && make install"
672 exit "$EXIT_GENERAL_ERROR"
675 # Create replication slot with snapshot export
676 log_info "Creating logical replication slot '$SLOT'..."
678 # Check if slot already exists
680 slot_exists=$(query_db "SELECT count(*) FROM pg_replication_slots WHERE slot_name = '$SLOT';")
682 if [[ "$slot_exists" -gt 0 ]]; then
683 log_error "Replication slot '$SLOT' already exists"
685 log_error "A replication slot with this name already exists in the database."
686 log_error "This may indicate:"
687 log_error " - A previous initialization that was not cleaned up"
688 log_error " - Another pg_scribe instance using the same slot name"
690 log_error "To resolve:"
691 log_error " - Use a different slot name with -S/--slot option"
692 log_error " - Or drop the existing slot (if safe):"
693 log_error " psql -d $DBNAME -c \"SELECT pg_drop_replication_slot('$SLOT');\""
694 exit "$EXIT_SLOT_ERROR"
697 # Create slot using SQL
698 # Note: For POC, we create the slot and take the base backup sequentially
699 # The slot will preserve WAL from its creation LSN forward, ensuring no changes are lost
701 if ! slot_result=$(query_db "SELECT slot_name, lsn FROM pg_create_logical_replication_slot('$SLOT', 'wal2sql');"); then
702 log_error "Failed to create replication slot"
703 log_error "$slot_result"
704 exit "$EXIT_SLOT_ERROR"
707 CREATED_SLOT="$SLOT" # Track for cleanup
708 log_success "Replication slot '$SLOT' created"
710 # Take base backup immediately after slot creation
711 # The slot preserves WAL from its creation point, so all changes will be captured
713 timestamp=$(get_backup_timestamp)
714 local base_backup_file="$FILE/base-${timestamp}.sql"
715 CREATED_FILES+=("$base_backup_file") # Track for cleanup
716 log_info "Taking base backup: $base_backup_file"
719 mapfile -t psql_args < <(build_psql_args)
720 if pg_dump "${psql_args[@]}" --file="$base_backup_file"; then
721 log_success "Base backup completed: $base_backup_file"
723 log_error "Base backup failed"
724 exit "$EXIT_BACKUP_ERROR"
727 # Take globals backup (using same timestamp for consistency)
728 local globals_backup_file
729 globals_backup_file=$(take_globals_backup "$FILE" "$timestamp")
730 CREATED_FILES+=("$globals_backup_file") # Track for cleanup
732 # Generate metadata file
733 log_info "Generating metadata file..."
734 local metadata_file="$FILE/pg_scribe_metadata.txt"
735 CREATED_FILES+=("$metadata_file") # Track for cleanup
737 pg_version=$(query_db "SELECT version();")
739 cat > "$metadata_file" <<EOF
740 pg_scribe Backup System Metadata
741 =================================
743 Generated: $(date -u +"%Y-%m-%d %H:%M:%S UTC")
744 pg_scribe Version: $VERSION
750 Replication Slot: $SLOT
753 $(query_db "SELECT extname || ' ' || extversion FROM pg_extension ORDER BY extname;")
755 Encoding: $(query_db "SELECT pg_encoding_to_char(encoding) FROM pg_database WHERE datname = '$DBNAME';")
757 Collation: $(query_db "SELECT datcollate FROM pg_database WHERE datname = '$DBNAME';")
760 log_success "Metadata file created: $metadata_file"
762 # Disable cleanup trap on successful completion
767 log_step "Initialization Complete"
768 log_success "Backup directory: $FILE"
769 log_success "Replication slot: $SLOT"
770 log_info "Next steps:"
771 log_info " 1. Start streaming incremental backups:"
772 log_info " pg_scribe --start -d $DBNAME -f $FILE/incremental.sql -S $SLOT"
773 log_info " 2. Monitor replication slot health:"
774 log_info " pg_scribe --status -d $DBNAME -S $SLOT"
776 if [[ "$has_warnings" -eq 1 ]]; then
784 # --start command implementation
787 log_step "Starting incremental backup collection"
789 # Validate required arguments
790 if [[ -z "$DBNAME" ]]; then
791 log_error "--start requires -d/--dbname"
792 exit "$EXIT_VALIDATION_ERROR"
795 if [[ -z "$FILE" ]]; then
796 log_error "--start requires -f/--file (output file, or '-' for stdout)"
797 exit "$EXIT_VALIDATION_ERROR"
803 # Verify replication slot exists
804 log_step "Verifying replication slot '$SLOT'..."
806 slot_exists=$(query_db "SELECT count(*) FROM pg_replication_slots WHERE slot_name = '$SLOT';")
808 if [[ "$slot_exists" -eq 0 ]]; then
809 log_error "Replication slot '$SLOT' does not exist"
811 log_error "You must initialize the backup system first:"
812 log_error " pg_scribe --init -d $DBNAME -f <backup_dir> -S $SLOT"
814 log_error "Or verify the slot name is correct with:"
815 log_error " psql -d $DBNAME -c \"SELECT slot_name FROM pg_replication_slots;\""
816 exit "$EXIT_SLOT_ERROR"
819 log_success "Replication slot '$SLOT' found"
821 # Build pg_recvlogical arguments
822 local pg_recv_args=()
823 mapfile -t pg_recv_args < <(build_pg_recvlogical_args)
825 # Add required arguments
826 pg_recv_args+=(--slot="$SLOT")
827 pg_recv_args+=(--start)
828 pg_recv_args+=(--file="$FILE")
831 pg_recv_args+=(--option=include_transaction=on)
833 # Add status interval
834 pg_recv_args+=(--status-interval="$STATUS_INTERVAL")
836 # Add fsync interval (0 means disabled)
837 if [[ "$FSYNC_INTERVAL" -gt 0 ]]; then
838 pg_recv_args+=(--fsync-interval="$FSYNC_INTERVAL")
840 # For fsync-interval=0, we skip the parameter to avoid pg_recvlogical errors
841 log_info "Fsync disabled (fsync-interval=0)"
844 # Display configuration
845 log_step "Configuration"
846 log_info "Database: $DBNAME"
847 log_info "Replication slot: $SLOT"
848 log_info "Output file: $FILE"
849 log_info "Status interval: ${STATUS_INTERVAL}s"
850 if [[ "$FSYNC_INTERVAL" -gt 0 ]]; then
851 log_info "Fsync interval: ${FSYNC_INTERVAL}s"
853 log_info "Fsync: disabled"
857 # Start streaming - replace this process with pg_recvlogical
858 log_step "Starting streaming replication..."
859 log_info "Press Ctrl+C to stop"
860 log_info "Send SIGHUP to rotate output file"
863 # Replace this process with pg_recvlogical
864 # This eliminates signal forwarding issues and prevents orphaned processes
865 # The PID stays the same, making cleanup in tests more reliable
866 exec pg_recvlogical "${pg_recv_args[@]}"
870 # --full-backup command implementation
873 log_step "Taking full backup"
875 # Validate required arguments
876 if [[ -z "$DBNAME" ]]; then
877 log_error "--full-backup requires -d/--dbname"
878 exit "$EXIT_VALIDATION_ERROR"
881 if [[ -z "$FILE" ]]; then
882 log_error "--full-backup requires -f/--file (backup directory)"
883 exit "$EXIT_VALIDATION_ERROR"
889 # Ensure backup directory exists
890 if [[ ! -d "$FILE" ]]; then
891 log_error "Backup directory does not exist: $FILE"
892 log_error "Create the directory first or run --init to initialize the backup system"
893 exit "$EXIT_BACKUP_ERROR"
896 # Set compression method (default: gzip)
897 local compress_method="${COMPRESS:-gzip}"
898 if [[ "$compress_method" == "none" ]]; then
902 # Generate timestamped filenames
904 timestamp=$(get_backup_timestamp)
905 local base_backup_file="$FILE/base-${timestamp}.sql"
907 # Add compression extension to base backup if applicable
908 # Note: We don't compress globals since it's typically very small (< 1KB)
909 if [[ -n "$compress_method" ]]; then
910 # Extract compression type (before colon)
911 local compress_type="${compress_method%%:*}"
912 case "$compress_type" in
914 base_backup_file="${base_backup_file}.gz"
917 base_backup_file="${base_backup_file}.lz4"
920 base_backup_file="${base_backup_file}.zst"
923 log_error "Unknown compression method: $compress_type"
924 log_error "Supported methods: gzip, lz4, zstd, none"
925 exit "$EXIT_VALIDATION_ERROR"
931 log_info "Taking base backup: $base_backup_file"
932 if [[ -n "$compress_method" ]]; then
933 log_info "Compression: $compress_method"
937 mapfile -t psql_args < <(build_psql_args)
939 # Build pg_dump command
940 local pg_dump_args=("${psql_args[@]}")
941 if [[ -n "$compress_method" ]]; then
942 pg_dump_args+=(--compress="$compress_method")
944 pg_dump_args+=(--file="$base_backup_file")
946 if pg_dump "${pg_dump_args[@]}"; then
948 backup_size=$(du -h "$base_backup_file" 2>/dev/null | cut -f1)
949 log_success "Base backup completed: $base_backup_file ($backup_size)"
951 log_error "Base backup failed"
952 # Clean up partial file
953 rm -f "$base_backup_file" 2>/dev/null || true
954 exit "$EXIT_BACKUP_ERROR"
957 # Take globals backup (uncompressed - typically < 1KB, not worth compressing)
958 local globals_backup_file
959 globals_backup_file=$(take_globals_backup "$FILE" "$timestamp")
961 # Generate/update metadata file
962 log_info "Updating metadata file..."
963 local metadata_file="$FILE/pg_scribe_metadata.txt"
965 pg_version=$(query_db "SELECT version();")
967 cat > "$metadata_file" <<EOF
968 pg_scribe Backup System Metadata
969 =================================
971 Last Updated: $(date -u +"%Y-%m-%d %H:%M:%S UTC")
972 pg_scribe Version: $VERSION
980 Base: $(basename "$base_backup_file")
981 Globals: $(basename "$globals_backup_file")
982 Timestamp: $timestamp
985 $(query_db "SELECT extname || ' ' || extversion FROM pg_extension ORDER BY extname;")
987 Encoding: $(query_db "SELECT pg_encoding_to_char(encoding) FROM pg_database WHERE datname = '$DBNAME';")
989 Collation: $(query_db "SELECT datcollate FROM pg_database WHERE datname = '$DBNAME';")
992 log_success "Metadata file updated: $metadata_file"
996 log_step "Full Backup Complete"
997 log_success "Base backup: $base_backup_file"
998 log_success "Globals backup: $globals_backup_file"
999 log_success "Backup directory: $FILE"
1001 exit "$EXIT_SUCCESS"
1005 # --restore command implementation
1008 log_step "Restoring database from backup"
1010 # Validate required arguments
1011 if [[ -z "$DBNAME" ]]; then
1012 log_error "--restore requires -d/--dbname (target database)"
1013 exit "$EXIT_VALIDATION_ERROR"
1016 if [[ -z "$FILE" ]]; then
1017 log_error "--restore requires -f/--file (backup directory)"
1018 exit "$EXIT_VALIDATION_ERROR"
1021 # Verify backup directory exists
1022 if [[ ! -d "$FILE" ]]; then
1023 log_error "Backup directory does not exist: $FILE"
1024 exit "$EXIT_BACKUP_ERROR"
1028 log_step "Locating backups"
1029 local base_backup_path=""
1031 if [[ -n "$BASE_BACKUP" ]]; then
1032 # Use specified base backup
1033 if [[ ! -f "$BASE_BACKUP" ]]; then
1034 log_error "Specified base backup not found: $BASE_BACKUP"
1035 exit "$EXIT_BACKUP_ERROR"
1037 base_backup_path="$BASE_BACKUP"
1038 log_info "Using specified base backup: $(basename "$base_backup_path")"
1040 # Find latest base backup (uncompressed or compressed)
1042 latest_base=$(find "$FILE" -maxdepth 1 \( -name 'base-*.sql' -o -name 'base-*.sql.gz' -o -name 'base-*.sql.zst' -o -name 'base-*.sql.lz4' \) -printf '%T@ %p\n' 2>/dev/null | sort -rn | head -1 | cut -d' ' -f2-)
1044 if [[ -z "$latest_base" ]]; then
1045 log_error "No base backup found in directory: $FILE"
1046 log_error "Run --init or --full-backup first to create a base backup"
1047 exit "$EXIT_BACKUP_ERROR"
1050 base_backup_path="$latest_base"
1051 log_info "Found base backup: $(basename "$base_backup_path")"
1054 # Extract timestamp from base backup filename for finding matching globals
1055 local base_timestamp
1056 base_timestamp=$(basename "$base_backup_path" | sed -E 's/base-([0-9]{8}-[0-9]{6}).*/\1/')
1058 # Find matching globals backup
1059 local globals_backup_path
1060 globals_backup_path=$(find "$FILE" -maxdepth 1 -name "globals-${base_timestamp}.sql" 2>/dev/null | head -1)
1062 if [[ -z "$globals_backup_path" ]]; then
1063 # Try to find any globals backup as fallback
1064 globals_backup_path=$(find "$FILE" -maxdepth 1 -name 'globals-*.sql' -printf '%T@ %p\n' 2>/dev/null | sort -rn | head -1 | cut -d' ' -f2-)
1066 if [[ -n "$globals_backup_path" ]]; then
1067 log_warning "Exact matching globals backup not found, using: $(basename "$globals_backup_path")"
1069 log_warning "No globals backup found (roles and tablespaces will not be restored)"
1072 log_info "Found globals backup: $(basename "$globals_backup_path")"
1075 # Find incremental backups (if any)
1076 local incremental_files=()
1077 mapfile -t incremental_files < <(find "$FILE" -maxdepth 1 -name '*.sql' ! -name 'base-*.sql' ! -name 'globals-*.sql' -printf '%T@ %p\n' 2>/dev/null | sort -n | cut -d' ' -f2-)
1079 if [[ ${#incremental_files[@]} -gt 0 ]]; then
1080 log_info "Found ${#incremental_files[@]} incremental backup file(s)"
1082 log_info "No incremental backup files found (will restore base backup only)"
1085 # Create target database if requested
1086 if [[ "$CREATE_DB" -eq 1 ]]; then
1087 log_step "Creating target database"
1089 # Connect to postgres database (not target database) to create it
1090 local create_dbname="$DBNAME"
1093 # Test connection to postgres database
1096 # Check if database already exists
1098 db_exists=$(query_db "SELECT count(*) FROM pg_database WHERE datname = '$create_dbname';")
1100 if [[ "$db_exists" -gt 0 ]]; then
1101 log_error "Database '$create_dbname' already exists"
1102 log_error "Drop it first or omit --create flag to restore into existing database"
1103 exit "$EXIT_BACKUP_ERROR"
1107 if query_db_silent "CREATE DATABASE \"$create_dbname\";"; then
1108 log_success "Created database: $create_dbname"
1110 log_error "Failed to create database: $create_dbname"
1111 exit "$EXIT_BACKUP_ERROR"
1114 # Switch back to target database for subsequent operations
1115 DBNAME="$create_dbname"
1118 # Test connection to target database
1121 # Restore globals backup
1122 if [[ -n "$globals_backup_path" ]]; then
1123 log_step "Restoring globals (roles, tablespaces)"
1125 # Build connection args for psql
1126 # Note: globals must be restored to postgres database, not target database
1127 local save_dbname="$DBNAME"
1130 mapfile -t psql_args < <(build_psql_args)
1131 DBNAME="$save_dbname"
1133 if psql "${psql_args[@]}" -f "$globals_backup_path" >/dev/null 2>&1; then
1134 log_success "Globals restored successfully"
1136 log_warning "Globals restore had errors (may be expected if roles already exist)"
1140 # Restore base backup
1141 log_step "Restoring base backup"
1143 start_time=$(date +%s)
1146 mapfile -t psql_args < <(build_psql_args)
1148 # Handle compressed backups
1150 if [[ "$base_backup_path" == *.gz ]]; then
1151 log_info "Decompressing gzip backup..."
1152 if gunzip -c "$base_backup_path" | psql "${psql_args[@]}" >/dev/null 2>&1; then
1153 log_success "Base backup restored successfully"
1155 log_error "Base backup restore failed"
1156 exit "$EXIT_BACKUP_ERROR"
1158 elif [[ "$base_backup_path" == *.zst ]]; then
1159 log_info "Decompressing zstd backup..."
1160 if zstd -dc "$base_backup_path" | psql "${psql_args[@]}" >/dev/null 2>&1; then
1161 log_success "Base backup restored successfully"
1163 log_error "Base backup restore failed"
1164 exit "$EXIT_BACKUP_ERROR"
1166 elif [[ "$base_backup_path" == *.lz4 ]]; then
1167 log_info "Decompressing lz4 backup..."
1168 if lz4 -dc "$base_backup_path" | psql "${psql_args[@]}" >/dev/null 2>&1; then
1169 log_success "Base backup restored successfully"
1171 log_error "Base backup restore failed"
1172 exit "$EXIT_BACKUP_ERROR"
1175 # Uncompressed backup
1176 if psql "${psql_args[@]}" -f "$base_backup_path" >/dev/null 2>&1; then
1177 log_success "Base backup restored successfully"
1179 log_error "Base backup restore failed"
1180 exit "$EXIT_BACKUP_ERROR"
1184 # Apply incremental backups
1185 if [[ ${#incremental_files[@]} -gt 0 ]]; then
1186 log_step "Applying incremental backups"
1188 for inc_file in "${incremental_files[@]}"; do
1189 log_info "Applying: $(basename "$inc_file")"
1191 if psql "${psql_args[@]}" -f "$inc_file" >/dev/null 2>&1; then
1192 if [[ "$VERBOSE" -eq 1 ]]; then
1193 log_success "Applied: $(basename "$inc_file")"
1196 log_error "Failed to apply incremental backup: $(basename "$inc_file")"
1197 log_error "Restore is incomplete"
1198 exit "$EXIT_BACKUP_ERROR"
1202 log_success "All incremental backups applied successfully"
1205 # Synchronize sequences
1206 if [[ "$NO_SYNC_SEQUENCES" -eq 0 ]]; then
1207 log_step "Synchronizing sequences"
1209 # Query all sequences and their associated tables
1211 seq_sync_sql=$(query_db "
1214 quote_literal(sn.nspname || '.' || s.relname) ||
1215 ', GREATEST((SELECT COALESCE(MAX(' ||
1216 quote_ident(a.attname) ||
1218 quote_ident(tn.nspname) || '.' || quote_ident(t.relname) ||
1221 JOIN pg_namespace sn ON sn.oid = s.relnamespace
1222 JOIN pg_depend d ON d.objid = s.oid AND d.deptype = 'a'
1223 JOIN pg_class t ON t.oid = d.refobjid
1224 JOIN pg_namespace tn ON tn.oid = t.relnamespace
1225 JOIN pg_attribute a ON a.attrelid = t.oid AND a.attnum = d.refobjsubid
1226 WHERE s.relkind = 'S'
1227 AND sn.nspname NOT IN ('pg_catalog', 'information_schema')
1228 ORDER BY sn.nspname, s.relname;
1231 if [[ -n "$seq_sync_sql" ]]; then
1233 while IFS= read -r sync_cmd; do
1234 if query_db_silent "$sync_cmd"; then
1235 seq_count=$((seq_count + 1))
1236 if [[ "$VERBOSE" -eq 1 ]]; then
1237 log_info "Synced sequence: $(echo "$sync_cmd" | grep -oP "'\K[^']+(?=')")"
1240 log_warning "Failed to sync sequence: $sync_cmd"
1242 done <<< "$seq_sync_sql"
1244 log_success "Synchronized $seq_count sequence(s)"
1246 log_info "No sequences found to synchronize"
1249 log_info "Skipping sequence synchronization (--no-sync-sequences specified)"
1252 # Calculate restore duration
1254 end_time=$(date +%s)
1255 local duration=$((end_time - start_time))
1258 log_step "Restore Statistics"
1260 # Count rows in all tables
1261 log_info "Counting rows in restored tables..."
1263 table_count=$(query_db "SELECT count(*) FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE c.relkind = 'r' AND n.nspname NOT IN ('pg_catalog', 'information_schema');" 2>/dev/null)
1266 total_rows=$(query_db "
1267 SELECT COALESCE(SUM(n_live_tup), 0)
1268 FROM pg_stat_user_tables;
1271 echo -e "${BOLD}Database:${RESET} $DBNAME" >&2
1272 echo -e "${BOLD}Tables Restored:${RESET} $table_count" >&2
1273 echo -e "${BOLD}Total Rows:${RESET} $total_rows (approximate)" >&2
1274 echo -e "${BOLD}Duration:${RESET} ${duration}s" >&2
1275 echo -e "${BOLD}Base Backup:${RESET} $(basename "$base_backup_path")" >&2
1277 if [[ ${#incremental_files[@]} -gt 0 ]]; then
1278 echo -e "${BOLD}Incremental Files:${RESET} ${#incremental_files[@]}" >&2
1281 # Final success message
1283 log_step "Restore Complete"
1284 log_success "Database successfully restored to: $DBNAME"
1285 log_info "Next steps:"
1286 log_info " 1. Verify data integrity:"
1287 log_info " psql -d $DBNAME -c 'SELECT COUNT(*) FROM <your_table>;'"
1288 log_info " 2. Run application smoke tests"
1289 log_info " 3. Switch application to restored database"
1291 exit "$EXIT_SUCCESS"
1295 # --status command implementation
1298 log_step "Checking pg_scribe backup system status"
1300 # Validate required arguments
1301 if [[ -z "$DBNAME" ]]; then
1302 log_error "--status requires -d/--dbname"
1303 exit "$EXIT_VALIDATION_ERROR"
1309 # Track warnings for exit code
1310 local has_warnings=0
1312 # Check replication slot status
1313 log_step "Replication Slot Status"
1315 # Verify replication slot exists
1317 slot_exists=$(query_db "SELECT count(*) FROM pg_replication_slots WHERE slot_name = '$SLOT';")
1319 if [[ "$slot_exists" -eq 0 ]]; then
1320 log_error "Replication slot '$SLOT' does not exist"
1322 log_error "Initialize the backup system first:"
1323 log_error " pg_scribe --init -d $DBNAME -f <backup_dir> -S $SLOT"
1324 exit "$EXIT_SLOT_ERROR"
1327 # Query slot details
1329 slot_info=$(query_db "
1336 confirmed_flush_lsn,
1337 pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn) as restart_lag_bytes,
1338 pg_wal_lsn_diff(pg_current_wal_lsn(), confirmed_flush_lsn) as confirmed_lag_bytes,
1339 pg_current_wal_lsn() as current_lsn
1340 FROM pg_replication_slots
1341 WHERE slot_name = '$SLOT';
1345 IFS='|' read -r slot_name slot_type db_name active restart_lsn confirmed_flush_lsn restart_lag_bytes confirmed_lag_bytes current_lsn <<< "$slot_info"
1347 # Display slot information
1348 echo -e "${BOLD}Slot Name:${RESET} $slot_name" >&2
1349 echo -e "${BOLD}Slot Type:${RESET} $slot_type" >&2
1350 echo -e "${BOLD}Database:${RESET} $db_name" >&2
1352 if [[ "$active" == "t" ]]; then
1353 echo -e "${BOLD}Active:${RESET} ${GREEN}Yes${RESET}" >&2
1355 echo -e "${BOLD}Active:${RESET} ${YELLOW}No${RESET}" >&2
1356 log_warning "Replication slot is not active"
1360 echo -e "${BOLD}Current WAL LSN:${RESET} $current_lsn" >&2
1361 echo -e "${BOLD}Restart LSN:${RESET} $restart_lsn" >&2
1362 echo -e "${BOLD}Confirmed LSN:${RESET} $confirmed_flush_lsn" >&2
1364 # Format lag in human-readable sizes
1365 local restart_lag_mb=$((restart_lag_bytes / 1024 / 1024))
1366 local confirmed_lag_mb=$((confirmed_lag_bytes / 1024 / 1024))
1368 # Check lag thresholds (based on design doc)
1369 if [[ "$restart_lag_bytes" -gt 10737418240 ]]; then
1371 echo -e "${BOLD}Restart Lag:${RESET} ${RED}${restart_lag_mb} MB (CRITICAL!)${RESET}" >&2
1372 log_error "CRITICAL: Replication lag exceeds 10GB!"
1373 log_error " This may cause disk space issues or database shutdown"
1374 log_error " Consider dropping the slot if backup collection has stopped"
1376 elif [[ "$restart_lag_bytes" -gt 1073741824 ]]; then
1378 echo -e "${BOLD}Restart Lag:${RESET} ${YELLOW}${restart_lag_mb} MB (WARNING)${RESET}" >&2
1379 log_warning "Replication lag exceeds 1GB"
1380 log_warning " Ensure backup collection is running and healthy"
1383 echo -e "${BOLD}Restart Lag:${RESET} ${GREEN}${restart_lag_mb} MB${RESET}" >&2
1386 if [[ "$confirmed_lag_bytes" -gt 10737418240 ]]; then
1387 echo -e "${BOLD}Confirmed Lag:${RESET} ${RED}${confirmed_lag_mb} MB (CRITICAL!)${RESET}" >&2
1389 elif [[ "$confirmed_lag_bytes" -gt 1073741824 ]]; then
1390 echo -e "${BOLD}Confirmed Lag:${RESET} ${YELLOW}${confirmed_lag_mb} MB (WARNING)${RESET}" >&2
1393 echo -e "${BOLD}Confirmed Lag:${RESET} ${GREEN}${confirmed_lag_mb} MB${RESET}" >&2
1396 # Check slot age (if we can determine it)
1397 # Note: pg_replication_slots doesn't directly track creation time, but we can estimate from WAL
1400 # Analyze backup directory if provided
1401 if [[ -n "$FILE" ]]; then
1402 log_step "Backup Directory Analysis"
1404 if [[ ! -d "$FILE" ]]; then
1405 log_warning "Backup directory does not exist: $FILE"
1408 # Count base backups
1410 base_count=$(find "$FILE" -maxdepth 1 -name 'base-*.sql' 2>/dev/null | wc -l)
1411 echo -e "${BOLD}Base Backups:${RESET} $base_count" >&2
1413 if [[ "$base_count" -gt 0 ]]; then
1414 # Show latest base backup
1416 latest_base=$(find "$FILE" -maxdepth 1 -name 'base-*.sql' -printf '%T@ %p\n' 2>/dev/null | sort -rn | head -1 | cut -d' ' -f2-)
1418 base_name=$(basename "$latest_base")
1420 base_date=$(stat -c %y "$latest_base" 2>/dev/null | cut -d. -f1)
1421 echo -e "${BOLD}Latest Base:${RESET} $base_name ($base_date)" >&2
1423 # Show base backup size
1425 base_size=$(du -h "$latest_base" 2>/dev/null | cut -f1)
1426 echo -e "${BOLD}Base Size:${RESET} $base_size" >&2
1428 log_warning "No base backups found in directory"
1432 # Count globals backups
1434 globals_count=$(find "$FILE" -maxdepth 1 -name 'globals-*.sql' 2>/dev/null | wc -l)
1435 echo -e "${BOLD}Globals Backups:${RESET} $globals_count" >&2
1437 # Check for incremental backup files
1438 # Note: Incremental files are created by --start command
1439 # They may have various names depending on configuration
1440 local incremental_count
1441 incremental_count=$(find "$FILE" -maxdepth 1 -name '*.sql' ! -name 'base-*.sql' ! -name 'globals-*.sql' 2>/dev/null | wc -l)
1443 if [[ "$incremental_count" -gt 0 ]]; then
1444 echo -e "${BOLD}Incremental Files:${RESET} $incremental_count" >&2
1446 # Calculate total size of incremental files
1447 local incremental_size
1448 incremental_size=$(find "$FILE" -maxdepth 1 -name '*.sql' ! -name 'base-*.sql' ! -name 'globals-*.sql' -exec du -ch {} + 2>/dev/null | grep total | cut -f1)
1449 echo -e "${BOLD}Incremental Size:${RESET} $incremental_size" >&2
1451 # Show most recent incremental file
1452 local latest_incremental
1453 latest_incremental=$(find "$FILE" -maxdepth 1 -name '*.sql' ! -name 'base-*.sql' ! -name 'globals-*.sql' -printf '%T@ %p\n' 2>/dev/null | sort -rn | head -1 | cut -d' ' -f2-)
1454 if [[ -n "$latest_incremental" ]]; then
1456 inc_name=$(basename "$latest_incremental")
1458 inc_date=$(stat -c %y "$latest_incremental" 2>/dev/null | cut -d. -f1)
1459 local inc_age_seconds
1460 inc_age_seconds=$(( $(date +%s) - $(stat -c %Y "$latest_incremental" 2>/dev/null) ))
1461 local inc_age_minutes=$((inc_age_seconds / 60))
1463 echo -e "${BOLD}Latest Incremental:${RESET} $inc_name ($inc_date)" >&2
1465 # Warn if last incremental is old
1466 if [[ "$inc_age_minutes" -gt 60 ]]; then
1467 log_warning "Last incremental backup is ${inc_age_minutes} minutes old"
1468 log_warning " Verify that backup collection (--start) is running"
1473 log_warning "No incremental backup files found"
1474 log_warning " Start incremental backup collection with: pg_scribe --start"
1478 # Check for metadata file
1479 if [[ -f "$FILE/pg_scribe_metadata.txt" ]]; then
1480 echo -e "${BOLD}Metadata File:${RESET} Present" >&2
1482 # Extract some metadata
1483 local pg_version_line
1484 pg_version_line=$(grep "PostgreSQL" "$FILE/pg_scribe_metadata.txt" 2>/dev/null | head -1)
1485 if [[ -n "$pg_version_line" ]]; then
1486 echo -e "${BOLD}Backup PG Version:${RESET} $pg_version_line" >&2
1489 log_warning "Metadata file not found"
1493 # Calculate total backup directory size
1495 total_size=$(du -sh "$FILE" 2>/dev/null | cut -f1)
1496 echo -e "${BOLD}Total Directory Size:${RESET} $total_size" >&2
1500 # Overall health summary
1502 log_step "Health Summary"
1504 if [[ "$has_warnings" -eq 0 ]]; then
1505 log_success "System is healthy"
1507 log_info "Replication slot is active and lag is acceptable"
1508 if [[ -n "$FILE" ]]; then
1509 log_info "Backup directory appears healthy"
1511 exit "$EXIT_SUCCESS"
1513 log_warning "System has warnings - review messages above"
1515 log_info "Address any CRITICAL or WARNING issues promptly"
1516 log_info "See design doc for monitoring recommendations"
1517 exit "$EXIT_WARNING"
1542 log_error "Unknown action: $ACTION"
1543 exit "$EXIT_GENERAL_ERROR"
1548 # Run main with all arguments