3 # pg_scribe - Incremental SQL backup system for PostgreSQL
5 # This script provides a unified CLI for managing PostgreSQL backups
6 # using logical replication and plain SQL format.
16 EXIT_CONNECTION_ERROR=2
19 EXIT_VALIDATION_ERROR=5
23 DEFAULT_SLOT="pg_scribe"
25 DEFAULT_HOST="localhost"
26 DEFAULT_STATUS_INTERVAL=10
27 DEFAULT_FSYNC_INTERVAL=10
32 HOST="${PGHOST:-$DEFAULT_HOST}"
33 PORT="${PGPORT:-$DEFAULT_PORT}"
34 USERNAME="${PGUSER:-${USER:-}}"
37 STATUS_INTERVAL="$DEFAULT_STATUS_INTERVAL"
38 FSYNC_INTERVAL="$DEFAULT_FSYNC_INTERVAL"
48 # Color output support
49 if [[ "${PG_COLOR:-auto}" == "always" ]] || [[ "${PG_COLOR:-auto}" == "auto" && -t 2 ]]; then
65 # Logging functions (output to stderr)
67 echo -e "${BLUE}INFO:${RESET} $*" >&2
71 echo -e "${GREEN}SUCCESS:${RESET} $*" >&2
75 echo -e "${YELLOW}WARNING:${RESET} $*" >&2
79 echo -e "${RED}ERROR:${RESET} $*" >&2
83 echo -e "${BOLD}==>${RESET} $*" >&2
89 pg_scribe - Incremental SQL backup system for PostgreSQL
92 pg_scribe --init [OPTIONS]
93 pg_scribe --start [OPTIONS]
94 pg_scribe --full-backup [OPTIONS]
95 pg_scribe --restore [OPTIONS]
96 pg_scribe --status [OPTIONS]
100 Actions (exactly one required):
101 --init Initialize backup system
102 --start Start streaming incremental backups
103 --full-backup Take a full backup
104 --restore Restore from backups
105 --status Check replication slot status
106 -V, --version Print version and exit
107 -?, --help Show this help and exit
110 -d, --dbname=DBNAME Database name (can be connection string)
111 -h, --host=HOSTNAME Database server host (default: $DEFAULT_HOST)
112 -p, --port=PORT Database server port (default: $DEFAULT_PORT)
113 -U, --username=NAME Database user (default: \$PGUSER or \$USER)
114 -w, --no-password Never prompt for password
115 -W, --password Force password prompt
118 -v, --verbose Enable verbose mode
121 -f, --file=DIRECTORY Backup output directory (required)
122 -S, --slot=SLOTNAME Replication slot name (default: $DEFAULT_SLOT)
123 --force Skip validation and force initialization
126 -f, --file=FILENAME Output file (use '-' for stdout, required)
127 -S, --slot=SLOTNAME Replication slot name (default: $DEFAULT_SLOT)
128 -s, --status-interval=SECS Status update interval (default: $DEFAULT_STATUS_INTERVAL)
129 -F, --fsync-interval=SECS Fsync interval (default: $DEFAULT_FSYNC_INTERVAL, 0 to disable)
131 Options for --full-backup:
132 -f, --file=DIRECTORY Backup output directory (required)
133 -Z, --compress=METHOD Compression: gzip, lz4, zstd, or none (default: gzip)
135 Options for --restore:
136 -f, --file=DIRECTORY Backup input directory (required)
137 -d, --dbname=DBNAME Target database name (required)
138 -C, --create Create target database
139 --base-backup=FILE Specific base backup file (default: latest)
140 --no-sync-sequences Skip sequence synchronization
142 Options for --status:
143 -S, --slot=SLOTNAME Replication slot name (default: $DEFAULT_SLOT)
144 -f, --file=DIRECTORY Backup directory to analyze (optional)
149 2 Database connection error
150 3 Replication slot error
151 4 Backup/restore error
152 5 Invalid arguments or validation failure
154 Environment Variables:
155 PGHOST, PGPORT, PGDATABASE, PGUSER, PGPASSWORD, PG_COLOR
157 Report bugs to: https://github.com/your-repo/pg_scribe/issues
161 # Parse command line arguments
163 if [[ $# -eq 0 ]]; then
165 exit "$EXIT_VALIDATION_ERROR"
168 while [[ $# -gt 0 ]]; do
171 [[ -n "$ACTION" ]] && { log_error "Multiple action flags specified"; exit "$EXIT_VALIDATION_ERROR"; }
176 [[ -n "$ACTION" ]] && { log_error "Multiple action flags specified"; exit "$EXIT_VALIDATION_ERROR"; }
181 [[ -n "$ACTION" ]] && { log_error "Multiple action flags specified"; exit "$EXIT_VALIDATION_ERROR"; }
186 [[ -n "$ACTION" ]] && { log_error "Multiple action flags specified"; exit "$EXIT_VALIDATION_ERROR"; }
191 [[ -n "$ACTION" ]] && { log_error "Multiple action flags specified"; exit "$EXIT_VALIDATION_ERROR"; }
196 echo "pg_scribe $VERSION"
251 -s|--status-interval)
256 STATUS_INTERVAL="${1#*=}"
264 FSYNC_INTERVAL="${1#*=}"
284 BASE_BACKUP="${1#*=}"
308 log_error "Unknown option: $1"
310 exit "$EXIT_VALIDATION_ERROR"
315 # Validate action was specified
316 if [[ -z "$ACTION" ]]; then
317 log_error "No action specified"
319 exit "$EXIT_VALIDATION_ERROR"
322 # Use PGDATABASE if dbname not specified
323 if [[ -z "$DBNAME" && -n "${PGDATABASE:-}" ]]; then
328 # Build psql connection string
332 [[ -n "$DBNAME" ]] && args+=(-d "$DBNAME")
333 [[ -n "$HOST" ]] && args+=(-h "$HOST")
334 [[ -n "$PORT" ]] && args+=(-p "$PORT")
335 [[ -n "$USERNAME" ]] && args+=(-U "$USERNAME")
336 [[ "$NO_PASSWORD" -eq 1 ]] && args+=(-w)
337 [[ "$FORCE_PASSWORD" -eq 1 ]] && args+=(-W)
339 printf '%s\n' "${args[@]}"
342 # Build pg_recvlogical connection string
343 build_pg_recvlogical_args() {
346 [[ -n "$DBNAME" ]] && args+=(-d "$DBNAME")
347 [[ -n "$HOST" ]] && args+=(-h "$HOST")
348 [[ -n "$PORT" ]] && args+=(-p "$PORT")
349 [[ -n "$USERNAME" ]] && args+=(-U "$USERNAME")
350 [[ "$NO_PASSWORD" -eq 1 ]] && args+=(-w)
351 [[ "$FORCE_PASSWORD" -eq 1 ]] && args+=(-W)
353 printf '%s\n' "${args[@]}"
356 # Build pg_dumpall connection arguments (no -d flag)
357 build_pg_dumpall_args() {
360 [[ -n "$HOST" ]] && args+=(-h "$HOST")
361 [[ -n "$PORT" ]] && args+=(-p "$PORT")
362 [[ -n "$USERNAME" ]] && args+=(-U "$USERNAME")
363 [[ "$NO_PASSWORD" -eq 1 ]] && args+=(-w)
364 [[ "$FORCE_PASSWORD" -eq 1 ]] && args+=(-W)
366 printf '%s\n' "${args[@]}"
369 # Generate standardized backup timestamp
370 get_backup_timestamp() {
374 # Get human-readable file size
378 # Echoes the file size in human-readable format (e.g., "1.2M", "5.4K")
381 du -h "$file_path" 2>/dev/null | cut -f1
384 # Test database connection
386 log_step "Testing database connection..."
389 mapfile -t psql_args < <(build_psql_args)
391 if ! psql "${psql_args[@]}" -c "SELECT version();" >/dev/null 2>&1; then
392 log_error "Failed to connect to database"
393 log_error "Connection details: host=$HOST port=$PORT dbname=$DBNAME user=$USERNAME"
394 exit "$EXIT_CONNECTION_ERROR"
397 if [[ "$VERBOSE" -eq 1 ]]; then
398 log_success "Connected to database"
402 # Execute SQL query and return result
406 mapfile -t psql_args < <(build_psql_args)
407 psql "${psql_args[@]}" -t -A -c "$sql" 2>&1
410 # Execute SQL query silently (return exit code only)
414 mapfile -t psql_args < <(build_psql_args)
415 psql "${psql_args[@]}" -t -A -c "$sql" >/dev/null 2>&1
418 # Take a globals backup (roles, tablespaces, etc.)
420 # $1 - backup directory path
421 # $2 - optional: timestamp (if not provided, generates new one)
423 # Echoes the path to the created globals backup file
424 # Exits script on failure
425 take_globals_backup() {
426 local backup_dir="$1"
427 local timestamp="${2:-$(get_backup_timestamp)}"
428 local globals_backup_file="$backup_dir/globals-${timestamp}.sql"
430 log_info "Taking globals backup: $globals_backup_file"
432 # Build pg_dumpall connection arguments
434 mapfile -t dumpall_args < <(build_pg_dumpall_args)
436 # Add globals-only flag and output file
437 dumpall_args+=(--globals-only)
438 dumpall_args+=(--file="$globals_backup_file")
440 if pg_dumpall "${dumpall_args[@]}"; then
442 globals_size=$(get_file_size "$globals_backup_file")
443 log_success "Globals backup completed: $globals_backup_file ($globals_size)"
444 echo "$globals_backup_file"
446 log_error "Globals backup failed"
447 # Clean up partial file
448 rm -f "$globals_backup_file" 2>/dev/null || true
449 exit "$EXIT_BACKUP_ERROR"
453 # Validate required arguments for a command
454 # Arguments: command_name arg_name:description [arg_name:description ...]
455 # Example: validate_required_args "init" "DBNAME:database" "FILE:backup directory"
456 validate_required_args() {
457 local command_name="$1"
460 local validation_failed=0
462 for arg_spec in "$@"; do
463 local arg_name="${arg_spec%%:*}"
464 local arg_description="${arg_spec#*:}"
466 # Use indirect variable reference to check if argument is set
467 if [[ -z "${!arg_name}" ]]; then
468 log_error "--${command_name} requires ${arg_description}"
473 if [[ "$validation_failed" -eq 1 ]]; then
474 exit "$EXIT_VALIDATION_ERROR"
478 # Check replication slot existence
481 # $2 - should_exist: 1 if slot should exist, 0 if slot should NOT exist
482 # Exits with appropriate error code if expectation is not met
483 check_replication_slot() {
485 local should_exist="$2"
488 slot_exists=$(query_db "SELECT count(*) FROM pg_replication_slots WHERE slot_name = '$slot_name';")
490 if [[ "$should_exist" -eq 0 ]]; then
491 # Slot should NOT exist
492 if [[ "$slot_exists" -gt 0 ]]; then
493 log_error "Replication slot '$slot_name' already exists"
495 log_error "A replication slot with this name already exists in the database."
496 log_error "This may indicate:"
497 log_error " - A previous initialization that was not cleaned up"
498 log_error " - Another pg_scribe instance using the same slot name"
500 log_error "To resolve:"
501 log_error " - Use a different slot name with -S/--slot option"
502 log_error " - Or drop the existing slot (if safe):"
503 log_error " psql -d $DBNAME -c \"SELECT pg_drop_replication_slot('$slot_name');\""
504 exit "$EXIT_SLOT_ERROR"
508 if [[ "$slot_exists" -eq 0 ]]; then
509 log_error "Replication slot '$slot_name' does not exist"
511 log_error "You must initialize the backup system first:"
512 log_error " pg_scribe --init -d $DBNAME -f <backup_dir> -S $slot_name"
514 log_error "Or verify the slot name is correct with:"
515 log_error " psql -d $DBNAME -c \"SELECT slot_name FROM pg_replication_slots;\""
516 exit "$EXIT_SLOT_ERROR"
518 log_success "Replication slot '$slot_name' found"
523 # --init command implementation
526 log_step "Initializing pg_scribe backup system"
528 # Validate required arguments
529 validate_required_args "init" "DBNAME:-d/--dbname" "FILE:-f/--file (backup directory)"
531 # Cleanup tracking for failure handling
532 local CREATED_SLOT=""
533 local CREATED_FILES=()
535 # Cleanup function for handling failures
536 # shellcheck disable=SC2317 # Function called via trap handler
537 cleanup_on_failure() {
540 # Only cleanup on actual failure, not on successful exit
541 if [[ $exit_code -ne 0 && $exit_code -ne $EXIT_WARNING ]]; then
542 log_info "Cleaning up after failed initialization..."
544 # Drop replication slot if we created it
545 if [[ -n "$CREATED_SLOT" ]]; then
546 log_info "Dropping replication slot '$CREATED_SLOT'..."
547 query_db "SELECT pg_drop_replication_slot('$CREATED_SLOT');" 2>/dev/null || true
550 # Remove files we created
551 for file in "${CREATED_FILES[@]}"; do
552 if [[ -f "$file" ]]; then
553 log_info "Removing partial file: $file"
554 rm -f "$file" 2>/dev/null || true
558 log_info "Cleanup complete"
562 # Set up cleanup trap
563 trap cleanup_on_failure EXIT INT TERM
565 # Test connection first
568 # Phase 1: Validation
569 log_step "Phase 1: Validation"
571 local validation_failed=0
575 log_info "Checking wal_level configuration..."
577 wal_level=$(query_db "SHOW wal_level;")
578 if [[ "$wal_level" != "logical" ]]; then
579 log_error "CRITICAL: wal_level is '$wal_level', must be 'logical'"
580 log_error " Fix: Add 'wal_level = logical' to postgresql.conf and restart PostgreSQL"
583 if [[ "$VERBOSE" -eq 1 ]]; then
584 log_success "wal_level = logical"
588 # Check max_replication_slots
589 log_info "Checking max_replication_slots configuration..."
591 max_slots=$(query_db "SHOW max_replication_slots;")
592 if [[ "$max_slots" -lt 1 ]]; then
593 log_error "CRITICAL: max_replication_slots is $max_slots, must be >= 1"
594 log_error " Fix: Add 'max_replication_slots = 10' to postgresql.conf and restart PostgreSQL"
597 if [[ "$VERBOSE" -eq 1 ]]; then
598 log_success "max_replication_slots = $max_slots"
602 # Check max_wal_senders
603 log_info "Checking max_wal_senders configuration..."
605 max_senders=$(query_db "SHOW max_wal_senders;")
606 if [[ "$max_senders" -lt 1 ]]; then
607 log_error "CRITICAL: max_wal_senders is $max_senders, must be >= 1"
608 log_error " Fix: Add 'max_wal_senders = 10' to postgresql.conf and restart PostgreSQL"
611 if [[ "$VERBOSE" -eq 1 ]]; then
612 log_success "max_wal_senders = $max_senders"
616 # Check replica identity on all tables
617 log_info "Checking replica identity for all tables..."
619 bad_tables=$(query_db "
620 SELECT n.nspname || '.' || c.relname
622 JOIN pg_namespace n ON n.oid = c.relnamespace
623 WHERE c.relkind = 'r'
624 AND n.nspname NOT IN ('pg_catalog', 'information_schema')
625 AND c.relreplident IN ('d', 'n')
627 SELECT 1 FROM pg_index i
628 WHERE i.indrelid = c.oid AND i.indisprimary
630 ORDER BY n.nspname, c.relname;
633 if [[ -n "$bad_tables" ]]; then
634 log_error "CRITICAL: The following tables lack adequate replica identity:"
635 while IFS= read -r table; do
636 log_error " - $table"
637 done <<< "$bad_tables"
638 log_error " Fix: Add a primary key or set replica identity:"
639 log_error " ALTER TABLE <table> ADD PRIMARY KEY (id);"
640 log_error " -- OR --"
641 log_error " ALTER TABLE <table> REPLICA IDENTITY FULL;"
644 if [[ "$VERBOSE" -eq 1 ]]; then
645 log_success "All tables have adequate replica identity"
649 # Warning: Check for unlogged tables
650 log_info "Checking for unlogged tables..."
651 local unlogged_tables
652 unlogged_tables=$(query_db "
653 SELECT n.nspname || '.' || c.relname
655 JOIN pg_namespace n ON n.oid = c.relnamespace
656 WHERE c.relkind = 'r'
657 AND c.relpersistence = 'u'
658 AND n.nspname NOT IN ('pg_catalog', 'information_schema')
659 ORDER BY n.nspname, c.relname;
662 if [[ -n "$unlogged_tables" ]]; then
663 log_warning "The following unlogged tables will NOT be backed up:"
664 while IFS= read -r table; do
665 log_warning " - $table"
666 done <<< "$unlogged_tables"
670 # Warning: Check for large objects
671 log_info "Checking for large objects..."
672 local large_object_count
673 large_object_count=$(query_db "SELECT count(*) FROM pg_largeobject_metadata;")
675 if [[ "$large_object_count" -gt 0 ]]; then
676 log_warning "Database contains $large_object_count large objects"
677 log_warning "Large objects are NOT incrementally backed up (only in full backups)"
678 log_warning "Consider using BYTEA columns instead for incremental backup support"
682 # Check if validation failed
683 if [[ "$validation_failed" -eq 1 ]]; then
684 if [[ "$FORCE" -eq 1 ]]; then
685 log_warning "Validation failed but --force specified, continuing anyway..."
687 log_error "Validation failed. Fix the CRITICAL issues above and try again."
688 log_error "Or use --force to skip validation (NOT recommended)."
689 exit "$EXIT_VALIDATION_ERROR"
692 log_success "All validation checks passed"
696 log_step "Phase 2: Setup"
698 # Create backup directory
699 log_info "Checking backup directory..."
700 if [[ ! -d "$FILE" ]]; then
701 if ! mkdir -p "$FILE"; then
702 log_error "Failed to create backup directory: $FILE"
703 exit "$EXIT_BACKUP_ERROR"
705 log_success "Created backup directory: $FILE"
707 # Directory exists - check if already initialized
708 if [[ -f "$FILE/pg_scribe_metadata.txt" ]]; then
709 log_error "Backup directory already initialized: $FILE"
710 log_error "Metadata file exists: $FILE/pg_scribe_metadata.txt"
712 log_error "This directory has already been initialized with pg_scribe."
713 log_error "To take an additional full backup, use: pg_scribe --full-backup"
715 log_error "If you want to re-initialize from scratch:"
716 log_error " 1. Stop any running backup processes"
717 log_error " 2. Drop the replication slot (or verify it's safe to reuse)"
718 log_error " 3. Remove or rename the existing backup directory"
719 exit "$EXIT_VALIDATION_ERROR"
722 # Directory exists but not initialized - check if empty
723 if [[ -n "$(ls -A "$FILE" 2>/dev/null)" ]]; then
724 log_error "Backup directory is not empty: $FILE"
725 log_error "The backup directory must be empty for initialization."
726 log_error "Found existing files:"
727 # shellcheck disable=SC2012 # ls used for user-friendly display, not processing
728 ls -lh "$FILE" | head -10 >&2
729 exit "$EXIT_VALIDATION_ERROR"
732 log_info "Using existing empty directory: $FILE"
735 # Create wal2sql extension
736 log_info "Creating wal2sql extension..."
737 if query_db_silent "CREATE EXTENSION IF NOT EXISTS wal2sql;"; then
738 log_success "wal2sql extension created (or already exists)"
740 log_error "Failed to create wal2sql extension"
741 log_error "Ensure wal2sql.so is installed in PostgreSQL's lib directory"
742 log_error "Run: cd wal2sql && make && make install"
743 exit "$EXIT_GENERAL_ERROR"
746 # Create replication slot with snapshot export
747 log_info "Creating logical replication slot '$SLOT'..."
749 # Check if slot already exists
750 check_replication_slot "$SLOT" 0
752 # Create slot using SQL
753 # Note: For POC, we create the slot and take the base backup sequentially
754 # The slot will preserve WAL from its creation LSN forward, ensuring no changes are lost
756 if ! slot_result=$(query_db "SELECT slot_name, lsn FROM pg_create_logical_replication_slot('$SLOT', 'wal2sql');"); then
757 log_error "Failed to create replication slot"
758 log_error "$slot_result"
759 exit "$EXIT_SLOT_ERROR"
762 CREATED_SLOT="$SLOT" # Track for cleanup
763 log_success "Replication slot '$SLOT' created"
765 # Take base backup immediately after slot creation
766 # The slot preserves WAL from its creation point, so all changes will be captured
768 timestamp=$(get_backup_timestamp)
769 local base_backup_file="$FILE/base-${timestamp}.sql"
770 CREATED_FILES+=("$base_backup_file") # Track for cleanup
771 log_info "Taking base backup: $base_backup_file"
774 mapfile -t psql_args < <(build_psql_args)
775 if pg_dump "${psql_args[@]}" --file="$base_backup_file"; then
776 log_success "Base backup completed: $base_backup_file"
778 log_error "Base backup failed"
779 exit "$EXIT_BACKUP_ERROR"
782 # Take globals backup (using same timestamp for consistency)
783 local globals_backup_file
784 globals_backup_file=$(take_globals_backup "$FILE" "$timestamp")
785 CREATED_FILES+=("$globals_backup_file") # Track for cleanup
787 # Generate metadata file
788 log_info "Generating metadata file..."
789 local metadata_file="$FILE/pg_scribe_metadata.txt"
790 CREATED_FILES+=("$metadata_file") # Track for cleanup
792 pg_version=$(query_db "SELECT version();")
794 cat > "$metadata_file" <<EOF
795 pg_scribe Backup System Metadata
796 =================================
798 Generated: $(date -u +"%Y-%m-%d %H:%M:%S UTC")
799 pg_scribe Version: $VERSION
805 Replication Slot: $SLOT
808 $(query_db "SELECT extname || ' ' || extversion FROM pg_extension ORDER BY extname;")
810 Encoding: $(query_db "SELECT pg_encoding_to_char(encoding) FROM pg_database WHERE datname = '$DBNAME';")
812 Collation: $(query_db "SELECT datcollate FROM pg_database WHERE datname = '$DBNAME';")
815 log_success "Metadata file created: $metadata_file"
817 # Disable cleanup trap on successful completion
822 log_step "Initialization Complete"
823 log_success "Backup directory: $FILE"
824 log_success "Replication slot: $SLOT"
825 log_info "Next steps:"
826 log_info " 1. Start streaming incremental backups:"
827 log_info " pg_scribe --start -d $DBNAME -f $FILE/incremental.sql -S $SLOT"
828 log_info " 2. Monitor replication slot health:"
829 log_info " pg_scribe --status -d $DBNAME -S $SLOT"
831 if [[ "$has_warnings" -eq 1 ]]; then
839 # --start command implementation
842 log_step "Starting incremental backup collection"
844 # Validate required arguments
845 validate_required_args "start" "DBNAME:-d/--dbname" "FILE:-f/--file (output file, or '-' for stdout)"
850 # Verify replication slot exists
851 log_step "Verifying replication slot '$SLOT'..."
852 check_replication_slot "$SLOT" 1
854 # Build pg_recvlogical arguments
855 local pg_recv_args=()
856 mapfile -t pg_recv_args < <(build_pg_recvlogical_args)
858 # Add required arguments
859 pg_recv_args+=(--slot="$SLOT")
860 pg_recv_args+=(--start)
861 pg_recv_args+=(--file="$FILE")
864 pg_recv_args+=(--option=include_transaction=on)
866 # Add status interval
867 pg_recv_args+=(--status-interval="$STATUS_INTERVAL")
869 # Add fsync interval (0 means disabled)
870 if [[ "$FSYNC_INTERVAL" -gt 0 ]]; then
871 pg_recv_args+=(--fsync-interval="$FSYNC_INTERVAL")
873 # For fsync-interval=0, we skip the parameter to avoid pg_recvlogical errors
874 log_info "Fsync disabled (fsync-interval=0)"
877 # Display configuration
878 log_step "Configuration"
879 log_info "Database: $DBNAME"
880 log_info "Replication slot: $SLOT"
881 log_info "Output file: $FILE"
882 log_info "Status interval: ${STATUS_INTERVAL}s"
883 if [[ "$FSYNC_INTERVAL" -gt 0 ]]; then
884 log_info "Fsync interval: ${FSYNC_INTERVAL}s"
886 log_info "Fsync: disabled"
890 # Start streaming - replace this process with pg_recvlogical
891 log_step "Starting streaming replication..."
892 log_info "Press Ctrl+C to stop"
893 log_info "Send SIGHUP to rotate output file"
896 # Replace this process with pg_recvlogical
897 # This eliminates signal forwarding issues and prevents orphaned processes
898 # The PID stays the same, making cleanup in tests more reliable
899 exec pg_recvlogical "${pg_recv_args[@]}"
903 # --full-backup command implementation
906 log_step "Taking full backup"
908 # Validate required arguments
909 validate_required_args "full-backup" "DBNAME:-d/--dbname" "FILE:-f/--file (backup directory)"
914 # Ensure backup directory exists
915 if [[ ! -d "$FILE" ]]; then
916 log_error "Backup directory does not exist: $FILE"
917 log_error "Create the directory first or run --init to initialize the backup system"
918 exit "$EXIT_BACKUP_ERROR"
921 # Set compression method (default: gzip)
922 local compress_method="${COMPRESS:-gzip}"
923 if [[ "$compress_method" == "none" ]]; then
927 # Generate timestamped filenames
929 timestamp=$(get_backup_timestamp)
930 local base_backup_file="$FILE/base-${timestamp}.sql"
932 # Add compression extension to base backup if applicable
933 # Note: We don't compress globals since it's typically very small (< 1KB)
934 if [[ -n "$compress_method" ]]; then
935 # Extract compression type (before colon)
936 local compress_type="${compress_method%%:*}"
937 case "$compress_type" in
939 base_backup_file="${base_backup_file}.gz"
942 base_backup_file="${base_backup_file}.lz4"
945 base_backup_file="${base_backup_file}.zst"
948 log_error "Unknown compression method: $compress_type"
949 log_error "Supported methods: gzip, lz4, zstd, none"
950 exit "$EXIT_VALIDATION_ERROR"
956 log_info "Taking base backup: $base_backup_file"
957 if [[ -n "$compress_method" ]]; then
958 log_info "Compression: $compress_method"
962 mapfile -t psql_args < <(build_psql_args)
964 # Build pg_dump command
965 local pg_dump_args=("${psql_args[@]}")
966 if [[ -n "$compress_method" ]]; then
967 pg_dump_args+=(--compress="$compress_method")
969 pg_dump_args+=(--file="$base_backup_file")
971 if pg_dump "${pg_dump_args[@]}"; then
973 backup_size=$(get_file_size "$base_backup_file")
974 log_success "Base backup completed: $base_backup_file ($backup_size)"
976 log_error "Base backup failed"
977 # Clean up partial file
978 rm -f "$base_backup_file" 2>/dev/null || true
979 exit "$EXIT_BACKUP_ERROR"
982 # Take globals backup (uncompressed - typically < 1KB, not worth compressing)
983 local globals_backup_file
984 globals_backup_file=$(take_globals_backup "$FILE" "$timestamp")
986 # Generate/update metadata file
987 log_info "Updating metadata file..."
988 local metadata_file="$FILE/pg_scribe_metadata.txt"
990 pg_version=$(query_db "SELECT version();")
992 cat > "$metadata_file" <<EOF
993 pg_scribe Backup System Metadata
994 =================================
996 Last Updated: $(date -u +"%Y-%m-%d %H:%M:%S UTC")
997 pg_scribe Version: $VERSION
1005 Base: $(basename "$base_backup_file")
1006 Globals: $(basename "$globals_backup_file")
1007 Timestamp: $timestamp
1010 $(query_db "SELECT extname || ' ' || extversion FROM pg_extension ORDER BY extname;")
1012 Encoding: $(query_db "SELECT pg_encoding_to_char(encoding) FROM pg_database WHERE datname = '$DBNAME';")
1014 Collation: $(query_db "SELECT datcollate FROM pg_database WHERE datname = '$DBNAME';")
1017 log_success "Metadata file updated: $metadata_file"
1021 log_step "Full Backup Complete"
1022 log_success "Base backup: $base_backup_file"
1023 log_success "Globals backup: $globals_backup_file"
1024 log_success "Backup directory: $FILE"
1026 exit "$EXIT_SUCCESS"
1030 # --restore command implementation
1033 log_step "Restoring database from backup"
1035 # Validate required arguments
1036 validate_required_args "restore" "DBNAME:-d/--dbname (target database)" "FILE:-f/--file (backup directory)"
1038 # Verify backup directory exists
1039 if [[ ! -d "$FILE" ]]; then
1040 log_error "Backup directory does not exist: $FILE"
1041 exit "$EXIT_BACKUP_ERROR"
1045 log_step "Locating backups"
1046 local base_backup_path=""
1048 if [[ -n "$BASE_BACKUP" ]]; then
1049 # Use specified base backup
1050 if [[ ! -f "$BASE_BACKUP" ]]; then
1051 log_error "Specified base backup not found: $BASE_BACKUP"
1052 exit "$EXIT_BACKUP_ERROR"
1054 base_backup_path="$BASE_BACKUP"
1055 log_info "Using specified base backup: $(basename "$base_backup_path")"
1057 # Find latest base backup (uncompressed or compressed)
1059 latest_base=$(find "$FILE" -maxdepth 1 \( -name 'base-*.sql' -o -name 'base-*.sql.gz' -o -name 'base-*.sql.zst' -o -name 'base-*.sql.lz4' \) -printf '%T@ %p\n' 2>/dev/null | sort -rn | head -1 | cut -d' ' -f2-)
1061 if [[ -z "$latest_base" ]]; then
1062 log_error "No base backup found in directory: $FILE"
1063 log_error "Run --init or --full-backup first to create a base backup"
1064 exit "$EXIT_BACKUP_ERROR"
1067 base_backup_path="$latest_base"
1068 log_info "Found base backup: $(basename "$base_backup_path")"
1071 # Extract timestamp from base backup filename for finding matching globals
1072 local base_timestamp
1073 base_timestamp=$(basename "$base_backup_path" | sed -E 's/base-([0-9]{8}-[0-9]{6}).*/\1/')
1075 # Find matching globals backup
1076 local globals_backup_path
1077 globals_backup_path=$(find "$FILE" -maxdepth 1 -name "globals-${base_timestamp}.sql" 2>/dev/null | head -1)
1079 if [[ -z "$globals_backup_path" ]]; then
1080 # Try to find any globals backup as fallback
1081 globals_backup_path=$(find "$FILE" -maxdepth 1 -name 'globals-*.sql' -printf '%T@ %p\n' 2>/dev/null | sort -rn | head -1 | cut -d' ' -f2-)
1083 if [[ -n "$globals_backup_path" ]]; then
1084 log_warning "Exact matching globals backup not found, using: $(basename "$globals_backup_path")"
1086 log_warning "No globals backup found (roles and tablespaces will not be restored)"
1089 log_info "Found globals backup: $(basename "$globals_backup_path")"
1092 # Find incremental backups (if any)
1093 local incremental_files=()
1094 mapfile -t incremental_files < <(find "$FILE" -maxdepth 1 -name '*.sql' ! -name 'base-*.sql' ! -name 'globals-*.sql' -printf '%T@ %p\n' 2>/dev/null | sort -n | cut -d' ' -f2-)
1096 if [[ ${#incremental_files[@]} -gt 0 ]]; then
1097 log_info "Found ${#incremental_files[@]} incremental backup file(s)"
1099 log_info "No incremental backup files found (will restore base backup only)"
1102 # Create target database if requested
1103 if [[ "$CREATE_DB" -eq 1 ]]; then
1104 log_step "Creating target database"
1106 # Connect to postgres database (not target database) to create it
1107 local create_dbname="$DBNAME"
1110 # Test connection to postgres database
1113 # Check if database already exists
1115 db_exists=$(query_db "SELECT count(*) FROM pg_database WHERE datname = '$create_dbname';")
1117 if [[ "$db_exists" -gt 0 ]]; then
1118 log_error "Database '$create_dbname' already exists"
1119 log_error "Drop it first or omit --create flag to restore into existing database"
1120 exit "$EXIT_BACKUP_ERROR"
1124 if query_db_silent "CREATE DATABASE \"$create_dbname\";"; then
1125 log_success "Created database: $create_dbname"
1127 log_error "Failed to create database: $create_dbname"
1128 exit "$EXIT_BACKUP_ERROR"
1131 # Switch back to target database for subsequent operations
1132 DBNAME="$create_dbname"
1135 # Test connection to target database
1138 # Restore globals backup
1139 if [[ -n "$globals_backup_path" ]]; then
1140 log_step "Restoring globals (roles, tablespaces)"
1142 # Build connection args for psql
1143 # Note: globals must be restored to postgres database, not target database
1144 local save_dbname="$DBNAME"
1147 mapfile -t psql_args < <(build_psql_args)
1148 DBNAME="$save_dbname"
1150 if psql "${psql_args[@]}" -f "$globals_backup_path" >/dev/null 2>&1; then
1151 log_success "Globals restored successfully"
1153 log_warning "Globals restore had errors (may be expected if roles already exist)"
1157 # Restore base backup
1158 log_step "Restoring base backup"
1160 start_time=$(date +%s)
1163 mapfile -t psql_args < <(build_psql_args)
1165 # Handle compressed backups
1166 if [[ "$base_backup_path" == *.gz ]]; then
1167 log_info "Decompressing gzip backup..."
1168 if gunzip -c "$base_backup_path" | psql "${psql_args[@]}" >/dev/null 2>&1; then
1169 log_success "Base backup restored successfully"
1171 log_error "Base backup restore failed"
1172 exit "$EXIT_BACKUP_ERROR"
1174 elif [[ "$base_backup_path" == *.zst ]]; then
1175 log_info "Decompressing zstd backup..."
1176 if zstd -dc "$base_backup_path" | psql "${psql_args[@]}" >/dev/null 2>&1; then
1177 log_success "Base backup restored successfully"
1179 log_error "Base backup restore failed"
1180 exit "$EXIT_BACKUP_ERROR"
1182 elif [[ "$base_backup_path" == *.lz4 ]]; then
1183 log_info "Decompressing lz4 backup..."
1184 if lz4 -dc "$base_backup_path" | psql "${psql_args[@]}" >/dev/null 2>&1; then
1185 log_success "Base backup restored successfully"
1187 log_error "Base backup restore failed"
1188 exit "$EXIT_BACKUP_ERROR"
1191 # Uncompressed backup
1192 if psql "${psql_args[@]}" -f "$base_backup_path" >/dev/null 2>&1; then
1193 log_success "Base backup restored successfully"
1195 log_error "Base backup restore failed"
1196 exit "$EXIT_BACKUP_ERROR"
1200 # Apply incremental backups
1201 if [[ ${#incremental_files[@]} -gt 0 ]]; then
1202 log_step "Applying incremental backups"
1204 for inc_file in "${incremental_files[@]}"; do
1205 log_info "Applying: $(basename "$inc_file")"
1207 if psql "${psql_args[@]}" -f "$inc_file" >/dev/null 2>&1; then
1208 if [[ "$VERBOSE" -eq 1 ]]; then
1209 log_success "Applied: $(basename "$inc_file")"
1212 log_error "Failed to apply incremental backup: $(basename "$inc_file")"
1213 log_error "Restore is incomplete"
1214 exit "$EXIT_BACKUP_ERROR"
1218 log_success "All incremental backups applied successfully"
1221 # Synchronize sequences
1222 if [[ "$NO_SYNC_SEQUENCES" -eq 0 ]]; then
1223 log_step "Synchronizing sequences"
1225 # Query all sequences and their associated tables
1227 seq_sync_sql=$(query_db "
1230 quote_literal(sn.nspname || '.' || s.relname) ||
1231 ', GREATEST((SELECT COALESCE(MAX(' ||
1232 quote_ident(a.attname) ||
1234 quote_ident(tn.nspname) || '.' || quote_ident(t.relname) ||
1237 JOIN pg_namespace sn ON sn.oid = s.relnamespace
1238 JOIN pg_depend d ON d.objid = s.oid AND d.deptype = 'a'
1239 JOIN pg_class t ON t.oid = d.refobjid
1240 JOIN pg_namespace tn ON tn.oid = t.relnamespace
1241 JOIN pg_attribute a ON a.attrelid = t.oid AND a.attnum = d.refobjsubid
1242 WHERE s.relkind = 'S'
1243 AND sn.nspname NOT IN ('pg_catalog', 'information_schema')
1244 ORDER BY sn.nspname, s.relname;
1247 if [[ -n "$seq_sync_sql" ]]; then
1249 while IFS= read -r sync_cmd; do
1250 if query_db_silent "$sync_cmd"; then
1251 seq_count=$((seq_count + 1))
1252 if [[ "$VERBOSE" -eq 1 ]]; then
1253 log_info "Synced sequence: $(echo "$sync_cmd" | grep -oP "'\K[^']+(?=')")"
1256 log_warning "Failed to sync sequence: $sync_cmd"
1258 done <<< "$seq_sync_sql"
1260 log_success "Synchronized $seq_count sequence(s)"
1262 log_info "No sequences found to synchronize"
1265 log_info "Skipping sequence synchronization (--no-sync-sequences specified)"
1268 # Calculate restore duration
1270 end_time=$(date +%s)
1271 local duration=$((end_time - start_time))
1274 log_step "Restore Statistics"
1276 # Count rows in all tables
1277 log_info "Counting rows in restored tables..."
1279 table_count=$(query_db "SELECT count(*) FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE c.relkind = 'r' AND n.nspname NOT IN ('pg_catalog', 'information_schema');" 2>/dev/null)
1282 total_rows=$(query_db "
1283 SELECT COALESCE(SUM(n_live_tup), 0)
1284 FROM pg_stat_user_tables;
1287 echo -e "${BOLD}Database:${RESET} $DBNAME" >&2
1288 echo -e "${BOLD}Tables Restored:${RESET} $table_count" >&2
1289 echo -e "${BOLD}Total Rows:${RESET} $total_rows (approximate)" >&2
1290 echo -e "${BOLD}Duration:${RESET} ${duration}s" >&2
1291 echo -e "${BOLD}Base Backup:${RESET} $(basename "$base_backup_path")" >&2
1293 if [[ ${#incremental_files[@]} -gt 0 ]]; then
1294 echo -e "${BOLD}Incremental Files:${RESET} ${#incremental_files[@]}" >&2
1297 # Final success message
1299 log_step "Restore Complete"
1300 log_success "Database successfully restored to: $DBNAME"
1301 log_info "Next steps:"
1302 log_info " 1. Verify data integrity:"
1303 log_info " psql -d $DBNAME -c 'SELECT COUNT(*) FROM <your_table>;'"
1304 log_info " 2. Run application smoke tests"
1305 log_info " 3. Switch application to restored database"
1307 exit "$EXIT_SUCCESS"
1311 # --status command implementation
1314 log_step "Checking pg_scribe backup system status"
1316 # Validate required arguments
1317 validate_required_args "status" "DBNAME:-d/--dbname"
1322 # Track warnings for exit code
1323 local has_warnings=0
1325 # Check replication slot status
1326 log_step "Replication Slot Status"
1328 # Verify replication slot exists
1329 check_replication_slot "$SLOT" 1
1331 # Query slot details
1333 slot_info=$(query_db "
1340 confirmed_flush_lsn,
1341 pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn) as restart_lag_bytes,
1342 pg_wal_lsn_diff(pg_current_wal_lsn(), confirmed_flush_lsn) as confirmed_lag_bytes,
1343 pg_current_wal_lsn() as current_lsn
1344 FROM pg_replication_slots
1345 WHERE slot_name = '$SLOT';
1349 IFS='|' read -r slot_name slot_type db_name active restart_lsn confirmed_flush_lsn restart_lag_bytes confirmed_lag_bytes current_lsn <<< "$slot_info"
1351 # Display slot information
1352 echo -e "${BOLD}Slot Name:${RESET} $slot_name" >&2
1353 echo -e "${BOLD}Slot Type:${RESET} $slot_type" >&2
1354 echo -e "${BOLD}Database:${RESET} $db_name" >&2
1356 if [[ "$active" == "t" ]]; then
1357 echo -e "${BOLD}Active:${RESET} ${GREEN}Yes${RESET}" >&2
1359 echo -e "${BOLD}Active:${RESET} ${YELLOW}No${RESET}" >&2
1360 log_warning "Replication slot is not active"
1364 echo -e "${BOLD}Current WAL LSN:${RESET} $current_lsn" >&2
1365 echo -e "${BOLD}Restart LSN:${RESET} $restart_lsn" >&2
1366 echo -e "${BOLD}Confirmed LSN:${RESET} $confirmed_flush_lsn" >&2
1368 # Format lag in human-readable sizes
1369 local restart_lag_mb=$((restart_lag_bytes / 1024 / 1024))
1370 local confirmed_lag_mb=$((confirmed_lag_bytes / 1024 / 1024))
1372 # Check lag thresholds (based on design doc)
1373 if [[ "$restart_lag_bytes" -gt 10737418240 ]]; then
1375 echo -e "${BOLD}Restart Lag:${RESET} ${RED}${restart_lag_mb} MB (CRITICAL!)${RESET}" >&2
1376 log_error "CRITICAL: Replication lag exceeds 10GB!"
1377 log_error " This may cause disk space issues or database shutdown"
1378 log_error " Consider dropping the slot if backup collection has stopped"
1380 elif [[ "$restart_lag_bytes" -gt 1073741824 ]]; then
1382 echo -e "${BOLD}Restart Lag:${RESET} ${YELLOW}${restart_lag_mb} MB (WARNING)${RESET}" >&2
1383 log_warning "Replication lag exceeds 1GB"
1384 log_warning " Ensure backup collection is running and healthy"
1387 echo -e "${BOLD}Restart Lag:${RESET} ${GREEN}${restart_lag_mb} MB${RESET}" >&2
1390 if [[ "$confirmed_lag_bytes" -gt 10737418240 ]]; then
1391 echo -e "${BOLD}Confirmed Lag:${RESET} ${RED}${confirmed_lag_mb} MB (CRITICAL!)${RESET}" >&2
1393 elif [[ "$confirmed_lag_bytes" -gt 1073741824 ]]; then
1394 echo -e "${BOLD}Confirmed Lag:${RESET} ${YELLOW}${confirmed_lag_mb} MB (WARNING)${RESET}" >&2
1397 echo -e "${BOLD}Confirmed Lag:${RESET} ${GREEN}${confirmed_lag_mb} MB${RESET}" >&2
1400 # Check slot age (if we can determine it)
1401 # Note: pg_replication_slots doesn't directly track creation time, but we can estimate from WAL
1404 # Analyze backup directory if provided
1405 if [[ -n "$FILE" ]]; then
1406 log_step "Backup Directory Analysis"
1408 if [[ ! -d "$FILE" ]]; then
1409 log_warning "Backup directory does not exist: $FILE"
1412 # Count base backups
1414 base_count=$(find "$FILE" -maxdepth 1 -name 'base-*.sql' 2>/dev/null | wc -l)
1415 echo -e "${BOLD}Base Backups:${RESET} $base_count" >&2
1417 if [[ "$base_count" -gt 0 ]]; then
1418 # Show latest base backup
1420 latest_base=$(find "$FILE" -maxdepth 1 -name 'base-*.sql' -printf '%T@ %p\n' 2>/dev/null | sort -rn | head -1 | cut -d' ' -f2-)
1422 base_name=$(basename "$latest_base")
1424 base_date=$(stat -c %y "$latest_base" 2>/dev/null | cut -d. -f1)
1425 echo -e "${BOLD}Latest Base:${RESET} $base_name ($base_date)" >&2
1427 # Show base backup size
1429 base_size=$(get_file_size "$latest_base")
1430 echo -e "${BOLD}Base Size:${RESET} $base_size" >&2
1432 log_warning "No base backups found in directory"
1436 # Count globals backups
1438 globals_count=$(find "$FILE" -maxdepth 1 -name 'globals-*.sql' 2>/dev/null | wc -l)
1439 echo -e "${BOLD}Globals Backups:${RESET} $globals_count" >&2
1441 # Check for incremental backup files
1442 # Note: Incremental files are created by --start command
1443 # They may have various names depending on configuration
1444 local incremental_count
1445 incremental_count=$(find "$FILE" -maxdepth 1 -name '*.sql' ! -name 'base-*.sql' ! -name 'globals-*.sql' 2>/dev/null | wc -l)
1447 if [[ "$incremental_count" -gt 0 ]]; then
1448 echo -e "${BOLD}Incremental Files:${RESET} $incremental_count" >&2
1450 # Calculate total size of incremental files
1451 local incremental_size
1452 incremental_size=$(find "$FILE" -maxdepth 1 -name '*.sql' ! -name 'base-*.sql' ! -name 'globals-*.sql' -exec du -ch {} + 2>/dev/null | grep total | cut -f1)
1453 echo -e "${BOLD}Incremental Size:${RESET} $incremental_size" >&2
1455 # Show most recent incremental file
1456 local latest_incremental
1457 latest_incremental=$(find "$FILE" -maxdepth 1 -name '*.sql' ! -name 'base-*.sql' ! -name 'globals-*.sql' -printf '%T@ %p\n' 2>/dev/null | sort -rn | head -1 | cut -d' ' -f2-)
1458 if [[ -n "$latest_incremental" ]]; then
1460 inc_name=$(basename "$latest_incremental")
1462 inc_date=$(stat -c %y "$latest_incremental" 2>/dev/null | cut -d. -f1)
1463 local inc_age_seconds
1464 inc_age_seconds=$(( $(date +%s) - $(stat -c %Y "$latest_incremental" 2>/dev/null) ))
1465 local inc_age_minutes=$((inc_age_seconds / 60))
1467 echo -e "${BOLD}Latest Incremental:${RESET} $inc_name ($inc_date)" >&2
1469 # Warn if last incremental is old
1470 if [[ "$inc_age_minutes" -gt 60 ]]; then
1471 log_warning "Last incremental backup is ${inc_age_minutes} minutes old"
1472 log_warning " Verify that backup collection (--start) is running"
1477 log_warning "No incremental backup files found"
1478 log_warning " Start incremental backup collection with: pg_scribe --start"
1482 # Check for metadata file
1483 if [[ -f "$FILE/pg_scribe_metadata.txt" ]]; then
1484 echo -e "${BOLD}Metadata File:${RESET} Present" >&2
1486 # Extract some metadata
1487 local pg_version_line
1488 pg_version_line=$(grep "PostgreSQL" "$FILE/pg_scribe_metadata.txt" 2>/dev/null | head -1)
1489 if [[ -n "$pg_version_line" ]]; then
1490 echo -e "${BOLD}Backup PG Version:${RESET} $pg_version_line" >&2
1493 log_warning "Metadata file not found"
1497 # Calculate total backup directory size
1499 total_size=$(du -sh "$FILE" 2>/dev/null | cut -f1)
1500 echo -e "${BOLD}Total Directory Size:${RESET} $total_size" >&2
1504 # Overall health summary
1506 log_step "Health Summary"
1508 if [[ "$has_warnings" -eq 0 ]]; then
1509 log_success "System is healthy"
1511 log_info "Replication slot is active and lag is acceptable"
1512 if [[ -n "$FILE" ]]; then
1513 log_info "Backup directory appears healthy"
1515 exit "$EXIT_SUCCESS"
1517 log_warning "System has warnings - review messages above"
1519 log_info "Address any CRITICAL or WARNING issues promptly"
1520 log_info "See design doc for monitoring recommendations"
1521 exit "$EXIT_WARNING"
1546 log_error "Unknown action: $ACTION"
1547 exit "$EXIT_GENERAL_ERROR"
1552 # Run main with all arguments