#!/usr/bin/env python3
"""
Build a complete, detailed table of contents for PostgreSQL 18 documentation.

Simplified Algorithm:
1. Parse index.html to get complete hierarchical structure
2. For each file referenced, visit it ONCE and extract ALL TOC entries
3. Entries linking to SAME file are text-only (no link)
4. Entries linking to DIFFERENT files are clickable links
5. No recursive crawling - just process each file's internal TOC once
"""

import os
import re
from pathlib import Path
from collections import OrderedDict

from lxml import html as lhtml


def read_html_file(file_path):
    """Read HTML file content as bytes for lxml parsing."""
    try:
        with open(file_path, 'rb') as f:
            return f.read()
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return None


def extract_text_from_element(element):
    """Extract clean text from an lxml element, handling nested tags."""
    return ''.join(element.itertext()).strip()


def parse_toc_from_html(html_content, current_filename=None):
    """
    Parse TOC structure from HTML content.

    Returns list of dicts with: level, text, href, filename, is_same_file

    Args:
        html_content: The HTML content to parse
        current_filename: The filename being parsed (to detect same-file anchors)
    """
    entries = []
    try:
        tree = lhtml.fromstring(html_content)
        toc_divs = tree.xpath('.//div[@class="toc"]')
        if not toc_divs:
            return entries
        toc_div = toc_divs[0]

        # Find all TOC entries with various class types
        for span in toc_div.xpath('.//span[@class]'):
            span_class = span.get('class')

            # Determine entry type and level
            if span_class in ['preface', 'part', 'chapter', 'appendix',
                              'bibliography', 'index', 'reference']:
                level = 0
            elif span_class.startswith('sect'):
                match = re.match(r'sect(\d+)', span_class)
                if match:
                    level = int(match.group(1))
                else:
                    continue
            else:
                continue

            # Extract link and text
            links = span.xpath('.//a[@href]')
            if links:
                link = links[0]
                href = link.get('href', '')
                text = extract_text_from_element(link)

                if text and href and not href.startswith('http'):
                    # Extract filename (without anchor)
                    filename = href.split('#')[0] if '#' in href else href

                    # Determine if this is a same-file anchor
                    is_same_file = (filename == '' or filename == current_filename)

                    entries.append({
                        'level': level,
                        'text': text,
                        'href': href,
                        'filename': filename if filename else current_filename,
                        'is_same_file': is_same_file,
                        'class': span_class
                    })
    except Exception as e:
        print(f"Error parsing TOC: {e}")

    return entries


def build_complete_toc(docs_dir):
    """
    Build complete TOC by parsing index.html and then processing each
    referenced file once.
    """
    index_path = os.path.join(docs_dir, 'index.html')
    if not os.path.exists(index_path):
        print(f"Error: {index_path} not found")
        return None

    print("Reading index.html...")
    index_html = read_html_file(index_path)
    if not index_html:
        return None

    # Get main structure from index
    print("Parsing main structure...")
    main_entries = parse_toc_from_html(index_html, 'index.html')
    print(f"Found {len(main_entries)} main entries in index.html")

    # Track which files we've processed to avoid duplicates
    processed_files = {'index.html'}

    # Collect all unique files to process
    files_to_process = OrderedDict()
    for entry in main_entries:
        filename = entry['filename']
        if filename and filename != 'index.html' and filename not in files_to_process:
            files_to_process[filename] = entry

    print(f"Found {len(files_to_process)} unique files to process")

    # Build output
    output_lines = []

    def add_entry(entry, indent_level, parent_filename=None):
        """Add entry to output with appropriate formatting."""
        indent = '  ' * indent_level
        text = entry['text']
        filename = entry['filename']

        # Normalize spacing after section numbers (collapse "1.  What" to "1. What")
        text = re.sub(r'(\d+)\.\s+', r'\1. ', text)

        # Add link if this entry is on a different page than its parent
        # or if it has no parent (top-level from index.html)
        if parent_filename is None or filename != parent_filename:
            # Entry on different page gets a link
            output_lines.append(f"{indent}- [{text}](full-docs/html/{filename})\n")
        else:
            # Entry on same page as parent is text-only
            output_lines.append(f"{indent}- {text}\n")

    # Process each unique file
    # Use a while loop to allow dynamic addition of new files
    print("\nProcessing files...")
    processed_count = 0
    files_list = list(files_to_process.items())
    file_index = 0

    while file_index < len(files_list):
        filename, main_entry = files_list[file_index]
        file_index += 1
        processed_count += 1

        file_path = os.path.join(docs_dir, filename)
        if not os.path.exists(file_path):
            print(f"[{processed_count}] SKIP {filename} (not found)")
            continue

        print(f"[{processed_count}] {main_entry['text']}")

        # Add the main entry from index.html (no parent, gets link)
        add_entry(main_entry, indent_level=0, parent_filename=None)

        # Read the file and extract its internal TOC
        html_content = read_html_file(file_path)
        if html_content:
            internal_entries = parse_toc_from_html(html_content, filename)

            # Add all internal entries (children of main entry)
            for entry in internal_entries:
                # Calculate indent: base level (1) + entry's internal level
                indent_level = 1 + entry['level']
                # Pass the main entry's filename as parent
                add_entry(entry, indent_level, parent_filename=filename)

                # If this entry points to a NEW file we haven't seen, add it to our queue
                if not entry['is_same_file'] and entry['filename'] not in processed_files:
                    child_filename = entry['filename']
                    if child_filename not in files_to_process:
                        files_to_process[child_filename] = entry
                        files_list.append((child_filename, entry))

        processed_files.add(filename)

    print(f"\nTotal files processed: {len(processed_files)}")
    print(f"Total output lines: {len(output_lines)}")

    return output_lines


def main():
    # Use paths relative to script location
    script_dir = Path(__file__).parent
    docs_dir = script_dir / 'full-docs' / 'html'
    output_file = script_dir / 'full_toc.md'

    print("=" * 70)
    print("Building Complete PostgreSQL 18 Documentation Table of Contents")
    print("=" * 70)
    print()

    output_lines = build_complete_toc(docs_dir)

    if output_lines:
        with open(output_file, 'w', encoding='utf-8') as f:
            f.writelines(output_lines)
        print()
        print("=" * 70)
        print(f"Complete TOC written to: {output_file}")
        print(f"Total lines: {len(output_lines):,}")
        print("=" * 70)
        return 0
    else:
        print("Failed to build TOC")
        return 1


if __name__ == '__main__':
    exit(main())
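
# Usage notes (illustrative only; the script filename below is a hypothetical
# example, not something defined by this file):
#
#   python3 build_full_toc.py
#
# The script expects a full-docs/html/ directory next to it containing the
# PostgreSQL 18 HTML documentation and writes the result to full_toc.md.
#
# Hypothetical excerpt of the generated Markdown (not real output): an entry
# that lives on a different page than its parent becomes a link, while a
# same-page entry stays as plain text:
#
#   - [Chapter 9. Functions and Operators](full-docs/html/functions.html)
#       - [9.1. Logical Operators](full-docs/html/functions-logical.html)
#       - 9.2. Comparison Functions and Operators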