begriffs open source - ai-pg/blob - build_toc.py

   1 #!/usr/bin/env python3
   2 """
   3 Build a complete, detailed table of contents for PostgreSQL 18 documentation.
   4
   5 Simplified Algorithm:
   6 1. Parse index.html to get complete hierarchical structure
   7 2. For each file referenced, visit it ONCE and extract ALL TOC entries
   8 3. Entries linking to SAME file are text-only (no link)
   9 4. Entries linking to DIFFERENT files are clickable links
  10 5. No recursive calls - files first seen in a page's TOC are queued, and each file's internal TOC is processed once
  11 """
  12
  13 import os
  14 import re
  15 from pathlib import Path
  16 from lxml import html as lhtml
  17 from collections import OrderedDict
  18
  19
  20 def read_html_file(file_path):
  21     """Read HTML file content as bytes for lxml parsing."""
  22     try:
  23         with open(file_path, 'rb') as f:
  24             return f.read()
  25     except Exception as e:
  26         print(f"Error reading {file_path}: {e}")
  27         return None
  28
  29
  30 def extract_text_from_element(element):
  31     """Extract clean text from an lxml element, handling nested tags."""
  32     return ''.join(element.itertext()).strip()
  33
  34
  35 def parse_toc_from_html(html_content, current_filename=None):
  36     """
  37     Parse TOC structure from HTML content.
  38     Returns list of dicts with: level, text, href, filename, is_same_file
  39
  40     Args:
  41         html_content: The HTML content to parse
  42         current_filename: The filename being parsed (to detect same-file anchors)
  43     """
  44     entries = []
  45
  46     try:
  47         tree = lhtml.fromstring(html_content)
  48         toc_divs = tree.xpath('.//div[@class="toc"]')
  49
  50         if not toc_divs:
  51             return entries
  52
  53         toc_div = toc_divs[0]
  54
  55         # Find all TOC entries with various class types
  56         for span in toc_div.xpath('.//span[@class]'):
  57             span_class = span.get('class')
  58
  59             # Determine entry type and level
  60             if span_class in ['preface', 'part', 'chapter', 'appendix', 'bibliography', 'index', 'reference']:
  61                 level = 0
  62             elif span_class.startswith('sect'):
  63                 match = re.match(r'sect(\d+)', span_class)
  64                 if match:
  65                     level = int(match.group(1))
  66                 else:
  67                     continue
  68             else:
  69                 continue
  70
  71             # Extract link and text
  72             links = span.xpath('.//a[@href]')
  73             if links:
  74                 link = links[0]
  75                 href = link.get('href', '')
  76                 text = extract_text_from_element(link)
  77
  78                 if text and href and not href.startswith('http'):
  79                     # Extract filename (without anchor)
  80                     filename = href.split('#')[0] if '#' in href else href
  81
  82                     # Determine if this is a same-file anchor
  83                     is_same_file = (filename == '' or filename == current_filename)
  84
  85                     entries.append({
  86                         'level': level,
  87                         'text': text,
  88                         'href': href,
  89                         'filename': filename if filename else current_filename,
  90                         'is_same_file': is_same_file,
  91                         'class': span_class
  92                     })
  93
  94     except Exception as e:
  95         print(f"Error parsing TOC: {e}")
  96
  97     return entries
  98
  99
 100 def build_complete_toc(docs_dir):
 101     """
 102     Build complete TOC by parsing index.html and then processing each referenced file once.
 103     """
 104
 105     index_path = os.path.join(docs_dir, 'index.html')
 106     if not os.path.exists(index_path):
 107         print(f"Error: {index_path} not found")
 108         return None
 109
 110     print("Reading index.html...")
 111     index_html = read_html_file(index_path)
 112     if not index_html:
 113         return None
 114
 115     # Get main structure from index
 116     print("Parsing main structure...")
 117     main_entries = parse_toc_from_html(index_html, 'index.html')
 118     print(f"Found {len(main_entries)} main entries in index.html")
 119
 120     # Track which files we've processed to avoid duplicates
 121     processed_files = {'index.html'}
 122
 123     # Collect all unique files to process
 124     files_to_process = OrderedDict()
 125     for entry in main_entries:
 126         filename = entry['filename']
 127         if filename and filename != 'index.html' and filename not in files_to_process:
 128             files_to_process[filename] = entry
 129
 130     print(f"Found {len(files_to_process)} unique files to process")
 131
 132     # Build output
 133     output_lines = []
 134
 135     def add_entry(entry, indent_level, parent_filename=None):
 136         """Add entry to output with appropriate formatting."""
 137         indent = '  ' * indent_level
 138         text = entry['text']
 139         filename = entry['filename']
 140
 141         # Normalize spacing in text (e.g., "1.  What" -> "1. What")
 142         text = re.sub(r'(\d+)\.\s+', r'\1. ', text)
 143
 144         # Add link if this entry is on a different page than its parent
 145         # or if it has no parent (top-level from index.html)
 146         if parent_filename is None or filename != parent_filename:
 147             # Entry on different page gets a link
 148             output_lines.append(f"{indent}- [{text}](full-docs/html/{filename})\n")
 149         else:
 150             # Entry on same page as parent is text-only
 151             output_lines.append(f"{indent}- {text}\n")
 152
 153     # Process each unique file
 154     # Use a while loop to allow dynamic addition of new files
 155     print("\nProcessing files...")
 156     processed_count = 0
 157     files_list = list(files_to_process.items())
 158     file_index = 0
 159
 160     while file_index < len(files_list):
 161         filename, main_entry = files_list[file_index]
 162         file_index += 1
 163         processed_count += 1
 164
 165         file_path = os.path.join(docs_dir, filename)
 166
 167         if not os.path.exists(file_path):
 168             print(f"[{processed_count}] SKIP {filename} (not found)")
 169             continue
 170
 171         print(f"[{processed_count}] {main_entry['text']}")
 172
 173         # Add the main entry from index.html (no parent, gets link)
 174         add_entry(main_entry, indent_level=0, parent_filename=None)
 175
 176         # Read the file and extract its internal TOC
 177         html_content = read_html_file(file_path)
 178         if html_content:
 179             internal_entries = parse_toc_from_html(html_content, filename)
 180
 181             # Add all internal entries (children of main entry)
 182             for entry in internal_entries:
 183                 # Calculate indent: base level (1) + entry's internal level
 184                 indent_level = 1 + entry['level']
 185                 # Pass the main entry's filename as parent
 186                 add_entry(entry, indent_level, parent_filename=filename)
 187
 188                 # If this entry points to a NEW file we haven't seen, add it to our queue
 189                 if not entry['is_same_file'] and entry['filename'] not in processed_files:
 190                     child_filename = entry['filename']
 191                     if child_filename not in files_to_process:
 192                         files_to_process[child_filename] = entry
 193                         files_list.append((child_filename, entry))
 194
 195         processed_files.add(filename)
 196
 197     print(f"\nTotal files processed: {len(processed_files)}")
 198     print(f"Total output lines: {len(output_lines)}")
 199
 200     return output_lines
 201
 202
 203 def main():
 204     # Use paths relative to script location
 205     script_dir = Path(__file__).parent
 206     docs_dir = script_dir / 'full-docs' / 'html'
 207     output_file = script_dir / 'full_toc.md'
 208
 209     print("=" * 70)
 210     print("Building Complete PostgreSQL 18 Documentation Table of Contents")
 211     print("=" * 70)
 212     print()
 213
 214     output_lines = build_complete_toc(docs_dir)
 215
 216     if output_lines:
 217         with open(output_file, 'w', encoding='utf-8') as f:
 218             f.writelines(output_lines)
 219
 220         print()
 221         print("=" * 70)
 222         print(f"Complete TOC written to: {output_file}")
 223         print(f"Total lines: {len(output_lines):,}")
 224         print("=" * 70)
 225         return 0
 226     else:
 227         print("Failed to build TOC")
 228         return 1
 229
 230
 231 if __name__ == '__main__':
 232     exit(main())