begriffs open source - ai-pg/blob - build_toc.py

   1 #!/usr/bin/env python3
   2 """
   3 Build a complete, detailed table of contents for PostgreSQL 18 documentation.
   4
   5 Simplified Algorithm:
   6 1. Parse index.html to get complete hierarchical structure
   7 2. For each file referenced, visit it ONCE and extract ALL TOC entries
   8 3. The entry for each processed file itself is emitted as a clickable link
   9 4. Entries from a file's internal TOC are emitted as indented text (no link)
  10 5. No recursion - files first seen in an internal TOC are queued and processed once
  11 """
  12
  13 import os
  14 import re
  15 from pathlib import Path
  16 from lxml import html as lhtml
  17 from collections import OrderedDict
  18
  19
  20 def read_html_file(file_path):
  21     """Read HTML file content as bytes for lxml parsing."""
  22     try:
  23         with open(file_path, 'rb') as f:
  24             return f.read()
  25     except Exception as e:
  26         print(f"Error reading {file_path}: {e}")
  27         return None
  28
  29
  30 def extract_text_from_element(element):
  31     """Extract clean text from an lxml element, handling nested tags."""
  32     return ''.join(element.itertext()).strip()
  33
  34
  35 def parse_toc_from_html(html_content, current_filename=None):
  36     """
  37     Parse TOC structure from HTML content.
  38     Returns list of dicts with: level, text, href, filename, is_same_file
  39
  40     Args:
  41         html_content: The HTML content to parse
  42         current_filename: The filename being parsed (to detect same-file anchors)
  43     """
  44     entries = []
  45
  46     try:
  47         tree = lhtml.fromstring(html_content)
  48         toc_divs = tree.xpath('.//div[@class="toc"]')
  49
  50         if not toc_divs:
  51             return entries
  52
  53         toc_div = toc_divs[0]
  54
  55         # Find all TOC entries with various class types
  56         for span in toc_div.xpath('.//span[@class]'):
  57             span_class = span.get('class')
  58
  59             # Determine entry type and level
  60             if span_class in ['preface', 'part', 'chapter', 'appendix', 'bibliography', 'index', 'reference']:
  61                 level = 0
  62             elif span_class.startswith('sect'):
  63                 match = re.match(r'sect(\d+)', span_class)
  64                 if match:
  65                     level = int(match.group(1))
  66                 else:
  67                     continue
  68             else:
  69                 continue
  70
  71             # Extract link and text
  72             links = span.xpath('.//a[@href]')
  73             if links:
  74                 link = links[0]
  75                 href = link.get('href', '')
  76                 text = extract_text_from_element(link)
  77
  78                 if text and href and not href.startswith('http'):
  79                     # Extract filename (without anchor)
  80                     filename = href.split('#')[0] if '#' in href else href
  81
  82                     # Determine if this is a same-file anchor
  83                     is_same_file = (filename == '' or filename == current_filename)
  84
  85                     entries.append({
  86                         'level': level,
  87                         'text': text,
  88                         'href': href,
  89                         'filename': filename if filename else current_filename,
  90                         'is_same_file': is_same_file,
  91                         'class': span_class
  92                     })
  93
  94     except Exception as e:
  95         print(f"Error parsing TOC: {e}")
  96
  97     return entries
  98
  99
 100 def build_complete_toc(docs_dir):
 101     """
 102     Build complete TOC by parsing index.html and then processing each referenced file once.
 103     """
 104
 105     index_path = os.path.join(docs_dir, 'index.html')
 106     if not os.path.exists(index_path):
 107         print(f"Error: {index_path} not found")
 108         return None
 109
 110     print("Reading index.html...")
 111     index_html = read_html_file(index_path)
 112     if not index_html:
 113         return None
 114
 115     # Get main structure from index
 116     print("Parsing main structure...")
 117     main_entries = parse_toc_from_html(index_html, 'index.html')
 118     print(f"Found {len(main_entries)} main entries in index.html")
 119
 120     # Track which files we've processed to avoid duplicates
 121     processed_files = {'index.html'}
 122
 123     # Collect all unique files to process
 124     files_to_process = OrderedDict()
 125     for entry in main_entries:
 126         filename = entry['filename']
 127         if filename and filename != 'index.html' and filename not in files_to_process:
 128             files_to_process[filename] = entry
 129
 130     print(f"Found {len(files_to_process)} unique files to process")
 131
 132     # Build output
 133     output_lines = []
 134
 135     def add_entry(entry, indent_level, is_top_level=False):
 136         """Add entry to output with appropriate formatting."""
 137         indent = '  ' * indent_level
 138         text = entry['text']
 139         filename = entry['filename']
 140
 141         # Normalize spacing in text (e.g., "1.  What" -> "1. What")
 142         text = re.sub(r'(\d+)\.\s+', r'\1. ', text)
 143
 144         # Only top-level entries (directly from index.html) get links
 145         # All children/subsections are text-only
 146         if is_top_level:
 147             # Top-level entries get links (no bold)
 148             output_lines.append(f"{indent}- [{text}](full-docs/src/sgml/html/{filename})\n")
 149         else:
 150             # All subsections are text-only
 151             output_lines.append(f"{indent}- {text}\n")
 152
 153     # Process each unique file
 154     # Use a while loop to allow dynamic addition of new files
 155     print("\nProcessing files...")
 156     processed_count = 0
 157     files_list = list(files_to_process.items())
 158     file_index = 0
 159
 160     while file_index < len(files_list):
 161         filename, main_entry = files_list[file_index]
 162         file_index += 1
 163         processed_count += 1
 164
 165         file_path = os.path.join(docs_dir, filename)
 166
 167         if not os.path.exists(file_path):
 168             print(f"[{processed_count}] SKIP {filename} (not found)")
 169             continue
 170
 171         print(f"[{processed_count}] {main_entry['text']}")
 172
 173         # Add the main entry from index.html (top-level, gets link)
 174         add_entry(main_entry, indent_level=0, is_top_level=True)
 175
 176         # Read the file and extract its internal TOC
 177         html_content = read_html_file(file_path)
 178         if html_content:
 179             internal_entries = parse_toc_from_html(html_content, filename)
 180
 181             # Add all internal entries (subsections, text-only)
 182             for entry in internal_entries:
 183                 # Calculate indent: base level (1) + entry's internal level
 184                 indent_level = 1 + entry['level']
 185                 add_entry(entry, indent_level, is_top_level=False)
 186
 187                 # If this entry points to a NEW file we haven't seen, add it to our queue
 188                 if not entry['is_same_file'] and entry['filename'] not in processed_files:
 189                     child_filename = entry['filename']
 190                     if child_filename not in files_to_process:
 191                         files_to_process[child_filename] = entry
 192                         files_list.append((child_filename, entry))
 193
 194         processed_files.add(filename)
 195
 196     print(f"\nTotal files processed: {len(processed_files)}")
 197     print(f"Total output lines: {len(output_lines)}")
 198
 199     return output_lines
 200
 201
 202 def main():
 203     # Use paths relative to script location
 204     script_dir = Path(__file__).parent
 205     docs_dir = script_dir / 'full-docs' / 'src' / 'sgml' / 'html'
 206     output_file = script_dir / 'full_toc.md'
 207
 208     print("=" * 70)
 209     print("Building Complete PostgreSQL 18 Documentation Table of Contents")
 210     print("=" * 70)
 211     print()
 212
 213     output_lines = build_complete_toc(docs_dir)
 214
 215     if output_lines:
 216         with open(output_file, 'w', encoding='utf-8') as f:
 217             f.writelines(output_lines)
 218
 219         print()
 220         print("=" * 70)
 221         print(f"Complete TOC written to: {output_file}")
 222         print(f"Total lines: {len(output_lines):,}")
 223         print("=" * 70)
 224         return 0
 225     else:
 226         print("Failed to build TOC")
 227         return 1
 228
 229
 230 if __name__ == '__main__':
 231     exit(main())