3 Build a complete, detailed table of contents for PostgreSQL 18 documentation.
6 1. Parse index.html to get complete hierarchical structure
7 2. For each file referenced, visit it ONCE and extract ALL TOC entries
8 3. Entries linking to SAME file are text-only (no link)
9 4. Entries linking to DIFFERENT files are clickable links
10 5. No recursive crawling - just process each file's internal TOC once
import os
import re
from collections import OrderedDict
from pathlib import Path

from lxml import html as lhtml
def read_html_file(file_path):
    """Read an HTML file and return its raw content as bytes for lxml parsing.

    Args:
        file_path: Path to the HTML file to read.

    Returns:
        The file contents as ``bytes``, or ``None`` if the file could not
        be read (an error message is printed instead of raising).
    """
    try:
        with open(file_path, 'rb') as f:
            return f.read()
    except OSError as e:
        # Best-effort reader: report and keep going so one bad file
        # does not abort the whole TOC build.
        print(f"Error reading {file_path}: {e}")
        return None
def extract_text_from_element(element):
    """Return the concatenated text of *element* and its descendants, stripped.

    Uses lxml's ``itertext()`` so text inside nested tags is included.
    """
    pieces = element.itertext()
    joined = ''.join(pieces)
    return joined.strip()
def parse_toc_from_html(html_content, current_filename=None):
    """Parse the TOC structure from one HTML page.

    Args:
        html_content: The HTML content (bytes or str) to parse.
        current_filename: The filename being parsed, used to detect
            same-file anchors (``#foo`` or ``thisfile.html#foo``).

    Returns:
        A list of dicts with keys ``level``, ``text``, ``href``,
        ``filename``, ``is_same_file``. Empty list if the content is
        falsy, no TOC div exists, or parsing fails.
    """
    entries = []
    if not html_content:
        return entries

    try:
        tree = lhtml.fromstring(html_content)
        toc_divs = tree.xpath('.//div[@class="toc"]')

        for toc_div in toc_divs:
            # Find all TOC entries with various class types
            for span in toc_div.xpath('.//span[@class]'):
                span_class = span.get('class')

                # Determine entry type and level: top-level container
                # classes map to level 0, "sectN" classes to level N.
                if span_class in ['preface', 'part', 'chapter', 'appendix',
                                  'bibliography', 'index', 'reference']:
                    level = 0
                elif span_class.startswith('sect'):
                    match = re.match(r'sect(\d+)', span_class)
                    if not match:
                        continue
                    level = int(match.group(1))
                else:
                    # Not a TOC entry class we care about.
                    continue

                # Extract link and text
                links = span.xpath('.//a[@href]')
                for link in links:
                    href = link.get('href', '')
                    text = extract_text_from_element(link)

                    # Skip external links; only local doc files matter.
                    if text and href and not href.startswith('http'):
                        # Extract filename (without anchor)
                        filename = href.split('#')[0] if '#' in href else href

                        # A bare "#anchor" or a link back into the current
                        # file counts as a same-file anchor.
                        is_same_file = (filename == '' or filename == current_filename)

                        entries.append({
                            'level': level,
                            'text': text,
                            'href': href,
                            'filename': filename if filename else current_filename,
                            'is_same_file': is_same_file,
                        })
    except Exception as e:
        # lxml can raise a variety of parse errors on malformed input;
        # report and return whatever was collected so far.
        print(f"Error parsing TOC: {e}")

    return entries
def build_complete_toc(docs_dir):
    """Build the complete TOC by parsing index.html, then each referenced file once.

    Files discovered inside other files' TOCs are appended to the work
    queue, so the whole documentation tree is covered without recursion
    and without visiting any file twice.

    Args:
        docs_dir: Directory containing the PostgreSQL HTML documentation.

    Returns:
        List of markdown lines ready for ``writelines``, or ``None`` when
        index.html is missing or unreadable.
    """
    index_path = os.path.join(docs_dir, 'index.html')
    if not os.path.exists(index_path):
        print(f"Error: {index_path} not found")
        return None

    print("Reading index.html...")
    index_html = read_html_file(index_path)
    if index_html is None:
        return None

    # Get main structure from index
    print("Parsing main structure...")
    main_entries = parse_toc_from_html(index_html, 'index.html')
    print(f"Found {len(main_entries)} main entries in index.html")

    # Track which files we've processed to avoid duplicates
    processed_files = {'index.html'}

    # Collect all unique files to process (insertion order preserved so
    # output follows the index.html document order).
    files_to_process = OrderedDict()
    for entry in main_entries:
        filename = entry['filename']
        if filename and filename != 'index.html' and filename not in files_to_process:
            files_to_process[filename] = entry

    print(f"Found {len(files_to_process)} unique files to process")

    output_lines = []

    def add_entry(entry, indent_level, is_top_level=False):
        """Append one markdown bullet for *entry* at the given indent level."""
        # Two spaces per level so markdown renders nested lists.
        indent = '  ' * indent_level
        text = entry['text']
        filename = entry['filename']

        # Collapse extra whitespace after section numbers
        # (e.g. "1.  What" -> "1. What")
        text = re.sub(r'(\d+)\.\s+', r'\1. ', text)

        # Only top-level entries (directly from index.html) get links;
        # all children/subsections are text-only.
        if is_top_level:
            # Top-level entries get links (no bold)
            output_lines.append(f"{indent}- [{text}](full-docs/src/sgml/html/{filename})\n")
        else:
            # All subsections are text-only
            output_lines.append(f"{indent}- {text}\n")

    # Process each unique file.
    # Use a while loop (not a for) so newly discovered files can be
    # appended to files_list while we iterate.
    print("\nProcessing files...")
    files_list = list(files_to_process.items())
    file_index = 0
    processed_count = 0

    while file_index < len(files_list):
        filename, main_entry = files_list[file_index]
        file_index += 1
        processed_count += 1

        file_path = os.path.join(docs_dir, filename)
        if not os.path.exists(file_path):
            print(f"[{processed_count}] SKIP {filename} (not found)")
            continue

        print(f"[{processed_count}] {main_entry['text']}")

        # Add the main entry from index.html (top-level, gets link)
        add_entry(main_entry, indent_level=0, is_top_level=True)

        # Read the file and extract its internal TOC
        html_content = read_html_file(file_path)
        internal_entries = parse_toc_from_html(html_content, filename)

        # Add all internal entries (subsections, text-only)
        for entry in internal_entries:
            # Indent: base level (1, under the top-level bullet)
            # + the entry's own internal level.
            indent_level = 1 + entry['level']
            add_entry(entry, indent_level, is_top_level=False)

            # If this entry points to a NEW file we haven't seen,
            # add it to the queue so its internal TOC is processed too.
            if not entry['is_same_file'] and entry['filename'] not in processed_files:
                child_filename = entry['filename']
                if child_filename not in files_to_process:
                    files_to_process[child_filename] = entry
                    files_list.append((child_filename, entry))

        processed_files.add(filename)

    print(f"\nTotal files processed: {len(processed_files)}")
    print(f"Total output lines: {len(output_lines)}")
    return output_lines
def main():
    """Script entry point: build the full TOC and write it to full_toc.md."""
    # Use paths relative to script location so the script works
    # regardless of the current working directory.
    script_dir = Path(__file__).parent
    docs_dir = script_dir / 'full-docs' / 'src' / 'sgml' / 'html'
    output_file = script_dir / 'full_toc.md'

    print("Building Complete PostgreSQL 18 Documentation Table of Contents")

    output_lines = build_complete_toc(docs_dir)

    if output_lines:
        with open(output_file, 'w', encoding='utf-8') as f:
            f.writelines(output_lines)
        print(f"Complete TOC written to: {output_file}")
        print(f"Total lines: {len(output_lines):,}")
    else:
        # build_complete_toc returns None (or an empty list) on failure.
        print("Failed to build TOC")


if __name__ == '__main__':
    main()