"""
Build a complete, detailed table of contents for the PostgreSQL 18 documentation.

1. Parse index.html to get the complete hierarchical structure.
2. For each file referenced, visit it ONCE and extract ALL of its TOC entries.
3. Entries linking to the SAME file are text-only (no link).
4. Entries linking to DIFFERENT files are clickable links.
5. No recursive descent - each file's internal TOC is processed once; files first
   discovered there are queued and likewise visited exactly once.
"""
import os
import re
from collections import OrderedDict
from pathlib import Path

from lxml import html as lhtml
def read_html_file(file_path):
    """Read an HTML file and return its raw bytes for lxml parsing.

    Args:
        file_path: Path (str or path-like) of the file to read.

    Returns:
        The file content as ``bytes``, or ``None`` when the file cannot be
        read. The failure is reported on stdout instead of being raised.
    """
    try:
        # Binary mode: the bytes are handed to the HTML parser undecoded.
        with open(file_path, 'rb') as f:
            return f.read()
    except (OSError, ValueError) as e:
        # OSError covers missing/unreadable files; ValueError covers
        # malformed paths (e.g. embedded NUL bytes).
        print(f"Error reading {file_path}: {e}")
        return None
def extract_text_from_element(element):
    """Extract clean, single-line text from an element, including nested tags.

    All text fragments yielded by ``element.itertext()`` are concatenated,
    then every run of whitespace (spaces, newlines, tabs, non-breaking
    spaces) is collapsed to one space and the ends are trimmed. Collapsing
    matters because the text ends up in one-line Markdown list items, where
    an embedded newline would break the item.

    Args:
        element: Any object exposing ``itertext()`` (e.g. an lxml element).

    Returns:
        The normalized text; an empty string if the element has no text.
    """
    raw_text = ''.join(element.itertext())
    # str.split() with no argument splits on any Unicode whitespace and
    # drops empty fragments, so join() yields single-spaced, trimmed text.
    return ' '.join(raw_text.split())
def parse_toc_from_html(html_content, current_filename=None):
    """Parse the TOC structure out of one HTML page.

    Only the first ``<div class="toc">`` of the page is inspected. Inside it,
    every ``<span>`` whose class names a known division type or a ``sect*``
    level contributes one entry, built from the first ``<a href>`` it holds.

    Args:
        html_content: The HTML content to parse.
        current_filename: The filename being parsed (to detect same-file
            anchors).

    Returns:
        A list of dicts with the keys ``level``, ``text``, ``href``,
        ``filename`` and ``is_same_file``. The list is empty when the page
        has no TOC. If parsing fails part-way, the error is printed and the
        entries collected so far are returned.
    """
    # Span classes that mark top-level divisions of the documentation.
    top_level_classes = {
        'preface', 'part', 'chapter', 'appendix',
        'bibliography', 'index', 'reference',
    }
    entries = []

    try:
        tree = lhtml.fromstring(html_content)
        toc_divs = tree.xpath('.//div[@class="toc"]')
        if not toc_divs:
            return entries

        # Find all TOC entries with various class types
        for span in toc_divs[0].xpath('.//span[@class]'):
            span_class = span.get('class')

            # Determine entry type and level
            # NOTE(review): the value 0 for top-level divisions and the
            # fallback 1 for "sect" classes without a number were
            # reconstructed from context - confirm against the original.
            if span_class in top_level_classes:
                level = 0
            elif span_class.startswith('sect'):
                match = re.match(r'sect(\d+)', span_class)
                level = int(match.group(1)) if match else 1
            else:
                continue

            # Extract link and text
            links = span.xpath('.//a[@href]')
            if not links:
                continue
            link = links[0]
            href = link.get('href', '')
            text = extract_text_from_element(link)
            if not text or not href:
                continue

            # Skip external targets. A bare startswith('http') test would
            # also drop local files such as "http-foo.html" and would let
            # "mailto:" or "ftp:" links through, so look for a URI scheme
            # (or a protocol-relative "//host" reference) instead.
            if href.startswith('//') or re.match(r'[A-Za-z][A-Za-z0-9+.-]*:', href):
                continue

            # Extract filename (without anchor)
            filename = href.split('#', 1)[0]

            # A pure "#anchor" link, or a link naming the page being parsed,
            # stays within the current file.
            is_same_file = filename in ('', current_filename)

            entries.append({
                'level': level,
                'text': text,
                'href': href,
                'filename': filename if filename else current_filename,
                'is_same_file': is_same_file,
            })
    except Exception as e:
        # Deliberately best-effort: a page that cannot be parsed must not
        # abort the whole TOC build.
        print(f"Error parsing TOC: {e}")

    return entries
def build_complete_toc(docs_dir):
    """Build the complete TOC from index.html and every file it leads to.

    index.html is parsed first. Each distinct file it references is then
    visited once: its index entry is emitted as a top-level Markdown list
    item, followed by the entries of the file's own TOC, indented by their
    level. Files first discovered in those internal TOCs are appended to the
    work queue and handled the same way.

    Args:
        docs_dir: Directory (str or path-like) containing the HTML files.

    Returns:
        A list of Markdown lines (each ending in a newline), or ``None`` when
        index.html is missing or cannot be read.
    """
    index_path = os.path.join(docs_dir, 'index.html')
    if not os.path.exists(index_path):
        print(f"Error: {index_path} not found")
        return None

    print("Reading index.html...")
    index_html = read_html_file(index_path)
    if not index_html:
        return None

    # Get main structure from index
    print("Parsing main structure...")
    main_entries = parse_toc_from_html(index_html, 'index.html')
    print(f"Found {len(main_entries)} main entries in index.html")

    # Track which files we've processed to avoid duplicates
    processed_files = {'index.html'}

    # Collect all unique files to process, keeping first-seen order; the
    # first entry that references a file becomes that file's main entry.
    files_to_process = OrderedDict()
    for entry in main_entries:
        filename = entry['filename']
        if filename and filename != 'index.html' and filename not in files_to_process:
            files_to_process[filename] = entry

    print(f"Found {len(files_to_process)} unique files to process")

    output_lines = []

    def add_entry(entry, indent_level, parent_filename=None):
        """Add entry to output with appropriate formatting."""
        # Two spaces per level: a single space does not nest a "- " item
        # under its parent in CommonMark.
        indent = '  ' * indent_level
        filename = entry['filename']

        # Normalize spacing after a section number: any run of whitespace
        # (\s also matches non-breaking spaces) becomes one plain space.
        text = re.sub(r'(\d+)\.\s+', r'\1. ', entry['text'])

        # Add link if this entry is on a different page than its parent
        # or if it has no parent (top-level from index.html)
        if parent_filename is None or filename != parent_filename:
            output_lines.append(f"{indent}- [{text}](full-docs/html/{filename})\n")
        else:
            # Entry on same page as parent is text-only
            output_lines.append(f"{indent}- {text}\n")

    # Process each unique file. An index-driven while loop is used because
    # the list grows while it is being walked.
    print("\nProcessing files...")

    files_list = list(files_to_process.items())
    # NOTE(review): the original counter initialisation/increments were not
    # visible; a single counter serving as both queue position and the
    # 1-based progress number is assumed - confirm.
    processed_count = 0

    while processed_count < len(files_list):
        filename, main_entry = files_list[processed_count]
        processed_count += 1

        file_path = os.path.join(docs_dir, filename)

        if not os.path.exists(file_path):
            print(f"[{processed_count}] SKIP {filename} (not found)")
            continue

        print(f"[{processed_count}] {main_entry['text']}")

        # Add the main entry from index.html (no parent, gets link)
        add_entry(main_entry, indent_level=0, parent_filename=None)

        # Read the file and extract its internal TOC
        html_content = read_html_file(file_path)
        internal_entries = (
            parse_toc_from_html(html_content, filename) if html_content else []
        )

        # Add all internal entries (children of main entry)
        for entry in internal_entries:
            # Calculate indent: base level (1) + entry's internal level
            indent_level = 1 + entry['level']
            # Pass the main entry's filename as parent
            add_entry(entry, indent_level, parent_filename=filename)

            # If this entry points to a NEW file we haven't seen, add it to our queue
            child_filename = entry['filename']
            if (
                not entry['is_same_file']
                and child_filename not in processed_files
                and child_filename not in files_to_process
            ):
                files_to_process[child_filename] = entry
                files_list.append((child_filename, entry))

        processed_files.add(filename)

    print(f"\nTotal files processed: {len(processed_files)}")
    print(f"Total output lines: {len(output_lines)}")

    return output_lines
def main():
    """Build the TOC for the docs next to this script and write full_toc.md.

    The HTML files are read from ``full-docs/html`` beside the script, and
    the result is written to ``full_toc.md`` in the script's directory.

    Returns:
        0 when the TOC was written, 1 when building it failed or produced no
        lines.
    """
    # Use paths relative to script location
    script_dir = Path(__file__).parent
    docs_dir = script_dir / 'full-docs' / 'html'
    output_file = script_dir / 'full_toc.md'

    print("Building Complete PostgreSQL 18 Documentation Table of Contents")

    output_lines = build_complete_toc(docs_dir)
    if not output_lines:
        print("Failed to build TOC")
        return 1

    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(output_lines)

    print(f"Complete TOC written to: {output_file}")
    print(f"Total lines: {len(output_lines):,}")
    return 0


if __name__ == '__main__':
    # Propagate main()'s status so a failed build yields a non-zero exit code.
    raise SystemExit(main())