#!/usr/bin/env python3
"""
Build a complete, detailed table of contents for PostgreSQL 18 documentation.

Simplified Algorithm:
1. Parse index.html to get complete hierarchical structure
2. For each file referenced, visit it ONCE and extract ALL TOC entries
3. Entries linking to SAME file are text-only (no link)
4. Entries linking to DIFFERENT files are clickable links
5. No recursive crawling - just process each file's internal TOC once
"""

import os
import re
from pathlib import Path
from collections import OrderedDict

from lxml import html as lhtml


def read_html_file(file_path):
    """Read HTML file content as bytes for lxml parsing."""
    try:
        with open(file_path, 'rb') as f:
            return f.read()
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return None


def extract_text_from_element(element):
    """Extract clean text from an lxml element, handling nested tags."""
    return ''.join(element.itertext()).strip()


def parse_toc_from_html(html_content, current_filename=None):
    """
    Parse TOC structure from HTML content.

    Returns list of dicts with: level, text, href, filename, is_same_file

    Args:
        html_content: The HTML content to parse
        current_filename: The filename being parsed (to detect same-file anchors)
    """
    entries = []
    try:
        tree = lhtml.fromstring(html_content)
        toc_divs = tree.xpath('.//div[@class="toc"]')
        if not toc_divs:
            return entries
        toc_div = toc_divs[0]

        # Find all TOC entries with various class types
        for span in toc_div.xpath('.//span[@class]'):
            span_class = span.get('class')

            # Determine entry type and level
            if span_class in ['preface', 'part', 'chapter', 'appendix',
                              'bibliography', 'index', 'reference']:
                level = 0
            elif span_class.startswith('sect'):
                match = re.match(r'sect(\d+)', span_class)
                if match:
                    level = int(match.group(1))
                else:
                    continue
            else:
                continue

            # Extract link and text
            links = span.xpath('.//a[@href]')
            if links:
                link = links[0]
                href = link.get('href', '')
                text = extract_text_from_element(link)

                if text and href and not href.startswith('http'):
                    # Extract filename (without anchor)
                    filename = href.split('#')[0] if '#' in href else href

                    # Determine if this is a same-file anchor
                    is_same_file = (filename == '' or filename == current_filename)

                    entries.append({
                        'level': level,
                        'text': text,
                        'href': href,
                        'filename': filename if filename else current_filename,
                        'is_same_file': is_same_file,
                        'class': span_class
                    })
    except Exception as e:
        print(f"Error parsing TOC: {e}")

    return entries


def build_complete_toc(docs_dir):
    """
    Build complete TOC by parsing index.html and then processing each
    referenced file once.
    """
    index_path = os.path.join(docs_dir, 'index.html')
    if not os.path.exists(index_path):
        print(f"Error: {index_path} not found")
        return None

    print("Reading index.html...")
    index_html = read_html_file(index_path)
    if not index_html:
        return None

    # Get main structure from index
    print("Parsing main structure...")
    main_entries = parse_toc_from_html(index_html, 'index.html')
    print(f"Found {len(main_entries)} main entries in index.html")

    # Track which files we've processed to avoid duplicates
    processed_files = {'index.html'}

    # Collect all unique files to process
    files_to_process = OrderedDict()
    for entry in main_entries:
        filename = entry['filename']
        if filename and filename != 'index.html' and filename not in files_to_process:
            files_to_process[filename] = entry

    print(f"Found {len(files_to_process)} unique files to process")

    # Build output
    output_lines = []

    def add_entry(entry, indent_level, parent_filename=None):
        """Add entry to output with appropriate formatting."""
        indent = '  ' * indent_level
        text = entry['text']
        filename = entry['filename']

        # Normalize spacing after section numbers (collapse "1.  What" to "1. What")
        text = re.sub(r'(\d+)\.\s+', r'\1. ', text)

        # Add link if this entry is on a different page than its parent
        # or if it has no parent (top-level from index.html)
        if parent_filename is None or filename != parent_filename:
            # Entry on different page gets a link
            output_lines.append(f"{indent}- [{text}](full-docs/html/{filename})\n")
        else:
            # Entry on same page as parent is text-only
            output_lines.append(f"{indent}- {text}\n")

    # Process each unique file
    # Use a while loop to allow dynamic addition of new files
    print("\nProcessing files...")
    processed_count = 0
    files_list = list(files_to_process.items())
    file_index = 0

    while file_index < len(files_list):
        filename, main_entry = files_list[file_index]
        file_index += 1
        processed_count += 1

        file_path = os.path.join(docs_dir, filename)
        if not os.path.exists(file_path):
            print(f"[{processed_count}] SKIP {filename} (not found)")
            continue

        print(f"[{processed_count}] {main_entry['text']}")

        # Add the main entry from index.html (no parent, gets link)
        add_entry(main_entry, indent_level=0, parent_filename=None)

        # Read the file and extract its internal TOC
        html_content = read_html_file(file_path)
        if html_content:
            internal_entries = parse_toc_from_html(html_content, filename)

            # Add all internal entries (children of main entry)
            for entry in internal_entries:
                # Calculate indent: base level (1) + entry's internal level
                indent_level = 1 + entry['level']
                # Pass the main entry's filename as parent
                add_entry(entry, indent_level, parent_filename=filename)

                # If this entry points to a NEW file we haven't seen, add it to our queue
                if not entry['is_same_file'] and entry['filename'] not in processed_files:
                    child_filename = entry['filename']
                    if child_filename not in files_to_process:
                        files_to_process[child_filename] = entry
                        files_list.append((child_filename, entry))

        processed_files.add(filename)

    print(f"\nTotal files processed: {len(processed_files)}")
    print(f"Total output lines: {len(output_lines)}")

    return output_lines


def main():
    # Use paths relative to script location
    script_dir = Path(__file__).parent
    docs_dir = script_dir / 'full-docs' / 'html'
    output_file = script_dir / 'full_toc.md'

    print("=" * 70)
    print("Building Complete PostgreSQL 18 Documentation Table of Contents")
    print("=" * 70)
    print()

    output_lines = build_complete_toc(docs_dir)

    if output_lines:
        with open(output_file, 'w', encoding='utf-8') as f:
            f.writelines(output_lines)
        print()
        print("=" * 70)
        print(f"Complete TOC written to: {output_file}")
        print(f"Total lines: {len(output_lines):,}")
        print("=" * 70)
        return 0
    else:
        print("Failed to build TOC")
        return 1


if __name__ == '__main__':
    exit(main())
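
# Usage notes (illustrative only; the script filename below is a hypothetical
# example, not something defined by this file):
#
#   python3 build_full_toc.py
#
# The script expects a full-docs/html/ directory next to it containing the
# PostgreSQL 18 HTML documentation and writes the result to full_toc.md.
#
# Hypothetical excerpt of the generated Markdown (not real output): an entry
# that lives on a different page than its parent becomes a link, while a
# same-page entry stays as plain text:
#
#   - [Chapter 9. Functions and Operators](full-docs/html/functions.html)
#       - [9.1. Logical Operators](full-docs/html/functions-logical.html)
#       - 9.2. Comparison Functions and Operators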