import logging
import re

import nltk
from num2words import num2words

def split_text_into_chunks(text: str, max_chars: int):
    """
    Split a long text into smaller chunks along sentence boundaries.

    Sentences (as returned by ``nltk.sent_tokenize``) are packed greedily,
    in their original order, into chunks. A sentence that is itself longer
    than ``max_chars`` is broken on whitespace into groups of words, each
    emitted as its own chunk.

    Args:
        text: The text to split.
        max_chars: Target upper bound on the length of each chunk.

    Returns:
        A list of stripped, non-empty chunk strings in reading order.
        A single word longer than ``max_chars`` is never cut; it is
        returned as a chunk of its own and is the only case in which a
        chunk can exceed the limit.
    """
    chunks = []
    current_chunk = ""

    for sentence in nltk.sent_tokenize(text):
        # The "+ 1" accounts for the separating space appended after
        # every sentence while the chunk is being built.
        if len(current_chunk) + len(sentence) + 1 <= max_chars:
            current_chunk += sentence + " "
            continue

        # The sentence does not fit: flush what has been collected so far.
        # Resetting the buffer is essential; otherwise the flushed text
        # would be emitted again (and out of order) after an over-long
        # sentence has been handled below.
        if current_chunk:
            chunks.append(current_chunk.strip())
            current_chunk = ""

        if len(sentence) <= max_chars:
            current_chunk = sentence + " "
            continue

        # The sentence alone exceeds the limit: fall back to word groups.
        temp_chunk = ""
        for word in sentence.split():
            if len(temp_chunk) + len(word) + 1 <= max_chars:
                temp_chunk += word + " "
            else:
                # Guard against appending an empty chunk when the very
                # first word is already longer than max_chars.
                if temp_chunk:
                    chunks.append(temp_chunk.strip())
                temp_chunk = word + " "
        if temp_chunk:
            chunks.append(temp_chunk.strip())

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks

import re

def sanitize_text(text: str) -> str:
    """
    Clean a string of Markdown and symbols a TTS model should not pronounce.

    The substitutions are order-sensitive: constructs whose delimiters are
    a superset of another construct's delimiters are handled first
    (horizontal rules before emphasis, fenced code before inline code,
    images before links), so the narrower rule cannot consume part of the
    wider one and leave stray markers behind.

    Args:
        text: Raw text, possibly containing Markdown.

    Returns:
        The cleaned text on a single line: whitespace runs are collapsed
        to one space, colons become periods, runs of sentence punctuation
        collapse to a single period, and a leading or trailing period is
        removed.
    """
    # 1. Remove Markdown headers (##, ###, etc.)
    text = re.sub(r'^#{1,6}\s+', '', text, flags=re.MULTILINE)

    # 2. Remove horizontal rules. Must precede emphasis stripping, which
    #    would otherwise eat two of the three characters of `___` and
    #    leave a lone marker in the output.
    text = re.sub(r'^[-*_]{3,}\s*$', '', text, flags=re.MULTILINE)

    # 3. Remove Markdown bold/italic markers (asterisks and underscores)
    text = re.sub(r'(\*\*|__)(.*?)(\*\*|__)', r'\2', text)  # **bold** or __bold__
    text = re.sub(r'(\*|_)(.*?)(\*|_)', r'\2', text)  # *italic* or _italic_

    # 4. Remove list item markers at the beginning of a line
    text = re.sub(r'^\s*[-*+]\s+', '', text, flags=re.MULTILINE)  # Unordered lists
    text = re.sub(r'^\s*\d+\.\s+', '', text, flags=re.MULTILINE)  # Numbered lists (1. 2. 3.)

    # 5. Remove code markers but keep the code. Fenced blocks go first:
    #    the inline rule would otherwise pair the last backtick of the
    #    opening fence with the first backtick of the closing fence.
    text = re.sub(r'```[^\n]*\n(.*?)```', r'\1', text, flags=re.DOTALL)  # ```code blocks```
    text = re.sub(r'`([^`]+)`', r'\1', text)  # `code`

    # 6. Remove images entirely. Must precede link handling, because
    #    `![alt](url)` contains a complete `[alt](url)` link.
    text = re.sub(r'!\[([^\]]*)\]\([^\)]+\)', '', text)  # ![alt](url)

    # 7. Remove links but keep the text
    text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)  # [text](url) -> text
    text = re.sub(r'<([^>]+)>', r'\1', text)  # <url> -> url

    # 8. Remove blockquote markers
    text = re.sub(r'^\s*>\s+', '', text, flags=re.MULTILINE)

    # 9. Handle colons: replace with period for natural pause
    text = text.replace(':', '.')

    # 10. Remove brackets, parentheses, and braces
    text = re.sub(r'[\[\]\(\){}]', '', text)

    # 11. Remove any remaining hash symbols, tildes and carets
    text = re.sub(r'#+', '', text)
    text = re.sub(r'[~^]', '', text)

    # 12. Collapse any run of two or more ., ! or ? into a single period
    #     (this also covers plain runs of periods such as "...").
    text = re.sub(r'[.!?]{2,}', '.', text)

    # 13. Normalize whitespace to a single space
    text = re.sub(r'\s+', ' ', text).strip()

    # 14. Remove a leading/trailing period that might result from the cleaning
    text = re.sub(r'^\.\s*|\s*\.$', '', text).strip()

    return text

# Spoken replacements for math/chemistry symbols. Each key is a symbol as it
# appears in an equation; each value maps a language code ("en", "ar") to the
# phrase substituted for it. Phrases carry their own surrounding spaces and
# commas. Entries are listed in the order the mapping is iterated.
SYMBOLS = {
    symbol: {"en": english, "ar": arabic}
    for symbol, english, arabic in (
        # Reaction arrows
        (r"\rightarrow", ", yields, ", "، ينتج، "),
        (r"\leftrightarrow", ", in equilibrium with, ", "، في حالة اتزان مع، "),
        # Addition and subtraction
        ("+", " plus ", " زائد "),
        ("-", " minus ", " ناقص "),
        # Multiplication: ASCII asterisk, formal sign, LaTeX command
        ("*", " times ", " في "),
        ("×", " times ", " في "),
        (r"\cdot", " times ", " في "),
        # Division: ASCII slash and formal sign
        ("/", " divided by ", " على "),
        ("÷", " divided by ", " على "),
        # Equality
        ("=", ", equals, ", "، يساوي، "),
    )
}


def expand_element(element: str, lang: str = "en") -> str:
    """
    Spell out a chemical-formula-like token for speech.

    Every letter is separated from the letter or digit that follows it,
    every digit is separated from a letter that follows it, and each run
    of digits is then replaced by its word form via ``num2words`` in the
    requested language.

    - 'CO2' becomes 'C O two'
    - 'H2O' becomes 'H two O'

    A token containing no ASCII letter at all is returned untouched.
    """
    if re.search(r'[A-Za-z]', element) is None:
        return element

    def _number_to_words(match):
        # Turn one matched run of digits into its spoken form.
        return num2words(int(match.group(1)), lang=lang)

    # Insert a space after a letter that precedes a letter/digit, then
    # after a digit that precedes a letter.
    separated = element
    for boundary in (r"([A-Za-z])(?=[A-Za-z\d])", r"(\d)(?=[A-Za-z])"):
        separated = re.sub(boundary, r"\1 ", separated)

    return re.sub(r"(\d+)", _number_to_words, separated)


def equation_to_speech_single(equation: str, lang: str = "en") -> str:
    """
    Convert a single LaTeX or plain text equation into a pronounceable string.

    The equation is processed in four stages: symbols listed in ``SYMBOLS``
    are replaced by their spoken phrase for ``lang``; LaTeX fractions,
    braced sub/superscripts and leftover braces are flattened; the result
    is split into operator phrases and the text between them; and each
    in-between fragment is spelled out with ``expand_element``.

    Args:
        equation: Equation text without its surrounding math delimiters.
        lang: Language key used to look up phrases in ``SYMBOLS`` and
            passed on to ``expand_element``.

    Returns:
        The spoken fragments and operator phrases joined by single spaces.

    Raises:
        KeyError: If ``lang`` has no entry in ``SYMBOLS``.
    """
    logging.getLogger(__name__).debug(
        "equation_to_speech_single input: %r", equation
    )

    # A LaTeX command written in a non-raw string has its leading
    # backslash + letter interpreted as a control character (e.g.
    # "\rightarrow" arrives as a carriage return followed by "ightarrow").
    # Turn those control characters back into backslash + letter.
    equation = equation.replace('\r', '\\r')
    equation = equation.replace('\n', '\\n')
    equation = equation.replace('\t', '\\t')

    # 1. Replace symbols with pronounceable text before any other processing.
    for symbol, replacement in SYMBOLS.items():
        equation = equation.replace(symbol, replacement[lang])

    # 2. Flatten LaTeX structural commands. English keeps the phrase
    #    " over " for fractions; other languages reuse their division phrase.
    over_phrase = " over " if lang == "en" else SYMBOLS["/"][lang]
    equation = re.sub(
        r"\\frac\{([^}]+)\}\{([^}]+)\}",
        lambda m: f"{m.group(1)}{over_phrase}{m.group(2)}",
        equation,
    )
    equation = re.sub(r"_\{([^}]+)\}", r"\1", equation)  # H_{2} -> H2
    equation = re.sub(r"\^\{([^}]+)\}", r"\1", equation)  # x^{2} -> x2
    equation = equation.replace("{", "").replace("}", "")

    # 3. Split on every operator phrase that stages 1-2 can have produced
    #    for this language, so none of them is later mistaken for a formula
    #    and spelled out letter by letter. Longer phrases are tried first.
    phrases = dict.fromkeys(
        replacement[lang] for replacement in SYMBOLS.values()
    )
    phrases[over_phrase] = None
    alternation = "|".join(
        re.escape(phrase) for phrase in sorted(phrases, key=len, reverse=True)
    )
    # With a capturing group, re.split keeps the separators: even indexes
    # hold the text between operators, odd indexes hold the operators.
    pieces = re.split(f"({alternation})", equation)

    # 4. Keep operators as they are; expand everything else.
    spoken = []
    for index, piece in enumerate(pieces):
        piece = piece.strip()
        if not piece:
            continue
        if index % 2:
            spoken.append(piece)
        else:
            spoken.append(expand_element(piece, lang))

    return " ".join(spoken)


def translate_equations_in_text(text: str, lang: str = "en") -> str:
    """
    Find all LaTeX equations in a text block and replace them with speech.

    Recognised delimiters are ``$$...$$``, ``$...$`` and ``\\[...\\]``.
    The content of each match is passed to ``equation_to_speech_single``
    and the whole match, delimiters included, is replaced by the result.

    Args:
        text: Text that may contain delimited equations.
        lang: Language key forwarded to ``equation_to_speech_single``.

    Returns:
        The text with every recognised equation replaced; text without
        equations is returned unchanged.
    """
    # `$$...$$` has to be tried before `$...$`; otherwise only the inner
    # `$...$` is matched and the outer dollar signs are left in the text.
    pattern = re.compile(r"\$\$([^$]+)\$\$|\$([^$]+)\$|\\\[([^]]+)\\\]")

    def replacer(match):
        # Exactly one capture group is populated, depending on which
        # delimiter style matched.
        content = next((group for group in match.groups() if group), None)
        if content:
            return equation_to_speech_single(content, lang)
        return match.group(0)

    return pattern.sub(replacer, text)