import re
from num2words import num2words

"""
This module processes text containing LaTeX equations and mathematical symbols,
translating them into spoken forms suitable for text-to-speech synthesis in both
English and Arabic. It includes smart language detection based on surrounding text context.
"""

# --- 1. Symbols Dictionary ---
# Maps a regular-expression pattern to its spoken form in English ("en") and
# Arabic ("ar"). Row order is significant: the processing functions walk the
# table top to bottom, so multi-character patterns such as "=>" must stay
# ahead of the single characters they contain ("=").
SYMBOLS = {
    pattern: {"en": english, "ar": arabic}
    for pattern, english, arabic in (
        # 1. LaTeX commands (each pattern matches a literal backslash + name)
        (r"\\rightarrow", " yields ", " ينتج "),
        (r"\\leftrightarrow", " in equilibrium with ", " في حالة اتزان مع "),
        (r"\\cdot", " times ", " في "),
        (r"\\times", " times ", " في "),
        (r"\\div", " divided by ", " على "),

        # 2. Arrows typed directly in the text (Unicode and ASCII "=>")
        (r"→", " yields ", " ينتج "),
        (r"↔", " in equilibrium with ", " في حالة اتزان مع "),
        (r"=>", " yields ", " ينتج "),

        # 3. Basic math symbols ("+" and "*" are escaped for regex use)
        (r"\+", " plus ", " زائد "),
        (r"-", " minus ", " ناقص "),
        (r"\*", " times ", " في "),
        (r"×", " times ", " في "),
        (r"/", " divided by ", " على "),
        (r"÷", " divided by ", " على "),
        (r"=", " equals ", " يساوي "),
    )
}

# --- 2. Smart Language Detection ---
def detect_context_language(text: str, fallback="en") -> str:
    """Guess whether *text* is mostly English or mostly Arabic.

    Characters in the U+0600..U+06FF block are tallied as Arabic and ASCII
    letters as English, after digits, whitespace and punctuation have been
    stripped out.

    Args:
        text: The snippet to classify.
        fallback: Value returned when the snippet contains neither kind of
            character.

    Returns:
        "ar" when Arabic characters strictly outnumber English letters,
        "en" when English letters are at least as numerous (ties go to
        English), otherwise *fallback*.
    """
    # Strip ASCII digits, whitespace and every non-word character up front so
    # that punctuation living inside the Arabic block (e.g. the Arabic comma)
    # never reaches the tally below.
    letters_only = re.sub(r'[0-9\s\W]', '', text)

    arabic_total = sum(1 for ch in letters_only if '\u0600' <= ch <= '\u06FF')
    english_total = sum(
        1 for ch in letters_only if 'a' <= ch <= 'z' or 'A' <= ch <= 'Z'
    )

    if not (arabic_total or english_total):
        return fallback

    return "ar" if arabic_total > english_total else "en"

# --- 3. Processing Functions ---

def process_latex_match(match):
    r"""Turn one matched LaTeX span into its spoken form.

    Meant to be used as the replacement callback of ``re.sub``.

    Args:
        match: A regex match whose group 1 (``$...$``) or group 2
            (``\[...\]``) holds the equation body.

    Returns:
        The spoken text for the equation, or the matched text unchanged when
        the body is empty.

    Language selection: the equation's own letters decide, once LaTeX command
    names have been removed (otherwise ``\times`` alone would count as five
    English letters). If the equation has no letters of its own, up to 50
    characters of text on each side of the match are examined instead, and
    English is the last resort.
    """
    content = match.group(1) or match.group(2)
    if not content:
        return match.group(0)

    command = re.compile(r"\\[a-zA-Z]+")

    # An empty fallback doubles as a "no letters found" marker.
    lang = detect_context_language(command.sub(" ", content), fallback="")
    if not lang:
        window = 50
        source = match.string
        nearby = (
            source[max(0, match.start() - window):match.start()]
            + source[match.end():match.end() + window]
        )
        lang = detect_context_language(command.sub(" ", nearby), fallback="en")

    over_word = " over " if lang == "en" else " على "

    def speak_fraction(frac):
        return f" {frac.group(1)}{over_word}{frac.group(2)} "

    # Resolve \frac{..}{..} while the braces still delimit the operands, so a
    # numerator such as "a + b" stays in one piece. Each pass handles the
    # innermost fractions; repeat until nested ones are consumed as well.
    braced_fraction = re.compile(r"\\frac\s*\{([^{}]*)\}\s*\{([^{}]*)\}")
    replaced = 1
    while replaced:
        content, replaced = braced_fraction.subn(speak_fraction, content)

    content = content.replace('{', ' ').replace('}', ' ')

    # Brace-less fractions (\frac12, or input whose braces were already
    # stripped by the caller): take the next two whitespace-delimited tokens.
    content = re.sub(r"\\frac\s*(\S+)\s*(\S+)", f"\\1{over_word}\\2", content)

    # SYMBOLS keys are regular expressions; applying them as such lets the
    # escaped entries (r"\+", r"\*") match as well as the LaTeX commands.
    for pattern, replacement in SYMBOLS.items():
        spoken = replacement[lang]
        content = re.sub(pattern, lambda _m, spoken=spoken: spoken, content)

    return content

def process_raw_symbols(text: str) -> str:
    """Replace math symbols found in plain text with their spoken form.

    Every SYMBOLS entry except the LaTeX commands (keys that start with two
    backslashes) is applied in table order. The language of each replacement
    is chosen per occurrence from the letters in up to 50 characters of text
    on each side of the symbol, defaulting to Arabic when that window holds
    no letters.

    Args:
        text: Text that may contain raw math symbols.

    Returns:
        The text with every matched symbol replaced by its spoken word.
    """
    window = 50

    for pattern, replacement in SYMBOLS.items():
        if pattern.startswith(r"\\"):
            continue

        def speak(match, replacement=replacement):
            # Slice the window from the full string rather than from the
            # pieces between occurrences: in an equation with several "+"
            # signs the neighbouring piece can be just a few characters long
            # and would hide the surrounding words.
            source = match.string
            before = source[max(0, match.start() - window):match.start()]
            after = source[match.end():match.end() + window]
            lang = detect_context_language(before + after, fallback="ar")
            return replacement[lang]

        text = re.sub(pattern, speak, text)

    return text

def sanitize_text(text: str) -> str:
    """Strip Markdown markup and stray punctuation ahead of speech synthesis.

    The rewrite rules run strictly in the order listed below; later rules
    rely on earlier ones (links are unwrapped before brackets are blanked,
    bold markers are removed before underscores become spaces).

    Args:
        text: Raw text, possibly containing Markdown.

    Returns:
        The cleaned text with whitespace collapsed to single spaces and no
        leading or trailing whitespace.
    """
    rewrite_rules = (
        # Heading markers at the start of a line
        (re.compile(r'^#{1,6}\s+', re.MULTILINE), ''),
        # Bold markers: keep the wrapped text
        (re.compile(r'(\*\*|__)(.*?)(\*\*|__)'), r'\2'),
        # Inline code: keep the wrapped text
        (re.compile(r'`([^`]+)`'), r'\1'),
        # Links: keep the label, drop the target
        (re.compile(r'\[([^\]]+)\]\(.*?\)'), r'\1'),
        # Remaining brackets, parentheses and braces become spaces
        (re.compile(r'[\[\]\(\){}]'), ' '),
        # Underscores become spaces
        (re.compile(r'_'), ' '),
        # A hyphen not directly preceded by an ASCII letter or digit becomes
        # an Arabic comma followed by a space
        (re.compile(r'(?<![a-zA-Z0-9])\s*-\s*'), '، '),
        # Collapse every whitespace run to a single space
        (re.compile(r'\s+'), ' '),
    )

    for rule, substitute in rewrite_rules:
        text = rule.sub(substitute, text)

    return text.strip()

# --- MASTER FUNCTION ---
def prepare_text_for_audio(text: str) -> str:
    r"""Convert raw text (Markdown, LaTeX, math symbols) into speakable text.

    Steps, in order:
      1. LaTeX spans (``$...$`` and ``\[...\]``) are spoken by
         ``process_latex_match``.
      2. ``sanitize_text`` removes Markdown markup and stray punctuation.
      3. ``process_raw_symbols`` speaks math symbols left in the plain text.

    Args:
        text: The raw input text.

    Returns:
        The text with equations and symbols replaced by spoken words.
    """
    # 1. LaTeX must run before sanitizing: sanitize_text blanks out square
    #    brackets and braces and turns spaced hyphens into commas, which
    #    would make "\[...\]" unmatchable and erase minus signs in equations.
    text = re.sub(r"\$([^$]+)\$|\\\[([^]]+)\\\]", process_latex_match, text)

    # 2. Strip Markdown and normalise whitespace
    text = sanitize_text(text)

    # 3. Handle raw symbols outside LaTeX (including Unicode arrows)
    text = process_raw_symbols(text)

    return text