from langdetect import detect
from typing import List, Dict
import re

class LanguageSegmentationService:
    """
    A service to segment a string of text into a list of dictionaries,
    each tagged with its detected language.

    Detection is a script heuristic, not a statistical model: a word that
    contains at least one Arabic-script character is tagged ``"ar"``;
    everything else is tagged ``"en"``.
    """

    # Arabic-script code points: the base Arabic block, Arabic Supplement,
    # Arabic Extended-A, and Presentation Forms A/B. The last range stops at
    # U+FEFC on purpose so that U+FEFF (the byte-order mark) is never
    # mistaken for an Arabic letter.
    _ARABIC_CHAR_RE = re.compile(
        r"[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFC]"
    )

    # Digits optionally mixed with '.' and ',' separators, so that 1.2.3,
    # 1,000 and 1,000.50 are all recognised by a single pattern.
    _NUMERIC_RE = re.compile(r"[\d.,]+")

    # Punctuation removed from both ends of a token before inspecting it.
    _EDGE_PUNCTUATION = '.,!?;:()[]{}"\'-'

    def segment_text(self, text: str) -> List[Dict[str, str]]:
        """
        Takes a mixed-language string and splits it into segments.

        The text is split on whitespace and consecutive words of the same
        language are joined with single spaces. Neutral tokens (numbers,
        punctuation, ``#`` markers) never start a new segment: they are kept
        with the segment that is currently open, and neutral tokens at the
        very start of the text are attached to the first real word that
        follows them.

        Args:
            text: The input string. Empty or whitespace-only input is allowed.

        Returns:
            A list of ``{"text": ..., "language": ...}`` dictionaries in
            reading order, where language is ``"ar"`` or ``"en"``. The list
            is empty when the input contains no words.

        Example:
            Input: "هذا هو a test of the system."
            Output: [
                {'text': 'هذا هو', 'language': 'ar'},
                {'text': 'a test of the system.', 'language': 'en'}
            ]
        """
        segments: List[Dict[str, str]] = []
        words = text.split() if text else []
        if not words:
            return segments

        # None means "no language-bearing word seen yet"; this lets leading
        # neutral tokens wait for the first real word instead of forcing a
        # segment of their own.
        current_lang = None
        current_segment: List[str] = []

        for word in words:
            if self._is_neutral_token(word):
                # Neutral tokens stay with whatever segment is open.
                current_segment.append(word)
                continue

            word_lang = self._detect_word_language(word)
            if current_lang is None:
                # First real word: it decides the language of the opening
                # segment, including any neutral tokens collected so far.
                current_lang = word_lang
            elif word_lang != current_lang:
                # Language switch: close the open segment and start a new one.
                segments.append({
                    "text": " ".join(current_segment),
                    "language": current_lang,
                })
                current_lang = word_lang
                current_segment = []
            current_segment.append(word)

        if current_lang is None:
            # The text consisted only of neutral tokens; fall back to the
            # script of the first token (e.g. Arabic-Indic digits give "ar").
            current_lang = self._detect_word_language(words[0])

        # At least one word exists here, so the last segment is never empty.
        segments.append({
            "text": " ".join(current_segment),
            "language": current_lang,
        })

        print(f"Segmented text into {len(segments)} parts.")
        return segments

    def _is_neutral_token(self, word: str) -> bool:
        """
        Check if a token is 'neutral' (numbers, punctuation, special markers).
        These should stick with the current segment rather than create a new one.

        Args:
            word: A single whitespace-delimited token.

        Returns:
            True for pure punctuation, for numbers such as ``1``, ``1.2.3``,
            ``1,000`` or ``1,000.50`` (optionally carrying ``#``, e.g. ``#3``),
            and for runs of ``#`` such as ``##``. False otherwise.
        """
        stripped = word.strip(self._EDGE_PUNCTUATION)

        # Nothing left after stripping: the token was pure punctuation.
        if not stripped:
            return True

        # Ignore '#' so that tokens like "#3" are treated as numbers.
        cleaned = stripped.replace('#', '')

        # Require at least one digit so that separator-only leftovers such
        # as "#.#" are not reported as numbers.
        if self._NUMERIC_RE.fullmatch(cleaned) and any(c.isdigit() for c in cleaned):
            return True

        # Markdown-like heading markers (##, ###, etc.).
        return all(c == '#' for c in stripped)

    def _detect_word_language(self, word: str) -> str:
        """
        Detects language of a single word, defaulting to 'en' for ambiguity.

        Args:
            word: A single token.

        Returns:
            ``"ar"`` if the word contains any Arabic-script character,
            otherwise ``"en"``.
        """
        if self._ARABIC_CHAR_RE.search(word):
            return "ar"
        # Anything without Arabic script is assumed to be English.
        return "en"