annan voice working

parent 007ab75f
@@ -14,7 +14,7 @@ from repositories import StorageRepository, MinIOStorageRepository
from handlers import AudioMessageHandler, TextMessageHandler
from services import (
AudioService, ChatService, HealthService, ResponseService,
ResponseManager, OpenAIService, AgentService, ConnectionPool, PGVectorService, ChatDatabaseService
ResponseManager, OpenAIService, AgentService, ConnectionPool, PGVectorService, ChatDatabaseService, LanguageSegmentationService
)
class DIContainer:
@@ -38,12 +38,14 @@ class DIContainer:
# Initialize services
self.audio_service = AudioService(self.storage_repo, self.config.minio_bucket)
self.segmentation_service = LanguageSegmentationService()
self.chat_service = ChatService(
self.storage_repo,
self.response_manager,
self.config,
self.openai_service,
self.agent_service
self.agent_service,
self.segmentation_service
)
self.response_service = ResponseService(self.response_manager, self.audio_service)
self.health_service = HealthService(self.storage_repo, self.config)
......
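For reference, a minimal sketch of the call path this wiring enables, assuming DIContainer needs no constructor arguments and that the HTTP layer (not shown in this diff) reaches chat_service directly; the student id and message below are illustrative:

container = DIContainer()

# The agent's reply is segmented by LanguageSegmentationService and synthesized
# via the provider's generate_speech_from_sequence, then uploaded to MinIO.
container.chat_service.process_message(
    student_id="demo-student",            # illustrative value
    text="هذا هو a test of the system.",  # mixed Arabic/English input
)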
@@ -11,3 +11,4 @@ pgvector
pandas
python-dotenv
httpx
langdetect
@@ -9,3 +9,4 @@ from .pgvector_service import PGVectorService
from .chat_database_service import ChatDatabaseService
from .connection_pool import ConnectionPool
from .pedagogy_service import PedagogyService
from .segmentation_service import LanguageSegmentationService
\ No newline at end of file
@@ -16,6 +16,16 @@ SYSTEM_PROMPTS: Dict[Tuple[StudentNationality, StudyLanguage], str] = {
⚠️ مهم: لا تستخدم هذا النص في أي حالة أخرى غير سؤال الهوية.
⚠️ مهم جدًا: مَمنوع تقول أبداً إنك AI أو برنامج أو روبوت. إنت عنان وبس.
⚠️ مُهِم: لازم تِشكّل كُلّ كَلِمَة في الرَد تَشكيل كَامِل زَي ما الكِلمة المَفروض تتنِطق في اللَّهجَة المَصريّة.
مفيش كَلِمَة تِتْسَاب من غير تشكيل.
أمثلة:
* "إزّيك؟" → "إِزَّيَّك؟"
* "عامل إيه؟" → "عَامِلْ إِيهْ؟"
* "أنا ماشي" → "أَنَا مَاشِي"
* "إنت في سنة رابعة ابتدائي" → "إِنْتَ فِي سَنَة رَابْعَة اِبْتِدَائِي"
اِتَّبِع دايمًا قواعد التشكيل دي عَشان الـTTS يِنطُق الكَلام باللَّهجَة المَصريّة مَظبوط.
لو سأل أسئلة عامة زي "نت عارف انا مين؟" أو "إزيك؟" أو "شكرا"، رد بطريقة طبيعية ودودة باستخدام اسمه {student_name}.
@@ -106,6 +116,16 @@ SYSTEM_PROMPTS: Dict[Tuple[StudentNationality, StudyLanguage], str] = {
⚠️ مهم: لا تستخدم هذا النص في أي حالة أخرى غير سؤال الهوية.
⚠️ مهم جدًا: مَمنوع تقول أبداً إنك AI أو برنامج أو روبوت. إنت عنان وبس.
⚠️ مُهِم: لازم تِشكّل كُلّ كَلِمَة في الرَد تَشكيل كَامِل زَي ما الكِلمة المَفروض تتنِطق في اللَّهجَة المَصريّة.
مفيش كَلِمَة تِتْسَاب من غير تشكيل.
أمثلة:
* "إزّيك؟" → "إِزَّيَّك؟"
* "عامل إيه؟" → "عَامِلْ إِيهْ؟"
* "أنا ماشي" → "أَنَا مَاشِي"
* "إنت في سنة رابعة ابتدائي" → "إِنْتَ فِي سَنَة رَابْعَة اِبْتِدَائِي"
اِتَّبِع دايمًا قواعد التشكيل دي عَشان الـTTS يِنطُق الكَلام باللَّهجَة المَصريّة مَظبوط.
لو سأل أسئلة عامة مثل "نت عارف انا مين؟" أو "كيفك؟" أو "شكرا"، رد بطريقة طبيعية ودودة باستخدام اسمه {student_name}.
......
@@ -12,10 +12,11 @@ from repositories import StorageRepository
from services.response_manager import ResponseManager
from services.openai_service import OpenAIService
from services.agent_service import AgentService
from services.segmentation_service import LanguageSegmentationService
class ChatService:
def __init__(self, storage_repo: StorageRepository, response_manager: ResponseManager,
config: AppConfig, openai_service: OpenAIService, agent_service: AgentService):
config: AppConfig, openai_service: OpenAIService, agent_service: AgentService, segmentation_service: LanguageSegmentationService):
from handlers import AudioMessageHandler, TextMessageHandler
self.storage_repo = storage_repo
@@ -23,12 +24,15 @@ class ChatService:
self.config = config
self.openai_service = openai_service
self.agent_service = agent_service
self.segmentation_service = segmentation_service
self.handlers = {
MessageType.AUDIO: AudioMessageHandler(storage_repo, config.minio_bucket, openai_service),
MessageType.TEXT: TextMessageHandler()
}
def process_message(self, student_id: str, file: Optional[UploadFile] = None, text: Optional[str] = None):
"""Process message and generate text and audio response."""
self.response_manager.clear_response()
@@ -46,10 +50,8 @@ class ChatService:
student_id=student_id,
)
# --- MODIFIED: Call the audio generation method ---
audio_data = self._generate_and_upload_audio(agent_response_text, student_id)
# --- FIXED: Use the correct 'store_response' method name ---
self.response_manager.store_response(
text=agent_response_text,
audio_filename=audio_data.get("filename"),
@@ -72,34 +74,33 @@ class ChatService:
def _generate_and_upload_audio(self, text: str, student_id: str) -> dict:
"""
Generates TTS audio and uploads the resulting audio bytes directly to MinIO.
Segments mixed-language text and generates TTS audio using the pluggable
AgentService, then uploads the final audio to MinIO.
"""
try:
student_info = self.agent_service.db_service.get_student_info(student_id)
if not student_info:
raise ValueError(f"Could not find student {student_id} for TTS.")
language = "ar" if student_info.get('is_arabic') else "en"
# 1. Segment the text into language-tagged parts
segments = self.segmentation_service.segment_text(text)
audio_bytes = self.agent_service.text_to_speech(text, language)
# 2. Generate a single, stitched audio file from the sequence
# This call will be routed correctly by the tts_manager
audio_bytes = self.agent_service.tts_service.generate_speech_from_sequence(segments)
# 3. Determine filename and upload (same as before)
provider = os.getenv("TTS_PROVIDER", "openai").lower()
file_extension = "wav" if provider == "custom" else "mp3"
content_type = "audio/wav" if provider == "custom" else "audio/mpeg"
timestamp = int(time.time())
filename = f"agent_response_{timestamp}.{file_extension}"
minio_file_path = f"audio/{filename}"
print(f"Uploading audio to MinIO: {minio_file_path}")
# --- FIXED: Call the upload method with the correct argument names ---
# Your MinIO repo uses 'upload_fileobj' which matches this call.
self.storage_repo.upload_file(
file_obj=io.BytesIO(audio_bytes),
bucket=self.config.minio_bucket,
file_path=minio_file_path
)
print(f"Successfully generated and uploaded TTS audio: {filename}")
print(f"Successfully generated and uploaded stitched TTS audio: {filename}")
return {"bytes": audio_bytes, "filename": filename}
except Exception as e:
......
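The "routed correctly by the tts_manager" comment above refers to provider selection that is not part of this diff. A minimal sketch of such a factory, assuming the module name, import paths, and no-argument constructors (all illustrative); it mirrors the TTS_PROVIDER check already used for the file extension:

# tts_manager.py -- hypothetical factory; the real routing module is not in this MR
import os

from services.base_tts_service import BaseTTSService    # paths assumed
from services.custom_tts_service import CustomTTSService
from services.openai_service import OpenAIService


def get_tts_service() -> BaseTTSService:
    """Return the TTS provider named by TTS_PROVIDER, defaulting to OpenAI."""
    provider = os.getenv("TTS_PROVIDER", "openai").lower()
    if provider == "custom":
        return CustomTTSService()
    return OpenAIService()  # assumes OpenAIService() reads its API key from the environment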
@@ -2,7 +2,7 @@ import os
import time
import tempfile
import io
from typing import Optional, List
from typing import Optional, List, Dict
from fastapi import HTTPException
from openai import OpenAI
import sys
@@ -80,6 +80,18 @@ class OpenAIService(BaseTTSService):
print(f"Error during OpenAI TTS generation: {e}")
raise HTTPException(status_code=500, detail=f"OpenAI TTS generation failed: {str(e)}")
def generate_speech_from_sequence(self, segments: List[Dict[str, str]]) -> bytes:
"""
Fallback implementation for OpenAI. It combines the text from all
segments and makes a single TTS call.
"""
print("OpenAI provider: combining segments for a single TTS call.")
full_text = " ".join([segment['text'] for segment in segments])
# Just call the existing simple method
return self.generate_speech(full_text)
# ------------------- Embeddings -------------------
def generate_embedding(self, text: str) -> List[float]:
"""
@@ -118,3 +130,6 @@ class OpenAIService(BaseTTSService):
print(f"Cleaned up temporary file: {file_path}")
except Exception as e:
print(f"Warning: Could not clean up temp file {file_path}: {e}")
\ No newline at end of file
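Note that this fallback voices the whole mixed sentence in one call, so the per-segment language tags are effectively ignored for the OpenAI provider. If true per-segment synthesis were wanted client-side, one option would be to synthesize each segment and concatenate the audio; a sketch assuming pydub (which is not in this MR's requirements):

# Hypothetical per-segment alternative to the fallback above; illustration only.
import io
from typing import Dict, List

from pydub import AudioSegment  # assumption: pydub (and ffmpeg) installed separately


def stitch_segments(tts, segments: List[Dict[str, str]]) -> bytes:
    """Synthesize each language-tagged segment and concatenate the MP3 audio.

    `tts` is any service exposing generate_speech(text) -> MP3 bytes.
    """
    combined = AudioSegment.empty()
    for segment in segments:
        mp3_bytes = tts.generate_speech(segment["text"])  # one TTS call per segment
        combined += AudioSegment.from_file(io.BytesIO(mp3_bytes), format="mp3")
    buffer = io.BytesIO()
    combined.export(buffer, format="mp3")
    return buffer.getvalue()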
from langdetect import detect  # note: currently unused here; _detect_word_language below uses a character-range check instead
from typing import List, Dict
class LanguageSegmentationService:
"""
A service to segment a string of text into a list of dictionaries,
each tagged with its detected language.
"""
def segment_text(self, text: str) -> List[Dict[str, str]]:
"""
Takes a mixed-language string and splits it into segments.
Example:
Input: "هذا هو a test of the system."
Output: [
{'text': 'هذا هو', 'language': 'ar'},
{'text': 'a test of the system.', 'language': 'en'}
]
"""
segments = []
if not text:
return segments
words = text.split()
if not words:
return segments
# Start with the language of the first word
current_lang = self._detect_word_language(words[0])
current_segment = []
for word in words:
word_lang = self._detect_word_language(word)
if word_lang == current_lang:
# If the language is the same, add the word to the current segment
current_segment.append(word)
else:
# If the language changes, finalize the previous segment
if current_segment:
segments.append({
"text": " ".join(current_segment),
"language": current_lang
})
# Start a new segment with the new word and language
current_lang = word_lang
current_segment = [word]
# Add the final remaining segment
if current_segment:
segments.append({
"text": " ".join(current_segment),
"language": current_lang
})
print(f"Segmented text into {len(segments)} parts.")
return segments
def _detect_word_language(self, word: str) -> str:
"""Detects language of a single word, defaulting to 'en' for ambiguity."""
# Simple heuristic: if it contains any Arabic characters, it's Arabic.
if any('\u0600' <= char <= '\u06FF' for char in word):
return "ar"
# For non-Arabic words, we can assume English
return "en"
\ No newline at end of file
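A quick usage check of the greedy word-grouping above, matching the docstring example:

from services import LanguageSegmentationService

service = LanguageSegmentationService()
print(service.segment_text("هذا هو a test of the system."))
# -> [{'text': 'هذا هو', 'language': 'ar'},
#     {'text': 'a test of the system.', 'language': 'en'}]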
from abc import ABC, abstractmethod
from typing import List, Dict
class BaseTTSService(ABC):
"""
@@ -23,4 +24,12 @@ class BaseTTSService(ABC):
Returns:
bytes: The raw audio data of the speech (e.g., in WAV or MP3 format).
"""
pass
@abstractmethod
def generate_speech_from_sequence(self, segments: List[Dict[str, str]]) -> bytes:
"""
Generates a single audio file from a list of language-tagged text segments.
This is for handling mixed-language sentences.
"""
pass
\ No newline at end of file
import os
import httpx
from typing import List, Dict
from .base_tts_service import BaseTTSService
class CustomTTSService(BaseTTSService):
@@ -8,9 +10,11 @@ class CustomTTSService(BaseTTSService):
"""
def __init__(self):
# Read the URL of our FastAPI server from an environment variable
self.api_url = os.getenv("CUSTOM_TTS_URL", "http://localhost:5000/synthesize")
self._is_available = bool(self.api_url)
print(f"Custom TTS Service initialized. API URL: {self.api_url}")
base_url = os.getenv("CUSTOM_TTS_URL", "http://localhost:5000")
self.api_url = f"{base_url}/synthesize"
self.sequence_api_url = f"{base_url}/synthesize_sequence"
self._is_available = bool(base_url)
print(f"Custom TTS Service initialized. Base URL: {base_url}")
def is_available(self) -> bool:
return self._is_available
@@ -42,4 +46,24 @@ class CustomTTSService(BaseTTSService):
except httpx.RequestError as e:
print(f"Error calling custom TTS service: {e}")
# Re-raise as a standard ConnectionError
raise ConnectionError(f"Failed to connect to custom TTS service at {self.api_url}") from e
\ No newline at end of file
raise ConnectionError(f"Failed to connect to custom TTS service at {self.api_url}") from e
def generate_speech_from_sequence(self, segments: List[Dict[str, str]]) -> bytes:
"""Makes a single POST request with the list of segments."""
if not self.is_available():
raise ConnectionError("Custom TTS service is not configured.")
try:
with httpx.Client() as client:
print(f"Sending sequence of {len(segments)} segments to custom TTS service.")
response = client.post(
self.sequence_api_url,
json={"segments": segments}, # Send the list directly
timeout=300.0 # Longer timeout for sequence processing
)
response.raise_for_status()
audio_bytes = response.content
print("Successfully received stitched audio from custom TTS service.")
return audio_bytes
except httpx.RequestError as e:
raise ConnectionError(f"Failed to connect to custom TTS at {self.sequence_api_url}") from e
\ No newline at end of file
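The /synthesize_sequence endpoint is assumed to exist on the custom TTS server, which is outside this MR. A minimal sketch of the expected contract (FastAPI; all names illustrative): the request body is {"segments": [{"text": ..., "language": ...}, ...]} and the response is a single stitched audio file.

# Hypothetical server-side handler matching the request CustomTTSService sends above.
# The real TTS server is not in this MR; per-segment synthesis and stitching are stubbed.
from typing import Dict, List

from fastapi import FastAPI, Response
from pydantic import BaseModel

app = FastAPI()


class SequenceRequest(BaseModel):
    segments: List[Dict[str, str]]  # [{"text": "...", "language": "ar"}, ...]


def synthesize_and_stitch(segments: List[Dict[str, str]]) -> bytes:
    """Placeholder: run TTS per segment with the matching voice and join the audio."""
    raise NotImplementedError


@app.post("/synthesize_sequence")
def synthesize_sequence(request: SequenceRequest) -> Response:
    # Return one WAV body, matching the 'custom' -> wav mapping in chat_service.
    wav_bytes = synthesize_and_stitch(request.segments)
    return Response(content=wav_bytes, media_type="audio/wav")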