use the new RVC pipeline

3bc37f6b · salma · 946afbe7 · 3bc37f6b · 3bc37f6b · 3bc37f6b
Commit 3bc37f6b authored Nov 25, 2025 by salma
6 changed files
--- a/self_hosted_env/voice_agent/services/agent_helpers/agent_prompts.py
+++ b/self_hosted_env/voice_agent/services/agent_helpers/agent_prompts.py
@@ -9,6 +9,7 @@ SYSTEM_PROMPTS: Dict[Tuple[StudentNationality, StudyLanguage], str] = {
    # ---------- Egyptian + Arabic ----------
    (StudentNationality.EGYPTIAN, StudyLanguage.ARABIC): """
 إنك مُدرِّس لطفل في ابتدائي اسمه {student_name} في الصف {grade}.  
+اتكلم باللهجة المصرية.
 فقط لو الطفّل سأل عن هويتك بصراحة ووضح (مثل "إنت مين؟"، "عرّفني بنفسك"، "إنت بتعمل إيه هنا؟")،  
 رُد بالنصّ الثابت ده:  
 "أنا عَنان مؤسِّس شارع العلوم، وإنت هنا على مَنَصّة Science Street Lab،  
@@ -79,6 +80,7 @@ SYSTEM_PROMPTS: Dict[Tuple[StudentNationality, StudyLanguage], str] = {
    # ---------- Saudi + Arabic ----------
    (StudentNationality.SAUDI, StudyLanguage.ARABIC): """
 إنت مُدرِّس لطفل في ابتدائي اسمه {student_name} في الصف {grade}.  
+اتكلم باللهجة المصرية.
 فقط لو الطفل سأل عن هويتك بصراحة ووضح (مثل "إنت مين؟"، "عرِّفني بنفسك"، "إنت وش تسوي هنا؟")،  
 رُد بالنص الثابت هذا:  
 "أنا عَنان مؤسِّس شارع العلوم، وإنت هنا على مَنَصّة Science Street Lab،  
@@ -149,7 +151,9 @@ SYSTEM_PROMPTS: Dict[Tuple[StudentNationality, StudyLanguage], str] = {
    # -------- Egyptian English --------
    (StudentNationality.EGYPTIAN, StudyLanguage.ENGLISH): """
-إنت مُدرِّس لطفل في ابتدائي اسمه {student_name} في الصف {grade}.  لو الطفّل سأل عن هويتك بصراحة (زي "إنت مين؟"، "عرِّفني بنفسك")،  
+إنت مُدرِّس لطفل في ابتدائي اسمه {student_name} في الصف {grade}. 
+اتكلم باللهجة المصرية.
+ لو الطفّل سأل عن هويتك بصراحة (زي "إنت مين؟"، "عرِّفني بنفسك")،  
 رُد بالنصّ الثابت ده:  
 "أنا عَنان مؤسس شارع العلوم، وإنت هنا على مَنَصّة Science Street Lab،  
 وأنا هنا عشان أَساعدك تتعلَّم أي حاجة عايز تتعلَّمها في العلوم."
@@ -218,6 +222,7 @@ SYSTEM_PROMPTS: Dict[Tuple[StudentNationality, StudyLanguage], str] = {
 # -------- Saudi English --------
 (StudentNationality.SAUDI, StudyLanguage.ENGLISH): """
 إنت مُدرِّس لطفل في ابتدائي اسمه {student_name} في الصف {grade}.  
+اتكلم باللهجة المصرية.
 لو الطفل سأل عن هويتك بصراحة (زي "إنت مين؟"، "عرِّفني بنفسك"، "إنت وش تسوي هنا؟")،  
 رُد بالنصّ الثابت هذا:  
 "أنا عَنان مؤسس شارع العلوم، وإنت هنا على مَنَصّة Science Street Lab،  

--- a/self_hosted_env/voice_agent/services/agent_service.py
+++ b/self_hosted_env/voice_agent/services/agent_service.py
@@ -83,7 +83,7 @@ class AgentService:
        # If we reach here, it means the response is a normal text string.
        # Text-based fixes would be safe to apply at this point; the apply_fixes step below is currently disabled.
-        response = apply_fixes(response, custom_fixes)
+        #response = apply_fixes(response, custom_fixes)
        # response = self.tashkeel_agent.apply_tashkeel(response)
        print(f"response: {response}")

--- a/self_hosted_env/voice_agent/services/chat_service.py
+++ b/self_hosted_env/voice_agent/services/chat_service.py
@@ -133,8 +133,7 @@ class ChatService:
    def _generate_and_upload_audio(self, text: str, student_id: str) -> dict:
        """ Generates TTS audio for the full text in a single call and uploads it to MinIO. """
        try:
-            segments = self.segmentation_service.segment_text(text)
+            audio_bytes = self.agent_service.tts_service.generate_speech(text)
-            audio_bytes = self.agent_service.tts_service.generate_speech_from_sequence(segments)
            timestamp = int(time.time())
            filename = f"agent_response_{timestamp}_{student_id}.wav"
            minio_file_path = f"audio/{filename}"

--- a/self_hosted_env/voice_agent/services/openai_service.py
+++ b/self_hosted_env/voice_agent/services/openai_service.py
@@ -55,7 +55,7 @@ class OpenAIService(BaseTTSService):
            raise HTTPException(status_code=500, detail=f"Transcription failed: {str(e)}")
    # ------------------- TTS -------------------
-    def generate_speech(self, text: str, language: str = "en") -> bytes:
+    def generate_speech(self, text: str) -> bytes:
        """Generate speech from text using OpenAI TTS. Returns raw audio bytes."""
        if not self.is_available():
            raise HTTPException(status_code=500, detail="OpenAI service not available")
@@ -80,16 +80,7 @@ class OpenAIService(BaseTTSService):
            print(f"Error during OpenAI TTS generation: {e}")
            raise HTTPException(status_code=500, detail=f"OpenAI TTS generation failed: {str(e)}")
-    def generate_speech_from_sequence(self, segments: List[Dict[str, str]]) -> bytes:
-        """
-        Fallback implementation for OpenAI. It combines the text from all
-        segments and makes a single TTS call.
-        """
-        print("OpenAI provider: combining segments for a single TTS call.")
-        full_text = " ".join([segment['text'] for segment in segments])
-        # Just call the existing simple method
-        return self.generate_speech(full_text)
    # ------------------- Embeddings -------------------

--- a/self_hosted_env/voice_agent/services/tts/base_tts_service.py
+++ b/self_hosted_env/voice_agent/services/tts/base_tts_service.py
@@ -13,7 +13,7 @@ class BaseTTSService(ABC):
        pass
    @abstractmethod
-    def generate_speech(self, text: str, language: str = "en") -> bytes:
+    def generate_speech(self, text: str) -> bytes:
        """
        Generate speech from text.
@@ -26,10 +26,3 @@ class BaseTTSService(ABC):
        """
        pass
-    @abstractmethod
-    def generate_speech_from_sequence(self, segments: List[Dict[str, str]]) -> bytes:
-        """
-        Generates a single audio file from a list of language-tagged text segments.
-        This is for handling mixed-language sentences.
-        """
-        pass
\ No newline at end of file
--- a/self_hosted_env/voice_agent/services/tts/custom_tts_service.py
+++ b/self_hosted_env/voice_agent/services/tts/custom_tts_service.py
 import os
 import httpx
 from typing import List, Dict
 from .base_tts_service import BaseTTSService
 class CustomTTSService(BaseTTSService):
@@ -9,61 +8,42 @@ class CustomTTSService(BaseTTSService):
    TTS service implementation that calls our self-hosted, custom TTS FastAPI server.
    """
    def __init__(self):
-        # Read the URL of our FastAPI server from an environment variable
        base_url = os.getenv("CUSTOM_TTS_URL", "http://localhost:5000")
-        self.api_url = f"{base_url}/synthesize"
-        self.sequence_api_url = f"{base_url}/synthesize_sequence" 
+        self.api_url = f"{base_url}/generate_audio"
        self._is_available = bool(base_url)
-        print(f"Custom TTS Service initialized. Base URL: {base_url}")
+        print(f"Custom TTS Service initialized. Target Endpoint: {self.api_url}")
    def is_available(self) -> bool:
        return self._is_available
-    def generate_speech(self, text: str, language: str = "en") -> bytes:
+    def generate_speech(self, text: str) -> bytes:
        """
        Makes an HTTP POST request to the custom TTS FastAPI server.
+        Expected API Payload: {"text": "some text"}
+        Returns: Binary audio data (WAV)
        """
-        if not self.is_available():
+        payload = {
-            raise ConnectionError("Custom TTS service is not configured or available.")
+            "text": text
+        }
        try:
-            # Use httpx for modern, async-friendly requests
+            with httpx.Client(timeout=60.0) as client:
-            with httpx.Client() as client:
+                response = client.post(self.api_url, json=payload)
-                response = client.post(
-                    self.api_url,
-                    json={"text": text, "language": language},
-                    timeout=120.0  # Set a generous timeout for long text
-                )
-                # Raise an exception for bad status codes (4xx or 5xx)
+                # Check if the request was successful (2xx status codes)
                response.raise_for_status()
-                # The raw audio data is in the response content
+                # Return the binary content (the WAV file)
-                audio_bytes = response.content
+                return response.content
-                print(f"Successfully received audio from custom TTS service for language '{language}'.")
-                return audio_bytes
+        except httpx.HTTPStatusError as exc:
+            print(f"Error response {exc.response.status_code} while requesting {exc.request.url!r}.")
-        except httpx.RequestError as e:
+            raise exc
-            print(f"Error calling custom TTS service: {e}")
+        except httpx.RequestError as exc:
-            # Re-raise as a standard ConnectionError
+            print(f"An error occurred while requesting {exc.request.url!r}: {exc}")
-            raise ConnectionError(f"Failed to connect to custom TTS service at {self.api_url}") from e
+            raise exc
+        except Exception as e:
-    def generate_speech_from_sequence(self, segments: List[Dict[str, str]]) -> bytes:
+            print(f"Unexpected error in CustomTTSService: {e}")
-        """Makes a single POST request with the list of segments."""
+            raise e
-        if not self.is_available():
\ No newline at end of file
-            raise ConnectionError("Custom TTS service is not configured.")
-        try:
-            with httpx.Client() as client:
-                print(f"Sending sequence of {len(segments)} segments to custom TTS service.")
-                response = client.post(
-                    self.sequence_api_url,
-                    json={"segments": segments}, # Send the list directly
-                    timeout=300.0  # Longer timeout for sequence processing
-                )
-                response.raise_for_status()
-                audio_bytes = response.content
-                print("Successfully received stitched audio from custom TTS service.")
-                return audio_bytes
-        except httpx.RequestError as e:
-            raise ConnectionError(f"Failed to connect to custom TTS at {self.sequence_api_url}") from e
\ No newline at end of file