annan voice working

parent 007ab75f
@@ -14,7 +14,7 @@ from repositories import StorageRepository, MinIOStorageRepository
from handlers import AudioMessageHandler, TextMessageHandler
from services import (
AudioService, ChatService, HealthService, ResponseService,
ResponseManager, OpenAIService, AgentService, ConnectionPool, PGVectorService, ChatDatabaseService
ResponseManager, OpenAIService, AgentService, ConnectionPool, PGVectorService, ChatDatabaseService, LanguageSegmentationService
)
class DIContainer:
@@ -38,12 +38,14 @@ class DIContainer:
# Initialize services
self.audio_service = AudioService(self.storage_repo, self.config.minio_bucket)
self.segmentation_service = LanguageSegmentationService()
self.chat_service = ChatService(
self.storage_repo,
self.response_manager,
self.config,
self.openai_service,
self.agent_service
self.agent_service,
self.segmentation_service
)
self.response_service = ResponseService(self.response_manager, self.audio_service)
self.health_service = HealthService(self.storage_repo, self.config)
......
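For reference, a minimal sketch of the call path this wiring enables, assuming DIContainer needs no constructor arguments and that the HTTP layer (not shown in this diff) reaches chat_service directly; the student id and message below are illustrative:

container = DIContainer()

# The agent's reply is segmented by LanguageSegmentationService and synthesized
# via the provider's generate_speech_from_sequence, then uploaded to MinIO.
container.chat_service.process_message(
    student_id="demo-student",            # illustrative value
    text="هذا هو a test of the system.",  # mixed Arabic/English input
)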
@@ -11,3 +11,4 @@ pgvector
pandas
python-dotenv
httpx
langdetect
@@ -9,3 +9,4 @@ from .pgvector_service import PGVectorService
from .chat_database_service import ChatDatabaseService
from .connection_pool import ConnectionPool
from .pedagogy_service import PedagogyService
from .segmentation_service import LanguageSegmentationService
\ No newline at end of file
@@ -16,6 +16,16 @@ SYSTEM_PROMPTS: Dict[Tuple[StudentNationality, StudyLanguage], str] = {
⚠️ مهم: لا تستخدم هذا النص في أي حالة أخرى غير سؤال الهوية.
⚠️ مهم جدًا: مَمنوع تقول أبداً إنك AI أو برنامج أو روبوت. إنت عنان وبس.
⚠️ مُهِم: لازم تِشكّل كُلّ كَلِمَة في الرَد تَشكيل كَامِل زَي ما الكِلمة المَفروض تتنِطق في اللَّهجَة المَصريّة.
مفيش كَلِمَة تِتْسَاب من غير تشكيل.
أمثلة:
* "إزّيك؟" → "إِزَّيَّك؟"
* "عامل إيه؟" → "عَامِلْ إِيهْ؟"
* "أنا ماشي" → "أَنَا مَاشِي"
* "إنت في سنة رابعة ابتدائي" → "إِنْتَ فِي سَنَة رَابْعَة اِبْتِدَائِي"
اِتَّبِع دايمًا قواعد التشكيل دي عَشان الـTTS يِنطُق الكَلام باللَّهجَة المَصريّة مَظبوط.
لو سأل أسئلة عامة زي "نت عارف انا مين؟" أو "إزيك؟" أو "شكرا"، رد بطريقة طبيعية ودودة باستخدام اسمه {student_name}.
@@ -106,6 +116,16 @@ SYSTEM_PROMPTS: Dict[Tuple[StudentNationality, StudyLanguage], str] = {
⚠️ مهم: لا تستخدم هذا النص في أي حالة أخرى غير سؤال الهوية.
⚠️ مهم جدًا: مَمنوع تقول أبداً إنك AI أو برنامج أو روبوت. إنت عنان وبس.
⚠️ مُهِم: لازم تِشكّل كُلّ كَلِمَة في الرَد تَشكيل كَامِل زَي ما الكِلمة المَفروض تتنِطق في اللَّهجَة المَصريّة.
مفيش كَلِمَة تِتْسَاب من غير تشكيل.
أمثلة:
* "إزّيك؟" → "إِزَّيَّك؟"
* "عامل إيه؟" → "عَامِلْ إِيهْ؟"
* "أنا ماشي" → "أَنَا مَاشِي"
* "إنت في سنة رابعة ابتدائي" → "إِنْتَ فِي سَنَة رَابْعَة اِبْتِدَائِي"
اِتَّبِع دايمًا قواعد التشكيل دي عَشان الـTTS يِنطُق الكَلام باللَّهجَة المَصريّة مَظبوط.
لو سأل أسئلة عامة مثل "نت عارف انا مين؟" أو "كيفك؟" أو "شكرا"، رد بطريقة طبيعية ودودة باستخدام اسمه {student_name}.
......
@@ -12,10 +12,11 @@ from repositories import StorageRepository
from services.response_manager import ResponseManager
from services.openai_service import OpenAIService
from services.agent_service import AgentService
from services.segmentation_service import LanguageSegmentationService
class ChatService:
def __init__(self, storage_repo: StorageRepository, response_manager: ResponseManager,
config: AppConfig, openai_service: OpenAIService, agent_service: AgentService):
config: AppConfig, openai_service: OpenAIService, agent_service: AgentService, segmentation_service: LanguageSegmentationService):
from handlers import AudioMessageHandler, TextMessageHandler
self.storage_repo = storage_repo
@@ -23,12 +24,15 @@ class ChatService:
self.config = config
self.openai_service = openai_service
self.agent_service = agent_service
self.segmentation_service = segmentation_service
self.handlers = {
MessageType.AUDIO: AudioMessageHandler(storage_repo, config.minio_bucket, openai_service),
MessageType.TEXT: TextMessageHandler()
}
def process_message(self, student_id: str, file: Optional[UploadFile] = None, text: Optional[str] = None):
"""Process message and generate text and audio response."""
self.response_manager.clear_response()
@@ -46,10 +50,8 @@ class ChatService:
student_id=student_id,
)
# --- MODIFIED: Call the audio generation method ---
audio_data = self._generate_and_upload_audio(agent_response_text, student_id)
# --- FIXED: Use the correct 'store_response' method name ---
self.response_manager.store_response(
text=agent_response_text,
audio_filename=audio_data.get("filename"),
@@ -72,34 +74,33 @@ class ChatService:
def _generate_and_upload_audio(self, text: str, student_id: str) -> dict:
"""
Generates TTS audio and uploads the resulting audio bytes directly to MinIO.
Segments mixed-language text and generates TTS audio using the pluggable
AgentService, then uploads the final audio to MinIO.
"""
try:
student_info = self.agent_service.db_service.get_student_info(student_id)
if not student_info:
raise ValueError(f"Could not find student {student_id} for TTS.")
language = "ar" if student_info.get('is_arabic') else "en"
# 1. Segment the text into language-tagged parts
segments = self.segmentation_service.segment_text(text)
audio_bytes = self.agent_service.text_to_speech(text, language)
# 2. Generate a single, stitched audio file from the sequence
# This call will be routed correctly by the tts_manager
audio_bytes = self.agent_service.tts_service.generate_speech_from_sequence(segments)
# 3. Determine filename and upload (same as before)
provider = os.getenv("TTS_PROVIDER", "openai").lower()
file_extension = "wav" if provider == "custom" else "mp3"
content_type = "audio/wav" if provider == "custom" else "audio/mpeg"
timestamp = int(time.time())
filename = f"agent_response_{timestamp}.{file_extension}"
minio_file_path = f"audio/{filename}"
print(f"Uploading audio to MinIO: {minio_file_path}")
# --- FIXED: Call the upload method with the correct argument names ---
# Your MinIO repo uses 'upload_fileobj' which matches this call.
self.storage_repo.upload_file(
file_obj=io.BytesIO(audio_bytes),
bucket=self.config.minio_bucket,
file_path=minio_file_path
)
print(f"Successfully generated and uploaded TTS audio: {filename}")
print(f"Successfully generated and uploaded stitched TTS audio: {filename}")
return {"bytes": audio_bytes, "filename": filename}
except Exception as e:
......
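The "routed correctly by the tts_manager" comment above refers to provider selection that is not part of this diff. A minimal sketch of such a factory, assuming the module name, import paths, and no-argument constructors (all illustrative); it mirrors the TTS_PROVIDER check already used for the file extension:

# tts_manager.py -- hypothetical factory; the real routing module is not in this MR
import os

from services.base_tts_service import BaseTTSService    # paths assumed
from services.custom_tts_service import CustomTTSService
from services.openai_service import OpenAIService


def get_tts_service() -> BaseTTSService:
    """Return the TTS provider named by TTS_PROVIDER, defaulting to OpenAI."""
    provider = os.getenv("TTS_PROVIDER", "openai").lower()
    if provider == "custom":
        return CustomTTSService()
    return OpenAIService()  # assumes OpenAIService() reads its API key from the environment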
@@ -2,7 +2,7 @@ import os
import time
import tempfile
import io
from typing import Optional, List
from typing import Optional, List, Dict
from fastapi import HTTPException
from openai import OpenAI
import sys
@@ -80,6 +80,18 @@ class OpenAIService(BaseTTSService):
print(f"Error during OpenAI TTS generation: {e}")
raise HTTPException(status_code=500, detail=f"OpenAI TTS generation failed: {str(e)}")
def generate_speech_from_sequence(self, segments: List[Dict[str, str]]) -> bytes:
"""
Fallback implementation for OpenAI. It combines the text from all
segments and makes a single TTS call.
"""
print("OpenAI provider: combining segments for a single TTS call.")
full_text = " ".join([segment['text'] for segment in segments])
# Just call the existing simple method
return self.generate_speech(full_text)
# ------------------- Embeddings -------------------
def generate_embedding(self, text: str) -> List[float]:
"""
@@ -118,3 +130,6 @@ class OpenAIService(BaseTTSService):
print(f"Cleaned up temporary file: {file_path}")
except Exception as e:
print(f"Warning: Could not clean up temp file {file_path}: {e}")
\ No newline at end of file
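Note that this fallback voices the whole mixed sentence in one call, so the per-segment language tags are effectively ignored for the OpenAI provider. If true per-segment synthesis were wanted client-side, one option would be to synthesize each segment and concatenate the audio; a sketch assuming pydub (which is not in this MR's requirements):

# Hypothetical per-segment alternative to the fallback above; illustration only.
import io
from typing import Dict, List

from pydub import AudioSegment  # assumption: pydub (and ffmpeg) installed separately


def stitch_segments(tts, segments: List[Dict[str, str]]) -> bytes:
    """Synthesize each language-tagged segment and concatenate the MP3 audio.

    `tts` is any service exposing generate_speech(text) -> MP3 bytes.
    """
    combined = AudioSegment.empty()
    for segment in segments:
        mp3_bytes = tts.generate_speech(segment["text"])  # one TTS call per segment
        combined += AudioSegment.from_file(io.BytesIO(mp3_bytes), format="mp3")
    buffer = io.BytesIO()
    combined.export(buffer, format="mp3")
    return buffer.getvalue()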
from langdetect import detect  # note: currently unused here; _detect_word_language below uses a character-range check instead
from typing import List, Dict
class LanguageSegmentationService:
"""
A service to segment a string of text into a list of dictionaries,
each tagged with its detected language.
"""
def segment_text(self, text: str) -> List[Dict[str, str]]:
"""
Takes a mixed-language string and splits it into segments.
Example:
Input: "هذا هو a test of the system."
Output: [
{'text': 'هذا هو', 'language': 'ar'},
{'text': 'a test of the system.', 'language': 'en'}
]
"""
segments = []
if not text:
return segments
words = text.split()
if not words:
return segments
# Start with the language of the first word
current_lang = self._detect_word_language(words[0])
current_segment = []
for word in words:
word_lang = self._detect_word_language(word)
if word_lang == current_lang:
# If the language is the same, add the word to the current segment
current_segment.append(word)
else:
# If the language changes, finalize the previous segment
if current_segment:
segments.append({
"text": " ".join(current_segment),
"language": current_lang
})
# Start a new segment with the new word and language
current_lang = word_lang
current_segment = [word]
# Add the final remaining segment
if current_segment:
segments.append({
"text": " ".join(current_segment),
"language": current_lang
})
print(f"Segmented text into {len(segments)} parts.")
return segments
def _detect_word_language(self, word: str) -> str:
"""Detects language of a single word, defaulting to 'en' for ambiguity."""
# Simple heuristic: if it contains any Arabic characters, it's Arabic.
if any('\u0600' <= char <= '\u06FF' for char in word):
return "ar"
# For non-Arabic words, we can assume English
return "en"
\ No newline at end of file
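A quick usage check of the greedy word-grouping above, matching the docstring example:

from services import LanguageSegmentationService

service = LanguageSegmentationService()
print(service.segment_text("هذا هو a test of the system."))
# -> [{'text': 'هذا هو', 'language': 'ar'},
#     {'text': 'a test of the system.', 'language': 'en'}]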
from abc import ABC, abstractmethod
from typing import List, Dict
class BaseTTSService(ABC):
"""
@@ -23,4 +24,12 @@ class BaseTTSService(ABC):
Returns:
bytes: The raw audio data of the speech (e.g., in WAV or MP3 format).
"""
pass
@abstractmethod
def generate_speech_from_sequence(self, segments: List[Dict[str, str]]) -> bytes:
"""
Generates a single audio file from a list of language-tagged text segments.
This is for handling mixed-language sentences.
"""
pass
\ No newline at end of file
import os
import httpx
from typing import List, Dict
from .base_tts_service import BaseTTSService
class CustomTTSService(BaseTTSService):
@@ -8,9 +10,11 @@ class CustomTTSService(BaseTTSService):
"""
def __init__(self):
# Read the URL of our FastAPI server from an environment variable
self.api_url = os.getenv("CUSTOM_TTS_URL", "http://localhost:5000/synthesize")
self._is_available = bool(self.api_url)
print(f"Custom TTS Service initialized. API URL: {self.api_url}")
base_url = os.getenv("CUSTOM_TTS_URL", "http://localhost:5000")
self.api_url = f"{base_url}/synthesize"
self.sequence_api_url = f"{base_url}/synthesize_sequence"
self._is_available = bool(base_url)
print(f"Custom TTS Service initialized. Base URL: {base_url}")
def is_available(self) -> bool:
return self._is_available
@@ -42,4 +46,24 @@ class CustomTTSService(BaseTTSService):
except httpx.RequestError as e:
print(f"Error calling custom TTS service: {e}")
# Re-raise as a standard ConnectionError
raise ConnectionError(f"Failed to connect to custom TTS service at {self.api_url}") from e
\ No newline at end of file
raise ConnectionError(f"Failed to connect to custom TTS service at {self.api_url}") from e
def generate_speech_from_sequence(self, segments: List[Dict[str, str]]) -> bytes:
"""Makes a single POST request with the list of segments."""
if not self.is_available():
raise ConnectionError("Custom TTS service is not configured.")
try:
with httpx.Client() as client:
print(f"Sending sequence of {len(segments)} segments to custom TTS service.")
response = client.post(
self.sequence_api_url,
json={"segments": segments}, # Send the list directly
timeout=300.0 # Longer timeout for sequence processing
)
response.raise_for_status()
audio_bytes = response.content
print("Successfully received stitched audio from custom TTS service.")
return audio_bytes
except httpx.RequestError as e:
raise ConnectionError(f"Failed to connect to custom TTS at {self.sequence_api_url}") from e
\ No newline at end of file
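The /synthesize_sequence endpoint is assumed to exist on the custom TTS server, which is outside this MR. A minimal sketch of the expected contract (FastAPI; all names illustrative): the request body is {"segments": [{"text": ..., "language": ...}, ...]} and the response is a single stitched audio file.

# Hypothetical server-side handler matching the request CustomTTSService sends above.
# The real TTS server is not in this MR; per-segment synthesis and stitching are stubbed.
from typing import Dict, List

from fastapi import FastAPI, Response
from pydantic import BaseModel

app = FastAPI()


class SequenceRequest(BaseModel):
    segments: List[Dict[str, str]]  # [{"text": "...", "language": "ar"}, ...]


def synthesize_and_stitch(segments: List[Dict[str, str]]) -> bytes:
    """Placeholder: run TTS per segment with the matching voice and join the audio."""
    raise NotImplementedError


@app.post("/synthesize_sequence")
def synthesize_sequence(request: SequenceRequest) -> Response:
    # Return one WAV body, matching the 'custom' -> wav mapping in chat_service.
    wav_bytes = synthesize_and_stitch(request.segments)
    return Response(content=wav_bytes, media_type="audio/wav")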