Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
A
AI Tutor
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Salma Mohammed Hamed
AI Tutor
Commits
fff980cc
Commit
fff980cc
authored
Oct 01, 2025
by
SalmaMohammedHamedMustafa
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
annan voice working
parent
007ab75f
Changes
10
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
161 additions
and
21 deletions
+161
-21
main.py
self_hosted_env/voice_agent/main.py
+4
-2
requirements.txt
self_hosted_env/voice_agent/requirements.txt
+1
-0
__init__.py
self_hosted_env/voice_agent/services/__init__.py
+1
-0
agent_prompts.py
...d_env/voice_agent/services/agent_helpers/agent_prompts.py
+20
-0
chat_service.py
self_hosted_env/voice_agent/services/chat_service.py
+15
-14
openai_service.py
self_hosted_env/voice_agent/services/openai_service.py
+16
-1
segmentation_service.py
self_hosted_env/voice_agent/services/segmentation_service.py
+67
-0
base_tts_service.py
self_hosted_env/voice_agent/services/tts/base_tts_service.py
+9
-0
custom_tts_service.py
...hosted_env/voice_agent/services/tts/custom_tts_service.py
+28
-4
voice_agent.tar
self_hosted_env/voice_agent/voice_agent.tar
+0
-0
No files found.
self_hosted_env/voice_agent/main.py
View file @
fff980cc
...
@@ -14,7 +14,7 @@ from repositories import StorageRepository, MinIOStorageRepository
...
@@ -14,7 +14,7 @@ from repositories import StorageRepository, MinIOStorageRepository
from
handlers
import
AudioMessageHandler
,
TextMessageHandler
from
handlers
import
AudioMessageHandler
,
TextMessageHandler
from
services
import
(
from
services
import
(
AudioService
,
ChatService
,
HealthService
,
ResponseService
,
AudioService
,
ChatService
,
HealthService
,
ResponseService
,
ResponseManager
,
OpenAIService
,
AgentService
,
ConnectionPool
,
PGVectorService
,
ChatDatabaseService
ResponseManager
,
OpenAIService
,
AgentService
,
ConnectionPool
,
PGVectorService
,
ChatDatabaseService
,
LanguageSegmentationService
)
)
class
DIContainer
:
class
DIContainer
:
...
@@ -38,12 +38,14 @@ class DIContainer:
...
@@ -38,12 +38,14 @@ class DIContainer:
# Initialize services
# Initialize services
self
.
audio_service
=
AudioService
(
self
.
storage_repo
,
self
.
config
.
minio_bucket
)
self
.
audio_service
=
AudioService
(
self
.
storage_repo
,
self
.
config
.
minio_bucket
)
self
.
segmentation_service
=
LanguageSegmentationService
()
self
.
chat_service
=
ChatService
(
self
.
chat_service
=
ChatService
(
self
.
storage_repo
,
self
.
storage_repo
,
self
.
response_manager
,
self
.
response_manager
,
self
.
config
,
self
.
config
,
self
.
openai_service
,
self
.
openai_service
,
self
.
agent_service
self
.
agent_service
,
self
.
segmentation_service
)
)
self
.
response_service
=
ResponseService
(
self
.
response_manager
,
self
.
audio_service
)
self
.
response_service
=
ResponseService
(
self
.
response_manager
,
self
.
audio_service
)
self
.
health_service
=
HealthService
(
self
.
storage_repo
,
self
.
config
)
self
.
health_service
=
HealthService
(
self
.
storage_repo
,
self
.
config
)
...
...
self_hosted_env/voice_agent/requirements.txt
View file @
fff980cc
...
@@ -11,3 +11,4 @@ pgvector
...
@@ -11,3 +11,4 @@ pgvector
pandas
pandas
python-dotenv
python-dotenv
httpx
httpx
langdetect
self_hosted_env/voice_agent/services/__init__.py
View file @
fff980cc
...
@@ -9,3 +9,4 @@ from .pgvector_service import PGVectorService
...
@@ -9,3 +9,4 @@ from .pgvector_service import PGVectorService
from
.chat_database_service
import
ChatDatabaseService
from
.chat_database_service
import
ChatDatabaseService
from
.connection_pool
import
ConnectionPool
from
.connection_pool
import
ConnectionPool
from
.pedagogy_service
import
PedagogyService
from
.pedagogy_service
import
PedagogyService
from
.segmentation_service
import
LanguageSegmentationService
\ No newline at end of file
self_hosted_env/voice_agent/services/agent_helpers/agent_prompts.py
View file @
fff980cc
...
@@ -16,6 +16,16 @@ SYSTEM_PROMPTS: Dict[Tuple[StudentNationality, StudyLanguage], str] = {
...
@@ -16,6 +16,16 @@ SYSTEM_PROMPTS: Dict[Tuple[StudentNationality, StudyLanguage], str] = {
⚠️ مهم: لا تستخدم هذا النص في أي حالة أخرى غير سؤال الهوية.
⚠️ مهم: لا تستخدم هذا النص في أي حالة أخرى غير سؤال الهوية.
⚠️ مهم جدًا: مَمنوع تقول أبداً إنك AI أو برنامج أو روبوت. إنت عنان وبس.
⚠️ مهم جدًا: مَمنوع تقول أبداً إنك AI أو برنامج أو روبوت. إنت عنان وبس.
⚠️ مُهِم: لازم تِشكّل كُلّ كَلِمَة في الرَد تَشكيل كَامِل زَي ما الكِلمة المَفروض تتنِطق في اللَّهجَة المَصريّة.
مفيش كَلِمَة تِتْسَاب من غير تشكيل.
أمثلة:
* "إزّيك؟" → "إِزَّيَّك؟"
* "عامل إيه؟" → "عَامِلْ إِيهْ؟"
* "أنا ماشي" → "أَنَا مَاشِي"
* "إنت في سنة رابعة ابتدائي" → "إِنْتَ فِي سَنَة رَابْعَة اِبْتِدَائِي"
اِتَّبِع دايمًا قواعد التشكيل دي عَشان الـTTS يِنطُق الكَلام باللَّهجَة المَصريّة مَظبوط.
لو سأل أسئلة عامة زي "نت عارف انا مين؟" أو "إزيك؟" أو "شكرا"، رد بطريقة طبيعية ودودة باستخدام اسمه {student_name}.
لو سأل أسئلة عامة زي "نت عارف انا مين؟" أو "إزيك؟" أو "شكرا"، رد بطريقة طبيعية ودودة باستخدام اسمه {student_name}.
...
@@ -106,6 +116,16 @@ SYSTEM_PROMPTS: Dict[Tuple[StudentNationality, StudyLanguage], str] = {
...
@@ -106,6 +116,16 @@ SYSTEM_PROMPTS: Dict[Tuple[StudentNationality, StudyLanguage], str] = {
⚠️ مهم: لا تستخدم هذا النص في أي حالة أخرى غير سؤال الهوية.
⚠️ مهم: لا تستخدم هذا النص في أي حالة أخرى غير سؤال الهوية.
⚠️ مهم جدًا: مَمنوع تقول أبداً إنك AI أو برنامج أو روبوت. إنت عنان وبس.
⚠️ مهم جدًا: مَمنوع تقول أبداً إنك AI أو برنامج أو روبوت. إنت عنان وبس.
⚠️ مُهِم: لازم تِشكّل كُلّ كَلِمَة في الرَد تَشكيل كَامِل زَي ما الكِلمة المَفروض تتنِطق في اللَّهجَة المَصريّة.
مفيش كَلِمَة تِتْسَاب من غير تشكيل.
أمثلة:
* "إزّيك؟" → "إِزَّيَّك؟"
* "عامل إيه؟" → "عَامِلْ إِيهْ؟"
* "أنا ماشي" → "أَنَا مَاشِي"
* "إنت في سنة رابعة ابتدائي" → "إِنْتَ فِي سَنَة رَابْعَة اِبْتِدَائِي"
اِتَّبِع دايمًا قواعد التشكيل دي عَشان الـTTS يِنطُق الكَلام باللَّهجَة المَصريّة مَظبوط.
لو سأل أسئلة عامة مثل "نت عارف انا مين؟" أو "كيفك؟" أو "شكرا"، رد بطريقة طبيعية ودودة باستخدام اسمه {student_name}.
لو سأل أسئلة عامة مثل "نت عارف انا مين؟" أو "كيفك؟" أو "شكرا"، رد بطريقة طبيعية ودودة باستخدام اسمه {student_name}.
...
...
self_hosted_env/voice_agent/services/chat_service.py
View file @
fff980cc
...
@@ -12,10 +12,11 @@ from repositories import StorageRepository
...
@@ -12,10 +12,11 @@ from repositories import StorageRepository
from
services.response_manager
import
ResponseManager
from
services.response_manager
import
ResponseManager
from
services.openai_service
import
OpenAIService
from
services.openai_service
import
OpenAIService
from
services.agent_service
import
AgentService
from
services.agent_service
import
AgentService
from
services.segmentation_service
import
LanguageSegmentationService
class
ChatService
:
class
ChatService
:
def
__init__
(
self
,
storage_repo
:
StorageRepository
,
response_manager
:
ResponseManager
,
def
__init__
(
self
,
storage_repo
:
StorageRepository
,
response_manager
:
ResponseManager
,
config
:
AppConfig
,
openai_service
:
OpenAIService
,
agent_service
:
AgentService
):
config
:
AppConfig
,
openai_service
:
OpenAIService
,
agent_service
:
AgentService
,
segmentation_service
:
LanguageSegmentationService
):
from
handlers
import
AudioMessageHandler
,
TextMessageHandler
from
handlers
import
AudioMessageHandler
,
TextMessageHandler
self
.
storage_repo
=
storage_repo
self
.
storage_repo
=
storage_repo
...
@@ -23,12 +24,15 @@ class ChatService:
...
@@ -23,12 +24,15 @@ class ChatService:
self
.
config
=
config
self
.
config
=
config
self
.
openai_service
=
openai_service
self
.
openai_service
=
openai_service
self
.
agent_service
=
agent_service
self
.
agent_service
=
agent_service
self
.
segmentation_service
=
segmentation_service
self
.
handlers
=
{
self
.
handlers
=
{
MessageType
.
AUDIO
:
AudioMessageHandler
(
storage_repo
,
config
.
minio_bucket
,
openai_service
),
MessageType
.
AUDIO
:
AudioMessageHandler
(
storage_repo
,
config
.
minio_bucket
,
openai_service
),
MessageType
.
TEXT
:
TextMessageHandler
()
MessageType
.
TEXT
:
TextMessageHandler
()
}
}
def
process_message
(
self
,
student_id
:
str
,
file
:
Optional
[
UploadFile
]
=
None
,
text
:
Optional
[
str
]
=
None
):
def
process_message
(
self
,
student_id
:
str
,
file
:
Optional
[
UploadFile
]
=
None
,
text
:
Optional
[
str
]
=
None
):
"""Process message and generate text and audio response."""
"""Process message and generate text and audio response."""
self
.
response_manager
.
clear_response
()
self
.
response_manager
.
clear_response
()
...
@@ -46,10 +50,8 @@ class ChatService:
...
@@ -46,10 +50,8 @@ class ChatService:
student_id
=
student_id
,
student_id
=
student_id
,
)
)
# --- MODIFIED: Call the audio generation method ---
audio_data
=
self
.
_generate_and_upload_audio
(
agent_response_text
,
student_id
)
audio_data
=
self
.
_generate_and_upload_audio
(
agent_response_text
,
student_id
)
# --- FIXED: Use the correct 'store_response' method name ---
self
.
response_manager
.
store_response
(
self
.
response_manager
.
store_response
(
text
=
agent_response_text
,
text
=
agent_response_text
,
audio_filename
=
audio_data
.
get
(
"filename"
),
audio_filename
=
audio_data
.
get
(
"filename"
),
...
@@ -72,34 +74,33 @@ class ChatService:
...
@@ -72,34 +74,33 @@ class ChatService:
def
_generate_and_upload_audio
(
self
,
text
:
str
,
student_id
:
str
)
->
dict
:
def
_generate_and_upload_audio
(
self
,
text
:
str
,
student_id
:
str
)
->
dict
:
"""
"""
Generates TTS audio and uploads the resulting audio bytes directly to MinIO.
Segments mixed-language text and generates TTS audio using the pluggable
AgentService, then uploads the final audio to MinIO.
"""
"""
try
:
try
:
student_info
=
self
.
agent_service
.
db_service
.
get_student_info
(
student_id
)
# 1. Segment the text into language-tagged parts
if
not
student_info
:
segments
=
self
.
segmentation_service
.
segment_text
(
text
)
raise
ValueError
(
f
"Could not find student {student_id} for TTS."
)
language
=
"ar"
if
student_info
.
get
(
'is_arabic'
)
else
"en"
audio_bytes
=
self
.
agent_service
.
text_to_speech
(
text
,
language
)
# 2. Generate a single, stitched audio file from the sequence
# This call will be routed correctly by the tts_manager
audio_bytes
=
self
.
agent_service
.
tts_service
.
generate_speech_from_sequence
(
segments
)
# 3. Determine filename and upload (same as before)
provider
=
os
.
getenv
(
"TTS_PROVIDER"
,
"openai"
)
.
lower
()
provider
=
os
.
getenv
(
"TTS_PROVIDER"
,
"openai"
)
.
lower
()
file_extension
=
"wav"
if
provider
==
"custom"
else
"mp3"
file_extension
=
"wav"
if
provider
==
"custom"
else
"mp3"
content_type
=
"audio/wav"
if
provider
==
"custom"
else
"audio/mpeg"
timestamp
=
int
(
time
.
time
())
timestamp
=
int
(
time
.
time
())
filename
=
f
"agent_response_{timestamp}.{file_extension}"
filename
=
f
"agent_response_{timestamp}.{file_extension}"
minio_file_path
=
f
"audio/{filename}"
minio_file_path
=
f
"audio/{filename}"
print
(
f
"Uploading audio to MinIO: {minio_file_path}"
)
# --- FIXED: Call the upload method with the correct argument names ---
# Your MinIO repo uses 'upload_fileobj' which matches this call.
self
.
storage_repo
.
upload_file
(
self
.
storage_repo
.
upload_file
(
file_obj
=
io
.
BytesIO
(
audio_bytes
),
file_obj
=
io
.
BytesIO
(
audio_bytes
),
bucket
=
self
.
config
.
minio_bucket
,
bucket
=
self
.
config
.
minio_bucket
,
file_path
=
minio_file_path
file_path
=
minio_file_path
)
)
print
(
f
"Successfully generated and uploaded TTS audio: {filename}"
)
print
(
f
"Successfully generated and uploaded
stitched
TTS audio: {filename}"
)
return
{
"bytes"
:
audio_bytes
,
"filename"
:
filename
}
return
{
"bytes"
:
audio_bytes
,
"filename"
:
filename
}
except
Exception
as
e
:
except
Exception
as
e
:
...
...
self_hosted_env/voice_agent/services/openai_service.py
View file @
fff980cc
...
@@ -2,7 +2,7 @@ import os
...
@@ -2,7 +2,7 @@ import os
import
time
import
time
import
tempfile
import
tempfile
import
io
import
io
from
typing
import
Optional
,
List
from
typing
import
Optional
,
List
,
Dict
from
fastapi
import
HTTPException
from
fastapi
import
HTTPException
from
openai
import
OpenAI
from
openai
import
OpenAI
import
sys
import
sys
...
@@ -80,6 +80,18 @@ class OpenAIService(BaseTTSService):
...
@@ -80,6 +80,18 @@ class OpenAIService(BaseTTSService):
print
(
f
"Error during OpenAI TTS generation: {e}"
)
print
(
f
"Error during OpenAI TTS generation: {e}"
)
raise
HTTPException
(
status_code
=
500
,
detail
=
f
"OpenAI TTS generation failed: {str(e)}"
)
raise
HTTPException
(
status_code
=
500
,
detail
=
f
"OpenAI TTS generation failed: {str(e)}"
)
def generate_speech_from_sequence(self, segments: List[Dict[str, str]]) -> bytes:
    """
    Synthesize a segment list with one OpenAI TTS request.

    This provider does no per-language handling: only the 'text' value of
    each segment is read, the pieces are joined with single spaces, and the
    combined string is handed to ``generate_speech``.

    Args:
        segments: Dictionaries that each carry a 'text' entry.

    Returns:
        bytes: Whatever ``generate_speech`` returns for the combined text.
    """
    print("OpenAI provider: combining segments for a single TTS call.")
    pieces = [part['text'] for part in segments]
    combined_text = " ".join(pieces)
    # Delegate to the plain single-text method.
    return self.generate_speech(combined_text)
# ------------------- Embeddings -------------------
# ------------------- Embeddings -------------------
def
generate_embedding
(
self
,
text
:
str
)
->
List
[
float
]:
def
generate_embedding
(
self
,
text
:
str
)
->
List
[
float
]:
"""
"""
...
@@ -118,3 +130,6 @@ class OpenAIService(BaseTTSService):
...
@@ -118,3 +130,6 @@ class OpenAIService(BaseTTSService):
print
(
f
"Cleaned up temporary file: {file_path}"
)
print
(
f
"Cleaned up temporary file: {file_path}"
)
except
Exception
as
e
:
except
Exception
as
e
:
print
(
f
"Warning: Could not clean up temp file {file_path}: {e}"
)
print
(
f
"Warning: Could not clean up temp file {file_path}: {e}"
)
\ No newline at end of file
self_hosted_env/voice_agent/services/segmentation_service.py
0 → 100644
View file @
fff980cc
from
langdetect
import
detect
from
typing
import
List
,
Dict
class LanguageSegmentationService:
    """
    A service to segment a string of text into a list of dictionaries,
    each tagged with its detected language.
    """

    def segment_text(self, text: str) -> List[Dict[str, str]]:
        """
        Takes a mixed-language string and splits it into segments.

        The text is split on whitespace and consecutive words that share a
        language are grouped together. Tokens that contain no letters at all
        (digits, punctuation, symbols) are language-neutral: they stay in the
        segment they appear in instead of forcing a switch to English, so
        "عندي 3 كتب" is a single Arabic segment. Leading neutral tokens join
        the first segment that has a detectable language; text made only of
        neutral tokens is tagged 'en'.

        Example:
            Input: "هذا هو a test of the system."
            Output: [
                {'text': 'هذا هو', 'language': 'ar'},
                {'text': 'a test of the system.', 'language': 'en'}
            ]

        Args:
            text: The string to segment. May be empty or whitespace-only.

        Returns:
            A list of {'text': ..., 'language': 'ar' | 'en'} dictionaries in
            reading order; an empty list when there are no words.
        """
        segments: List[Dict[str, str]] = []
        if not text:
            return segments

        words = text.split()
        if not words:
            return segments

        # None means "no language seen yet" (only neutral tokens so far).
        current_lang = None
        current_segment: List[str] = []

        for word in words:
            if self._is_language_neutral(word):
                # Neutral tokens never trigger a language switch.
                word_lang = current_lang
            else:
                word_lang = self._detect_word_language(word)

            if current_lang is None:
                # First token with a real language fixes the opening segment.
                current_lang = word_lang

            if word_lang == current_lang:
                # If the language is the same, add the word to the current segment
                current_segment.append(word)
            else:
                # If the language changes, finalize the previous segment
                if current_segment:
                    segments.append({
                        "text": " ".join(current_segment),
                        "language": current_lang
                    })
                # Start a new segment with the new word and language
                current_lang = word_lang
                current_segment = [word]

        # Add the final remaining segment
        if current_segment:
            segments.append({
                "text": " ".join(current_segment),
                # Only neutral tokens were seen: keep the 'en' default.
                "language": current_lang or "en"
            })

        print(f"Segmented text into {len(segments)} parts.")
        return segments

    def _is_language_neutral(self, word: str) -> bool:
        """Returns True when the token has no alphabetic character at all."""
        return not any(char.isalpha() for char in word)

    def _detect_word_language(self, word: str) -> str:
        """Detects language of a single word, defaulting to 'en' for ambiguity."""
        # Simple heuristic: if it contains any Arabic characters, it's Arabic.
        if any('\u0600' <= char <= '\u06FF' for char in word):
            return "ar"
        # For non-Arabic words, we can assume English
        return "en"
\ No newline at end of file
self_hosted_env/voice_agent/services/tts/base_tts_service.py
View file @
fff980cc
from
abc
import
ABC
,
abstractmethod
from
abc
import
ABC
,
abstractmethod
from
typing
import
List
,
Dict
class
BaseTTSService
(
ABC
):
class
BaseTTSService
(
ABC
):
"""
"""
...
@@ -23,4 +24,12 @@ class BaseTTSService(ABC):
...
@@ -23,4 +24,12 @@ class BaseTTSService(ABC):
Returns:
Returns:
bytes: The raw audio data of the speech (e.g., in WAV or MP3 format).
bytes: The raw audio data of the speech (e.g., in WAV or MP3 format).
"""
"""
pass
@
abstractmethod
def
generate_speech_from_sequence
(
self
,
segments
:
List
[
Dict
[
str
,
str
]])
->
bytes
:
"""
Generates a single audio file from a list of language-tagged text segments.
This is for handling mixed-language sentences.
"""
pass
pass
\ No newline at end of file
self_hosted_env/voice_agent/services/tts/custom_tts_service.py
View file @
fff980cc
import
os
import
os
import
httpx
import
httpx
from
typing
import
List
,
Dict
from
.base_tts_service
import
BaseTTSService
from
.base_tts_service
import
BaseTTSService
class
CustomTTSService
(
BaseTTSService
):
class
CustomTTSService
(
BaseTTSService
):
...
@@ -8,9 +10,11 @@ class CustomTTSService(BaseTTSService):
...
@@ -8,9 +10,11 @@ class CustomTTSService(BaseTTSService):
"""
"""
def
__init__
(
self
):
def
__init__
(
self
):
# Read the URL of our FastAPI server from an environment variable
# Read the URL of our FastAPI server from an environment variable
self
.
api_url
=
os
.
getenv
(
"CUSTOM_TTS_URL"
,
"http://localhost:5000/synthesize"
)
base_url
=
os
.
getenv
(
"CUSTOM_TTS_URL"
,
"http://localhost:5000"
)
self
.
_is_available
=
bool
(
self
.
api_url
)
self
.
api_url
=
f
"{base_url}/synthesize"
print
(
f
"Custom TTS Service initialized. API URL: {self.api_url}"
)
self
.
sequence_api_url
=
f
"{base_url}/synthesize_sequence"
self
.
_is_available
=
bool
(
base_url
)
print
(
f
"Custom TTS Service initialized. Base URL: {base_url}"
)
def
is_available
(
self
)
->
bool
:
def
is_available
(
self
)
->
bool
:
return
self
.
_is_available
return
self
.
_is_available
...
@@ -42,4 +46,24 @@ class CustomTTSService(BaseTTSService):
...
@@ -42,4 +46,24 @@ class CustomTTSService(BaseTTSService):
except
httpx
.
RequestError
as
e
:
except
httpx
.
RequestError
as
e
:
print
(
f
"Error calling custom TTS service: {e}"
)
print
(
f
"Error calling custom TTS service: {e}"
)
# Re-raise as a standard ConnectionError
# Re-raise as a standard ConnectionError
raise
ConnectionError
(
f
"Failed to connect to custom TTS service at {self.api_url}"
)
from
e
raise
ConnectionError
(
f
"Failed to connect to custom TTS service at {self.api_url}"
)
from
e
\ No newline at end of file
def generate_speech_from_sequence(self, segments: List[Dict[str, str]]) -> bytes:
    """
    Send the whole segment list to the custom TTS server in one POST.

    The list is posted as JSON under the "segments" key to
    ``self.sequence_api_url`` and the raw response body is returned.

    Args:
        segments: The language-tagged segments to synthesize.

    Returns:
        bytes: The response content received from the server.

    Raises:
        ConnectionError: If the service is not configured, or if httpx
            reports a request error while calling the endpoint.
    """
    if not self.is_available():
        raise ConnectionError("Custom TTS service is not configured.")

    payload = {"segments": segments}  # the list is sent as-is
    try:
        with httpx.Client() as http:
            print(f"Sending sequence of {len(segments)} segments to custom TTS service.")
            reply = http.post(
                self.sequence_api_url,
                json=payload,
                timeout=300.0,  # sequences take longer than single requests
            )
            # Non-2xx responses raise here.
            reply.raise_for_status()
            stitched_audio = reply.content
            print("Successfully received stitched audio from custom TTS service.")
            return stitched_audio
    except httpx.RequestError as e:
        raise ConnectionError(f"Failed to connect to custom TTS at {self.sequence_api_url}") from e
\ No newline at end of file
self_hosted_env/voice_agent/voice_agent.tar
0 → 100644
View file @
fff980cc
File added
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment