Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
A
AI Tutor
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Salma Mohammed Hamed
AI Tutor
Commits
fff980cc
Commit
fff980cc
authored
Oct 01, 2025
by
SalmaMohammedHamedMustafa
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
annan voice working
parent
007ab75f
Changes
10
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
161 additions
and
21 deletions
+161
-21
main.py
self_hosted_env/voice_agent/main.py
+4
-2
requirements.txt
self_hosted_env/voice_agent/requirements.txt
+1
-0
__init__.py
self_hosted_env/voice_agent/services/__init__.py
+1
-0
agent_prompts.py
...d_env/voice_agent/services/agent_helpers/agent_prompts.py
+20
-0
chat_service.py
self_hosted_env/voice_agent/services/chat_service.py
+15
-14
openai_service.py
self_hosted_env/voice_agent/services/openai_service.py
+16
-1
segmentation_service.py
self_hosted_env/voice_agent/services/segmentation_service.py
+67
-0
base_tts_service.py
self_hosted_env/voice_agent/services/tts/base_tts_service.py
+9
-0
custom_tts_service.py
...hosted_env/voice_agent/services/tts/custom_tts_service.py
+28
-4
voice_agent.tar
self_hosted_env/voice_agent/voice_agent.tar
+0
-0
No files found.
self_hosted_env/voice_agent/main.py
View file @
fff980cc
...
...
@@ -14,7 +14,7 @@ from repositories import StorageRepository, MinIOStorageRepository
from
handlers
import
AudioMessageHandler
,
TextMessageHandler
from
services
import
(
AudioService
,
ChatService
,
HealthService
,
ResponseService
,
ResponseManager
,
OpenAIService
,
AgentService
,
ConnectionPool
,
PGVectorService
,
ChatDatabaseService
ResponseManager
,
OpenAIService
,
AgentService
,
ConnectionPool
,
PGVectorService
,
ChatDatabaseService
,
LanguageSegmentationService
)
class
DIContainer
:
...
...
@@ -38,12 +38,14 @@ class DIContainer:
# Initialize services
self
.
audio_service
=
AudioService
(
self
.
storage_repo
,
self
.
config
.
minio_bucket
)
self
.
segmentation_service
=
LanguageSegmentationService
()
self
.
chat_service
=
ChatService
(
self
.
storage_repo
,
self
.
response_manager
,
self
.
config
,
self
.
openai_service
,
self
.
agent_service
self
.
agent_service
,
self
.
segmentation_service
)
self
.
response_service
=
ResponseService
(
self
.
response_manager
,
self
.
audio_service
)
self
.
health_service
=
HealthService
(
self
.
storage_repo
,
self
.
config
)
...
...
self_hosted_env/voice_agent/requirements.txt
View file @
fff980cc
...
...
@@ -11,3 +11,4 @@ pgvector
pandas
python-dotenv
httpx
langdetect
self_hosted_env/voice_agent/services/__init__.py
View file @
fff980cc
...
...
@@ -9,3 +9,4 @@ from .pgvector_service import PGVectorService
from
.chat_database_service
import
ChatDatabaseService
from
.connection_pool
import
ConnectionPool
from
.pedagogy_service
import
PedagogyService
from
.segmentation_service
import
LanguageSegmentationService
\ No newline at end of file
self_hosted_env/voice_agent/services/agent_helpers/agent_prompts.py
View file @
fff980cc
...
...
@@ -16,6 +16,16 @@ SYSTEM_PROMPTS: Dict[Tuple[StudentNationality, StudyLanguage], str] = {
⚠️ مهم: لا تستخدم هذا النص في أي حالة أخرى غير سؤال الهوية.
⚠️ مهم جدًا: مَمنوع تقول أبداً إنك AI أو برنامج أو روبوت. إنت عنان وبس.
⚠️ مُهِم: لازم تِشكّل كُلّ كَلِمَة في الرَد تَشكيل كَامِل زَي ما الكِلمة المَفروض تتنِطق في اللَّهجَة المَصريّة.
مفيش كَلِمَة تِتْسَاب من غير تشكيل.
أمثلة:
* "إزّيك؟" → "إِزَّيَّك؟"
* "عامل إيه؟" → "عَامِلْ إِيهْ؟"
* "أنا ماشي" → "أَنَا مَاشِي"
* "إنت في سنة رابعة ابتدائي" → "إِنْتَ فِي سَنَة رَابْعَة اِبْتِدَائِي"
اِتَّبِع دايمًا قواعد التشكيل دي عَشان الـTTS يِنطُق الكَلام باللَّهجَة المَصريّة مَظبوط.
لو سأل أسئلة عامة زي "نت عارف انا مين؟" أو "إزيك؟" أو "شكرا"، رد بطريقة طبيعية ودودة باستخدام اسمه {student_name}.
...
...
@@ -106,6 +116,16 @@ SYSTEM_PROMPTS: Dict[Tuple[StudentNationality, StudyLanguage], str] = {
⚠️ مهم: لا تستخدم هذا النص في أي حالة أخرى غير سؤال الهوية.
⚠️ مهم جدًا: مَمنوع تقول أبداً إنك AI أو برنامج أو روبوت. إنت عنان وبس.
⚠️ مُهِم: لازم تِشكّل كُلّ كَلِمَة في الرَد تَشكيل كَامِل زَي ما الكِلمة المَفروض تتنِطق في اللَّهجَة المَصريّة.
مفيش كَلِمَة تِتْسَاب من غير تشكيل.
أمثلة:
* "إزّيك؟" → "إِزَّيَّك؟"
* "عامل إيه؟" → "عَامِلْ إِيهْ؟"
* "أنا ماشي" → "أَنَا مَاشِي"
* "إنت في سنة رابعة ابتدائي" → "إِنْتَ فِي سَنَة رَابْعَة اِبْتِدَائِي"
اِتَّبِع دايمًا قواعد التشكيل دي عَشان الـTTS يِنطُق الكَلام باللَّهجَة المَصريّة مَظبوط.
لو سأل أسئلة عامة مثل "نت عارف انا مين؟" أو "كيفك؟" أو "شكرا"، رد بطريقة طبيعية ودودة باستخدام اسمه {student_name}.
...
...
self_hosted_env/voice_agent/services/chat_service.py
View file @
fff980cc
...
...
@@ -12,10 +12,11 @@ from repositories import StorageRepository
from
services.response_manager
import
ResponseManager
from
services.openai_service
import
OpenAIService
from
services.agent_service
import
AgentService
from
services.segmentation_service
import
LanguageSegmentationService
class
ChatService
:
def
__init__
(
self
,
storage_repo
:
StorageRepository
,
response_manager
:
ResponseManager
,
config
:
AppConfig
,
openai_service
:
OpenAIService
,
agent_service
:
AgentService
):
config
:
AppConfig
,
openai_service
:
OpenAIService
,
agent_service
:
AgentService
,
segmentation_service
:
LanguageSegmentationService
):
from
handlers
import
AudioMessageHandler
,
TextMessageHandler
self
.
storage_repo
=
storage_repo
...
...
@@ -23,12 +24,15 @@ class ChatService:
self
.
config
=
config
self
.
openai_service
=
openai_service
self
.
agent_service
=
agent_service
self
.
segmentation_service
=
segmentation_service
self
.
handlers
=
{
MessageType
.
AUDIO
:
AudioMessageHandler
(
storage_repo
,
config
.
minio_bucket
,
openai_service
),
MessageType
.
TEXT
:
TextMessageHandler
()
}
def
process_message
(
self
,
student_id
:
str
,
file
:
Optional
[
UploadFile
]
=
None
,
text
:
Optional
[
str
]
=
None
):
"""Process message and generate text and audio response."""
self
.
response_manager
.
clear_response
()
...
...
@@ -46,10 +50,8 @@ class ChatService:
student_id
=
student_id
,
)
# --- MODIFIED: Call the audio generation method ---
audio_data
=
self
.
_generate_and_upload_audio
(
agent_response_text
,
student_id
)
# --- FIXED: Use the correct 'store_response' method name ---
self
.
response_manager
.
store_response
(
text
=
agent_response_text
,
audio_filename
=
audio_data
.
get
(
"filename"
),
...
...
@@ -72,34 +74,33 @@ class ChatService:
def
_generate_and_upload_audio
(
self
,
text
:
str
,
student_id
:
str
)
->
dict
:
"""
Generates TTS audio and uploads the resulting audio bytes directly to MinIO.
Segments mixed-language text and generates TTS audio using the pluggable
AgentService, then uploads the final audio to MinIO.
"""
try
:
student_info
=
self
.
agent_service
.
db_service
.
get_student_info
(
student_id
)
if
not
student_info
:
raise
ValueError
(
f
"Could not find student {student_id} for TTS."
)
language
=
"ar"
if
student_info
.
get
(
'is_arabic'
)
else
"en"
# 1. Segment the text into language-tagged parts
segments
=
self
.
segmentation_service
.
segment_text
(
text
)
audio_bytes
=
self
.
agent_service
.
text_to_speech
(
text
,
language
)
# 2. Generate a single, stitched audio file from the sequence
# This call will be routed correctly by the tts_manager
audio_bytes
=
self
.
agent_service
.
tts_service
.
generate_speech_from_sequence
(
segments
)
# 3. Determine filename and upload (same as before)
provider
=
os
.
getenv
(
"TTS_PROVIDER"
,
"openai"
)
.
lower
()
file_extension
=
"wav"
if
provider
==
"custom"
else
"mp3"
content_type
=
"audio/wav"
if
provider
==
"custom"
else
"audio/mpeg"
timestamp
=
int
(
time
.
time
())
filename
=
f
"agent_response_{timestamp}.{file_extension}"
minio_file_path
=
f
"audio/{filename}"
print
(
f
"Uploading audio to MinIO: {minio_file_path}"
)
# --- FIXED: Call the upload method with the correct argument names ---
# Your MinIO repo uses 'upload_fileobj' which matches this call.
self
.
storage_repo
.
upload_file
(
file_obj
=
io
.
BytesIO
(
audio_bytes
),
bucket
=
self
.
config
.
minio_bucket
,
file_path
=
minio_file_path
)
print
(
f
"Successfully generated and uploaded TTS audio: {filename}"
)
print
(
f
"Successfully generated and uploaded
stitched
TTS audio: {filename}"
)
return
{
"bytes"
:
audio_bytes
,
"filename"
:
filename
}
except
Exception
as
e
:
...
...
self_hosted_env/voice_agent/services/openai_service.py
View file @
fff980cc
...
...
@@ -2,7 +2,7 @@ import os
import
time
import
tempfile
import
io
from
typing
import
Optional
,
List
from
typing
import
Optional
,
List
,
Dict
from
fastapi
import
HTTPException
from
openai
import
OpenAI
import
sys
...
...
@@ -80,6 +80,18 @@ class OpenAIService(BaseTTSService):
print
(
f
"Error during OpenAI TTS generation: {e}"
)
raise
HTTPException
(
status_code
=
500
,
detail
=
f
"OpenAI TTS generation failed: {str(e)}"
)
def generate_speech_from_sequence(self, segments: List[Dict[str, str]]) -> bytes:
    """
    Fallback sequence synthesis for the OpenAI provider.

    The 'text' value of every segment is joined with single spaces and the
    combined string is passed to one generate_speech() call; no other key
    of the segment dicts is read here.

    Args:
        segments: Ordered segment dicts, each providing a 'text' entry.

    Returns:
        Whatever generate_speech() returns for the combined text.
    """
    print("OpenAI provider: combining segments for a single TTS call.")
    combined_text = " ".join(part['text'] for part in segments)
    # Delegate to the plain single-text method
    return self.generate_speech(combined_text)
# ------------------- Embeddings -------------------
def
generate_embedding
(
self
,
text
:
str
)
->
List
[
float
]:
"""
...
...
@@ -118,3 +130,6 @@ class OpenAIService(BaseTTSService):
print
(
f
"Cleaned up temporary file: {file_path}"
)
except
Exception
as
e
:
print
(
f
"Warning: Could not clean up temp file {file_path}: {e}"
)
\ No newline at end of file
self_hosted_env/voice_agent/services/segmentation_service.py
0 → 100644
View file @
fff980cc
from
langdetect
import
detect
from
typing
import
List
,
Dict
class LanguageSegmentationService:
    """
    A service to segment a string of text into a list of dictionaries,
    each tagged with its detected language.

    Only two tags are produced: 'ar' for words containing Arabic-script
    characters and 'en' for every other word.
    """

    # Unicode ranges treated as Arabic script. The last range stops at U+FEFC
    # on purpose: U+FEFF is the byte-order mark, and a stray BOM glued to a
    # Latin word must not turn that word into Arabic.
    _ARABIC_RANGES = (
        ('\u0600', '\u06FF'),  # Arabic (letters, diacritics, Arabic-Indic digits)
        ('\u0750', '\u077F'),  # Arabic Supplement
        ('\u08A0', '\u08FF'),  # Arabic Extended-A
        ('\uFB50', '\uFDFF'),  # Arabic Presentation Forms-A
        ('\uFE70', '\uFEFC'),  # Arabic Presentation Forms-B (without the BOM)
    )

    def segment_text(self, text: str) -> List[Dict[str, str]]:
        """
        Takes a mixed-language string and splits it into segments.

        The text is split on whitespace, so runs of whitespace collapse to a
        single space inside each segment. Words that carry no language
        information (no Arabic-script character and no alphabetic character,
        e.g. "5" or "-") never start a new segment: they stay in the run that
        is currently open, so a number inside an Arabic sentence does not
        break it into three pieces. If the text contains only such words, a
        single 'en' segment is returned.

        Example:
        Input: "هذا هو a test of the system."
        Output: [
            {'text': 'هذا هو', 'language': 'ar'},
            {'text': 'a test of the system.', 'language': 'en'}
        ]

        Args:
            text: The string to segment; empty or None yields an empty list.

        Returns:
            Segment dicts in reading order, each with 'text' and 'language'.
        """
        segments: List[Dict[str, str]] = []
        words = text.split() if text else []
        if not words:
            return segments

        # Language of the open run; stays None until the first word that
        # actually carries a language is seen.
        current_lang = None
        current_segment = []

        for word in words:
            if self._is_language_neutral(word):
                # Digits / bare punctuation inherit the surrounding language.
                current_segment.append(word)
                continue

            word_lang = self._detect_word_language(word)
            if current_lang is None:
                # Leading neutral words (if any) join the first real run.
                current_lang = word_lang
            elif word_lang != current_lang:
                # Language changed: close the open run and start a new one.
                segments.append({
                    "text": " ".join(current_segment),
                    "language": current_lang,
                })
                current_lang = word_lang
                current_segment = []
            current_segment.append(word)

        # `words` is non-empty, so there is always a final run to flush.
        segments.append({
            "text": " ".join(current_segment),
            "language": current_lang or "en",
        })

        print(f"Segmented text into {len(segments)} parts.")
        return segments

    def _contains_arabic(self, word: str) -> bool:
        """Return True if any character of *word* lies in an Arabic-script range."""
        return any(
            low <= char <= high
            for char in word
            for low, high in self._ARABIC_RANGES
        )

    def _is_language_neutral(self, word: str) -> bool:
        """Return True for words with neither Arabic-script nor alphabetic characters."""
        return not self._contains_arabic(word) and not any(
            char.isalpha() for char in word
        )

    def _detect_word_language(self, word: str) -> str:
        """Detects language of a single word, defaulting to 'en' for ambiguity."""
        # Simple heuristic: if it contains any Arabic characters, it's Arabic.
        if self._contains_arabic(word):
            return "ar"
        # For non-Arabic words, we can assume English
        return "en"
\ No newline at end of file
self_hosted_env/voice_agent/services/tts/base_tts_service.py
View file @
fff980cc
from
abc
import
ABC
,
abstractmethod
from
typing
import
List
,
Dict
class
BaseTTSService
(
ABC
):
"""
...
...
@@ -23,4 +24,12 @@ class BaseTTSService(ABC):
Returns:
bytes: The raw audio data of the speech (e.g., in WAV or MP3 format).
"""
pass
@abstractmethod
def generate_speech_from_sequence(self, segments: List[Dict[str, str]]) -> bytes:
    """
    Produce one audio payload for an ordered list of language-tagged text
    segments.

    Exists so that mixed-language sentences can be synthesized as a single
    file. This base declaration has no behavior of its own; each concrete
    TTS service supplies the implementation.
    """
\ No newline at end of file
self_hosted_env/voice_agent/services/tts/custom_tts_service.py
View file @
fff980cc
import
os
import
httpx
from
typing
import
List
,
Dict
from
.base_tts_service
import
BaseTTSService
class
CustomTTSService
(
BaseTTSService
):
...
...
@@ -8,9 +10,11 @@ class CustomTTSService(BaseTTSService):
"""
def
__init__
(
self
):
# Read the URL of our FastAPI server from an environment variable
self
.
api_url
=
os
.
getenv
(
"CUSTOM_TTS_URL"
,
"http://localhost:5000/synthesize"
)
self
.
_is_available
=
bool
(
self
.
api_url
)
print
(
f
"Custom TTS Service initialized. API URL: {self.api_url}"
)
base_url
=
os
.
getenv
(
"CUSTOM_TTS_URL"
,
"http://localhost:5000"
)
self
.
api_url
=
f
"{base_url}/synthesize"
self
.
sequence_api_url
=
f
"{base_url}/synthesize_sequence"
self
.
_is_available
=
bool
(
base_url
)
print
(
f
"Custom TTS Service initialized. Base URL: {base_url}"
)
def
is_available
(
self
)
->
bool
:
return
self
.
_is_available
...
...
@@ -42,4 +46,24 @@ class CustomTTSService(BaseTTSService):
except
httpx
.
RequestError
as
e
:
print
(
f
"Error calling custom TTS service: {e}"
)
# Re-raise as a standard ConnectionError
raise
ConnectionError
(
f
"Failed to connect to custom TTS service at {self.api_url}"
)
from
e
\ No newline at end of file
raise
ConnectionError
(
f
"Failed to connect to custom TTS service at {self.api_url}"
)
from
e
def generate_speech_from_sequence(self, segments: List[Dict[str, str]]) -> bytes:
    """
    Send all segments to the custom TTS server in one POST request.

    The segments are posted as JSON under the "segments" key to
    self.sequence_api_url, and the raw response body is returned.

    Args:
        segments: The segment dicts to synthesize, forwarded unchanged.

    Returns:
        The response content bytes from the server.

    Raises:
        ConnectionError: If the service is not available, or if httpx
            reports a request-level failure (httpx.RequestError).
        httpx.HTTPStatusError: Propagated unchanged from raise_for_status()
            on an error status code; it is not caught here.
    """
    if not self.is_available():
        raise ConnectionError("Custom TTS service is not configured.")

    # The list is sent as-is; the server receives it under "segments".
    payload = {"segments": segments}
    try:
        with httpx.Client() as http:
            print(f"Sending sequence of {len(segments)} segments to custom TTS service.")
            reply = http.post(
                self.sequence_api_url,
                json=payload,
                timeout=300.0,  # Longer timeout for sequence processing
            )
            reply.raise_for_status()
            stitched_audio = reply.content
            print("Successfully received stitched audio from custom TTS service.")
    except httpx.RequestError as err:
        raise ConnectionError(
            f"Failed to connect to custom TTS at {self.sequence_api_url}"
        ) from err
    return stitched_audio
\ No newline at end of file
self_hosted_env/voice_agent/voice_agent.tar
0 → 100644
View file @
fff980cc
File added
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment