Salma Mohammed Hamed / AI Tutor · Commits

Commit e61483ee
Authored Oct 08, 2025 by arwa mohamed

    tashkeel edit

Merge commit (parents: d33b8e1e and 23772a37)

4 changed files with 154 additions and 33 deletions (+154 / -33)
Changed files:

  TTS/machine_code/config.py                +13   -4
  TTS/machine_code/main.py                  +74   -0
  TTS/machine_code/synthesizer_service.py    +1   -1
  TTS/machine_code/tts_service.py           +66  -28
TTS/machine_code/config.py

@@ -12,15 +12,17 @@ class TTSConfig(BaseModel):
 ARABIC_MODEL_CONFIG = TTSConfig(
     language="ar",
     model_name_or_path="./model/EGTTS-V0.1/",
-    speaker_directory="salma",
+    speaker_directory="anan",
     config_path="./model/EGTTS-V0.1/config.json",
     vocab_path="./model/EGTTS-V0.1/vocab.json"
 )
 
 ENGLISH_MODEL_CONFIG = TTSConfig(
     language="en",
-    model_name_or_path="tts_models/multilingual/multi-dataset/xtts_v2",
-    speaker_directory="anan"
+    model_name_or_path="./model_en/",
+    speaker_directory="anan",
+    config_path="./model_en/config.json",
+    vocab_path="./model_en/vocab.json"
 )

@@ -28,3 +30,10 @@ SUPPORTED_MODELS = {
     "ar": ARABIC_MODEL_CONFIG,
     "en": ENGLISH_MODEL_CONFIG,
 }
+
+inference_config = {
+    "temperature": 0.1,
+    "length_penalty": 0.9,
+    "repetition_penalty": 1.2,
+    "enable_text_splitting": True,
+}
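A side note on the new inference_config dict (not part of this commit): since its keys mirror keyword arguments of Xtts.inference, call sites could unpack it with ** instead of indexing each key, which keeps calls short and guarantees no key is forgotten. A minimal sketch, assuming the model and latent attributes from tts_service.py are in scope:

    from config import inference_config

    # Hypothetical call-site simplification: unpack the shared sampling
    # parameters directly into the XTTS inference call.
    out = self.model.inference(
        text=text,
        language=self.config.language,
        speaker_embedding=self.speaker_embedding,
        gpt_cond_latent=self.gpt_cond_latent,
        **inference_config,
    )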
TTS/machine_code/main.py (new file, mode 100644)

import torch
import soundfile as sf
import io
import warnings
import logging
import numpy as np
import nltk
from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse, JSONResponse
from config import SUPPORTED_MODELS
from schemas import SynthesisRequest, SequenceSynthesisRequest
from tts_service import TTSModel
from synthesizer_service import SynthesizerService  # <-- Import the new service

# --- Suppress Warnings ---
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
logging.getLogger("transformers").setLevel(logging.ERROR)

# --- Application Setup ---
app = FastAPI()

# This single object will now hold our application's state and logic
synthesizer = None

# --- Model Loading on Startup ---
@app.on_event("startup")
def startup_event():
    global synthesizer
    nltk.download('punkt')
    nltk.download('punkt_tab')
    use_gpu = torch.cuda.is_available()
    print(f"GPU Available: {use_gpu}")
    models = {}
    batch_size = 12
    for lang, config in SUPPORTED_MODELS.items():
        model = TTSModel(config, use_gpu=use_gpu, batch_size=batch_size)
        model.load()
        models[lang] = model
    # Create a single instance of our synthesizer service
    synthesizer = SynthesizerService(models)
    print("Synthesizer service is ready.")

# --- Helper function to create the audio response ---
def create_audio_response(audio_array: np.ndarray) -> StreamingResponse:
    buffer = io.BytesIO()
    sf.write(buffer, audio_array, 24000, format='WAV')
    buffer.seek(0)
    return StreamingResponse(buffer, media_type="audio/wav")

# --- API Endpoints (Now clean and thin) ---
@app.post("/synthesize")
async def synthesize(request: SynthesisRequest):
    try:
        final_audio = synthesizer.synthesize_simple(request.text, request.language)
        return create_audio_response(final_audio)
    except Exception as e:
        print(f"An error occurred during simple synthesis: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to generate audio: {str(e)}")

@app.post("/synthesize_sequence")
async def synthesize_sequence(request: SequenceSynthesisRequest):
    try:
        final_audio = synthesizer.synthesize_sequence(request.segments)
        return create_audio_response(final_audio)
    except Exception as e:
        print(f"An error occurred during sequence synthesis: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to generate audio sequence: {str(e)}")
\ No newline at end of file
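For reference, a minimal client for the new endpoints might look like the sketch below. The exact request fields live in schemas.py, which is not part of this diff; "text" and "language" are inferred from how the endpoint reads request.text and request.language, and the host and port are assumptions.

    import requests

    # Hypothetical client: POST text to /synthesize and save the
    # streamed 24 kHz WAV response to disk.
    resp = requests.post(
        "http://localhost:8000/synthesize",
        json={"text": "Hello from the AI Tutor.", "language": "en"},
    )
    resp.raise_for_status()
    with open("output.wav", "wb") as f:
        f.write(resp.content)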
TTS/machine_code/synthesizer_service.py

@@ -46,7 +46,7 @@ class SynthesizerService:
         if lang not in self.models or not self.models[lang].is_loaded:
             raise ValueError(f"Model for language '{lang}' is not available.")
 
-        char_limit = 140 if lang == "ar" else 200
+        char_limit = 1000 if lang == "ar" else 1000
 
         segment.text = translate_equations_in_text(segment.text, lang)
         segment.text = sanitize_text(segment.text)
         print(f"Segment {seg_idx} ({lang}) text: {segment.text}")
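The raised char_limit (now 1000 for both languages, up from 140 and 200) works together with enable_text_splitting=True in inference_config: segments are passed through largely intact and the model splits them itself. If sentence-aware pre-chunking were ever needed again, a helper built on the punkt data that main.py downloads at startup could look like this (chunk_text is hypothetical, not part of the codebase):

    import nltk

    def chunk_text(text: str, char_limit: int = 1000) -> list[str]:
        # Pack whole sentences into chunks of at most char_limit
        # characters, so no sentence is cut mid-way.
        chunks, current = [], ""
        for sentence in nltk.sent_tokenize(text):
            if current and len(current) + len(sentence) + 1 > char_limit:
                chunks.append(current)
                current = sentence
            else:
                current = f"{current} {sentence}".strip()
        if current:
            chunks.append(current)
        return chunks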
TTS/machine_code/tts_service.py

@@ -5,7 +5,7 @@ from TTS.tts.models.xtts import Xtts
 from typing import List
 import numpy as np
 import os
-from config import TTSConfig
+from config import TTSConfig, inference_config
 
 class TTSModel:
     """

@@ -35,8 +35,7 @@ class TTSModel:
         print(f"Found {len(speaker_wav_paths)} reference audio files for voice cloning.")
 
-        # Load the base model (logic is the same)
-        if self.config.language == "ar":
+        # Load the base model
         conf = XttsConfig()
         conf.load_json(self.config.config_path)
         self.model = Xtts.init_from_config(conf)

@@ -48,9 +47,7 @@
         )
         if self.use_gpu:
             self.model.cuda()
-        else:
-            api_model = TTS(model_name=self.config.model_name_or_path, gpu=self.use_gpu)
-            self.model = api_model.synthesizer.tts_model
 
         print(f"Computing speaker characteristics from {len(speaker_wav_paths)} files...")
         self.gpt_cond_latent, self.speaker_embedding = self.model.get_conditioning_latents(

@@ -65,30 +62,71 @@
     def synthesize_chunk(self, text: str):
         if not self.is_loaded:
             raise RuntimeError(f"Model for '{self.config.language}' is not loaded.")
-        out = self.model.inference(
-            text=text,
-            language=self.config.language,
-            speaker_embedding=self.speaker_embedding,
-            gpt_cond_latent=self.gpt_cond_latent,
-            temperature=0.1
-        )
+        out = self.model.inference(
+            text=text,
+            language=self.config.language,
+            speaker_embedding=self.speaker_embedding,
+            gpt_cond_latent=self.gpt_cond_latent,
+            temperature=inference_config["temperature"],
+            length_penalty=inference_config["length_penalty"],
+            repetition_penalty=inference_config["repetition_penalty"],
+            enable_text_splitting=inference_config["enable_text_splitting"]
+        )
         return out["wav"]
 
     def synthesize_batch(self, texts: List[str]) -> List[np.ndarray]:
         if not self.is_loaded:
             raise RuntimeError(f"Model for '{self.config.language}' is not loaded.")
         if not texts:
             return []
+        all_audio = []
+        texts = [t if isinstance(t, str) else " ".join(t) for t in texts]
         for i in range(0, len(texts), self.batch_size):
             batch_texts = texts[i:i + self.batch_size]
             print(f"Processing batch {i//self.batch_size + 1}: {len(batch_texts)} chunks")
-            batch_audio = []
             try:
                 with torch.no_grad():
-                    for text in batch_texts:
-                        out = self.model.inference(
-                            text=text,
-                            language=self.config.language,
-                            speaker_embedding=self.speaker_embedding,
-                            gpt_cond_latent=self.gpt_cond_latent,
-                            temperature=0.1
-                        )
-                        batch_audio.append(out["wav"])
+                    outputs = [
+                        self.model.inference(
+                            text=text.strip(),
+                            language=self.config.language,
+                            speaker_embedding=self.speaker_embedding,
+                            gpt_cond_latent=self.gpt_cond_latent,
+                            temperature=inference_config["temperature"],
+                            length_penalty=inference_config["length_penalty"],
+                            repetition_penalty=inference_config["repetition_penalty"],
+                            enable_text_splitting=inference_config["enable_text_splitting"]
+                        )
+                        for text in batch_texts
+                    ]
+                    batch_audio = [out["wav"] for out in outputs]
                 all_audio.extend(batch_audio)
                 if self.use_gpu:
                     torch.cuda.empty_cache()
             except RuntimeError as e:
                 if "out of memory" in str(e):
                     print(f"GPU OOM error. Falling back to sequential processing for this batch.")
                     for text in batch_texts:
-                        out = self.model.inference(
-                            text=text,
-                            language=self.config.language,
-                            speaker_embedding=self.speaker_embedding,
-                            gpt_cond_latent=self.gpt_cond_latent,
-                            temperature=0.1
-                        )
+                        out = self.model.inference(
+                            text=text.strip(),
+                            language=self.config.language,
+                            speaker_embedding=self.speaker_embedding,
+                            gpt_cond_latent=self.gpt_cond_latent,
+                            temperature=inference_config["temperature"],
+                            length_penalty=inference_config["length_penalty"],
+                            repetition_penalty=inference_config["repetition_penalty"],
+                            enable_text_splitting=inference_config["enable_text_splitting"]
+                        )
                         all_audio.append(out["wav"])
                         if self.use_gpu:
                             torch.cuda.empty_cache()
                 else:
                     raise e
         return all_audio
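Putting the pieces together, the batched path added here can be exercised on its own; a minimal sketch, assuming the model files referenced in config.py are present on disk:

    import torch
    from config import ARABIC_MODEL_CONFIG
    from tts_service import TTSModel

    # Load one model and synthesize a small batch; the sampling
    # parameters come from config.py's inference_config.
    model = TTSModel(
        ARABIC_MODEL_CONFIG,
        use_gpu=torch.cuda.is_available(),
        batch_size=12,
    )
    model.load()

    wavs = model.synthesize_batch([
        "First test sentence.",
        "Second test sentence.",
    ])
    print(f"Synthesized {len(wavs)} waveforms")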