Deploys TTS code on AWS

parent 9319d253
from pydantic import BaseModel
class TTSConfig(BaseModel):
    """Holds configuration for a single TTS model."""
    language: str
    model_name_or_path: str
    speaker_wav: str
    config_path: str | None = None
    vocab_path: str | None = None


ARABIC_MODEL_CONFIG = TTSConfig(
    language="ar",
    model_name_or_path="./model/EGTTS-V0.1/",
    speaker_wav="calm_anan_1.wav",
    config_path="./model/EGTTS-V0.1/config.json",
    vocab_path="./model/EGTTS-V0.1/vocab.json",
)

ENGLISH_MODEL_CONFIG = TTSConfig(
    language="en",
    model_name_or_path="tts_models/multilingual/multi-dataset/xtts_v2",
    speaker_wav="calm_anan_1.wav",
)

SUPPORTED_MODELS = {
    "ar": ARABIC_MODEL_CONFIG,
    "en": ENGLISH_MODEL_CONFIG,
}
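
# A tiny hedged illustration (not part of the original commit) of how a request's
# language code resolves to a TTSConfig through SUPPORTED_MODELS; the example
# language value is illustrative only.
if __name__ == "__main__":
    requested_language = "ar"
    cfg = SUPPORTED_MODELS.get(requested_language)
    if cfg is None:
        raise ValueError(f"Unsupported language: {requested_language!r}")
    print(cfg.model_name_or_path, cfg.speaker_wav)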
from huggingface_hub import snapshot_download
snapshot_download(
    repo_id="OmarSamir/EGTTS-V0.1",
    repo_type="model",
    local_dir="./model/EGTTS-V0.1",
    local_dir_use_symlinks=False,
)
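
# A small hedged check (not part of the original commit) that the files the rest
# of this repo expects are present after the download; the filenames come from the
# config paths used elsewhere in this commit.
import os

for name in ("config.json", "vocab.json"):
    path = os.path.join("./model/EGTTS-V0.1", name)
    print(f"{path}: {'found' if os.path.exists(path) else 'MISSING'}")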
from TTS.api import TTS
import torch
# --- 1. Server Startup: Load Model ---
use_gpu = torch.cuda.is_available()
print(f"GPU Available: {use_gpu}")
# Initialize the model directly on the GPU if available
tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=use_gpu)
print("TTS model loaded.")
# --- 2. Server Startup: Pre-calculate Speaker Latents ---
SPEAKER_AUDIO_PATH = "calm_anan_1.wav" # Make sure this path is correct
print(f"Pre-calculating speaker latents from: {SPEAKER_AUDIO_PATH}")
try:
    gpt_cond_latent, speaker_embedding = tts.synthesizer.tts_model.get_conditioning_latents(
        audio_path=[SPEAKER_AUDIO_PATH]
    )
    print("Speaker latents calculated and stored successfully.")
except Exception as e:
    print(f"Error calculating speaker latents: {e}")
    gpt_cond_latent, speaker_embedding = None, None
# --- 3. Inside Your API Endpoint (Handling a Request) ---
if gpt_cond_latent is not None:
text1 = """This is the first sentence. It will use the pre-calculated voice.
This is the second sentence. It will use the same voice as the first one.
This is the third sentence. Again, same voice.
This is the fourth sentence. Still the same voice.
This is the fifth sentence. still the same voice.
This is the sixth sentence. still the same voice.
This is the seventh sentence. still the same voice.
This is the eighth sentence. still the same voice.
This is the ninth sentence. still the same voice.
"""
print(f"Synthesizing: '{text1}'")
# Call the .inference() method on the actual model object
# This bypasses all the simple checks and uses our advanced parameters directly.
out = tts.synthesizer.tts_model.inference( # <--- CHANGE 1: Call .inference()
text=text1,
language="en",
speaker_embedding=speaker_embedding,
gpt_cond_latent=gpt_cond_latent
)
# The .inference() method returns a dictionary, the audio is in the 'wav' key
wav_output_1 = out['wav'] # <--- CHANGE 2: Extract the audio from the dictionary
print("Synthesis complete!")
# For testing, save the output to a file
import soundfile as sf
sf.write("output_test.wav", wav_output_1, 24000)
print("Test audio saved to output_test.wav")
else:
print("Could not proceed with synthesis because speaker latents failed to compute.")
import torch
import soundfile as sf
import io
import warnings
import logging
import numpy as np
from fastapi import FastAPI
from fastapi.responses import StreamingResponse, JSONResponse
from config import SUPPORTED_MODELS
from schemas import SynthesisRequest
from tts_service import TTSModel
from utils import split_text_into_chunks
# --- Suppress Warnings ---
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
logging.getLogger("transformers").setLevel(logging.ERROR)
# --- Application Setup ---
app = FastAPI()
# Application state: the loaded TTS models, keyed by language code
models = {}
# --- Model Loading on Startup ---
@app.on_event("startup")
def load_all_models():
    use_gpu = torch.cuda.is_available()
    print(f"GPU Available: {use_gpu}")
    for lang, config in SUPPORTED_MODELS.items():
        model = TTSModel(config, use_gpu=use_gpu)
        model.load()
        models[lang] = model
# --- API Endpoint ---
@app.post("/synthesize")
async def synthesize(request: SynthesisRequest):
    # Select the correct model from the application state
    model = models.get(request.language)
    if not model or not model.is_loaded:
        return JSONResponse(
            content={"error": f"The model for language '{request.language}' is not available."},
            status_code=503,
        )
    try:
        # Set character limits with a safety buffer
        char_limit = 140 if request.language == "ar" else 220
        text_chunks = split_text_into_chunks(request.text, char_limit)
        print(f"Text split into {len(text_chunks)} chunks.")

        all_audio_chunks = []
        # 300 ms of silence (at 24 kHz) inserted between chunks
        silence_samples = np.zeros(int(24000 * 300 / 1000), dtype=np.float32)
        for i, chunk in enumerate(text_chunks):
            print(f"Synthesizing chunk {i+1}/{len(text_chunks)}: '{chunk}'")
            audio_chunk = model.synthesize_chunk(chunk)
            all_audio_chunks.append(audio_chunk)
            if i < len(text_chunks) - 1:
                all_audio_chunks.append(silence_samples)

        final_audio = np.concatenate(all_audio_chunks)
        buffer = io.BytesIO()
        sf.write(buffer, final_audio, 24000, format='WAV')
        buffer.seek(0)
        return StreamingResponse(buffer, media_type="audio/wav")
    except Exception as e:
        print(f"An error occurred during audio generation: {e}")
        return JSONResponse(content={"error": "Failed to generate audio"}, status_code=500)
import os
import torch
import soundfile as sf
import io
import warnings
import logging
import numpy as np
import nltk
from typing import Literal
from fastapi import FastAPI
from fastapi.responses import StreamingResponse, JSONResponse
from pydantic import BaseModel
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.api import TTS
# --- NEW & IMPROVED: Suppress Harmless Warnings and Logs ---
# 1. Suppress UserWarnings and FutureWarnings from various libraries.
# This handles the torchaudio warnings and many from transformers.
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
# 2. Set the logging level for the 'transformers' library to ERROR.
# This will hide the informational messages (like the one about GPT2InferenceModel)
# without suppressing actual errors. This is the most effective way to clean the log.
logging.getLogger("transformers").setLevel(logging.ERROR)
# --- Application Setup ---
app = FastAPI()
# --- Global Variables for Models and Speaker Latents ---
model_ar = None
gpt_cond_latent_ar = None
speaker_embedding_ar = None
tts_en = None
gpt_cond_latent_en = None
speaker_embedding_en = None
# --- Text Splitting Helper Function (Unchanged) ---
def split_text_into_chunks(text: str, max_chars: int, language: str):
    # Requires the NLTK 'punkt' tokenizer data: nltk.download('punkt')
    sentences = nltk.sent_tokenize(text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) + 1 <= max_chars:
            current_chunk += sentence + " "
        else:
            if current_chunk:
                chunks.append(current_chunk.strip())
            if len(sentence) > max_chars:
                # The sentence itself is too long: split it on word boundaries
                words = sentence.split()
                temp_chunk = ""
                for word in words:
                    if len(temp_chunk) + len(word) + 1 <= max_chars:
                        temp_chunk += word + " "
                    else:
                        chunks.append(temp_chunk.strip())
                        temp_chunk = word + " "
                if temp_chunk:
                    chunks.append(temp_chunk.strip())
                current_chunk = ""  # already flushed above; avoid re-emitting it
            else:
                current_chunk = sentence + " "
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks
# --- Model Loading (Unchanged) ---
@app.on_event("startup")
def load_models():
global model_ar, gpt_cond_latent_ar, speaker_embedding_ar
global tts_en, gpt_cond_latent_en, speaker_embedding_en
use_gpu = torch.cuda.is_available()
print(f"GPU Available: {use_gpu}")
print("Server starting up: Loading the ARABIC TTS model...")
try:
CONFIG_FILE_PATH_AR = './model/EGTTS-V0.1/config.json'
VOCAB_FILE_PATH_AR = './model/EGTTS-V0.1/vocab.json'
MODEL_PATH_AR = './model/EGTTS-V0.1/'
SPEAKER_AUDIO_PATH_AR = 'calm_anan_1.wav'
config_ar = XttsConfig()
config_ar.load_json(CONFIG_FILE_PATH_AR)
model_ar = Xtts.init_from_config(config_ar)
model_ar.load_checkpoint(config_ar, checkpoint_dir=MODEL_PATH_AR, use_deepspeed=False, vocab_path=VOCAB_FILE_PATH_AR)
if use_gpu: model_ar.cuda()
print("Computing ARABIC speaker characteristics...")
gpt_cond_latent_ar, speaker_embedding_ar = model_ar.get_conditioning_latents(audio_path=[SPEAKER_AUDIO_PATH_AR])
print("ARABIC model loaded successfully.")
except Exception as e:
print(f"FATAL ERROR: Could not load the ARABIC model. Error: {e}")
model_ar = None
print("Server starting up: Loading the ENGLISH TTS model...")
try:
tts_en = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=use_gpu)
SPEAKER_AUDIO_PATH_EN = "calm_anan_1.wav"
print("Computing ENGLISH speaker characteristics...")
gpt_cond_latent_en, speaker_embedding_en = tts_en.synthesizer.tts_model.get_conditioning_latents(audio_path=[SPEAKER_AUDIO_PATH_EN])
print("ENGLISH model loaded successfully.")
except Exception as e:
print(f"FATAL ERROR: Could not load the ENGLISH model. Error: {e}")
tts_en = None
# --- Pydantic Model for Request Body (Unchanged) ---
class SynthesisRequest(BaseModel):
    text: str
    language: Literal["ar", "en"]
# --- The Unified API Endpoint (Unchanged) ---
@app.post("/synthesize")
async def synthesize(request: SynthesisRequest):
if request.language == "ar" and model_ar is None:
return JSONResponse(content={"error": "The Arabic model is not loaded."}, status_code=503)
if request.language == "en" and tts_en is None:
return JSONResponse(content={"error": "The English model is not loaded."}, status_code=503)
try:
if request.language == "ar":
char_limit = 140
else:
char_limit = 220
text_chunks = split_text_into_chunks(request.text, char_limit, request.language)
print(f"Text split into {len(text_chunks)} chunks.")
all_audio_chunks = []
silence_duration_ms = 300
silence_samples = np.zeros(int(24000 * silence_duration_ms / 1000), dtype=np.float32)
for i, chunk in enumerate(text_chunks):
print(f"Synthesizing chunk {i+1}/{len(text_chunks)}: '{chunk}'")
out = None
if request.language == "ar":
out = model_ar.inference(chunk, "ar", gpt_cond_latent_ar, speaker_embedding_ar, temperature=0.1)
elif request.language == "en":
out = tts_en.synthesizer.tts_model.inference(text=chunk, language="en", speaker_embedding=speaker_embedding_en, gpt_cond_latent=gpt_cond_latent_en)
all_audio_chunks.append(out["wav"])
if i < len(text_chunks) - 1:
all_audio_chunks.append(silence_samples)
final_audio = np.concatenate(all_audio_chunks)
buffer = io.BytesIO()
sf.write(buffer, final_audio, 24000, format='WAV')
buffer.seek(0)
return StreamingResponse(buffer, media_type="audio/wav")
except Exception as e:
print(f"An error occurred during audio generation: {e}")
return JSONResponse(content={"error": "Failed to generate audio"}, status_code=500)
# ssh -i "SalmaAI.pem" -L 5000:localhost:5000 ubuntu@ec2-18-193-226-85.eu-central-1.compute.amazonaws.com
from pydantic import BaseModel
from typing import Literal
class SynthesisRequest(BaseModel):
    text: str
    language: Literal["ar", "en"]
import os
import torch
import torchaudio
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
CONFIG_FILE_PATH = './model/EGTTS-V0.1/config.json'
VOCAB_FILE_PATH = './model/EGTTS-V0.1/vocab.json'
MODEL_PATH = './model/EGTTS-V0.1/'
print("Loading model...")
config = XttsConfig()
config.load_json(CONFIG_FILE_PATH)
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir=MODEL_PATH, use_deepspeed=False, vocab_path=VOCAB_FILE_PATH)
# move model to GPU if available
if torch.cuda.is_available():
    model.cuda()
    print("Model moved to GPU.")
# compute speaker latents
SPEAKER_AUDIO_PATH = 'calm_anan_1.wav'
print("Computing speaker latents...")
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=[SPEAKER_AUDIO_PATH])
text = """
انا عنان مؤسس شرع العلوم وانا هنا عشان اساعدك تتعلم اي حاجة عايز تتعلمها فالعلوم
انا شرع العلوم موقع تعليمي بيقدم كورسات مجانية في مجالات متعددة زي البرمجة، التصميم، التسويق، وغيرها
كل اللي عليك تعمله تزور الموقع وتختار الكورس اللي يناسبك وتبدأ تتعلم على طول من غير اي تكلفة
تحب تتعلم ايه النهاردة؟
اي اسئلة عندك انا هنا عشان اساعدك
اي استفسار انا تحت امرك
اسال زي ما انت عايز
في اي مجال تحب تتعلم اكتر؟
"""
print("Inference...")
out = model.inference(
    text,
    "ar",
    gpt_cond_latent,
    speaker_embedding,
    temperature=0.1,
)
AUDIO_OUTPUT_PATH = "output_audio.wav"
import soundfile as sf
sf.write(AUDIO_OUTPUT_PATH, out["wav"], 24000)
import torch
from TTS.api import TTS
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from config import TTSConfig
class TTSModel:
    """
    A class that encapsulates a Coqui TTS model, handling loading,
    speaker latent calculation, and inference. This is the core OOP abstraction.
    """
    def __init__(self, config: TTSConfig, use_gpu: bool = False):
        self.config = config
        self.use_gpu = use_gpu
        self.model = None
        self.gpt_cond_latent = None
        self.speaker_embedding = None
        self.is_loaded = False

    def load(self):
        """Loads the model and computes speaker latents."""
        print(f"Loading model for language: '{self.config.language}'...")
        try:
            # Handle the two different ways of loading models
            if self.config.language == "ar":
                # Local, fine-tuned model
                conf = XttsConfig()
                conf.load_json(self.config.config_path)
                self.model = Xtts.init_from_config(conf)
                self.model.load_checkpoint(
                    conf,
                    checkpoint_dir=self.config.model_name_or_path,
                    vocab_path=self.config.vocab_path,
                    use_deepspeed=False,
                )
                if self.use_gpu:
                    self.model.cuda()
            else:
                # High-level API model; keep only the underlying XTTS model object
                api_model = TTS(model_name=self.config.model_name_or_path, gpu=self.use_gpu)
                self.model = api_model.synthesizer.tts_model

            # Calculate speaker latents using the underlying model's method
            self.gpt_cond_latent, self.speaker_embedding = self.model.get_conditioning_latents(
                audio_path=[self.config.speaker_wav]
            )
            self.is_loaded = True
            print(f"Model for '{self.config.language}' loaded successfully.")
        except Exception as e:
            print(f"FATAL ERROR: Could not load model for '{self.config.language}'. Error: {e}")
            self.is_loaded = False

    def synthesize_chunk(self, text: str):
        """Runs inference on a single text chunk."""
        if not self.is_loaded:
            raise RuntimeError(f"Model for language '{self.config.language}' is not loaded.")
        out = self.model.inference(
            text=text,
            language=self.config.language,
            speaker_embedding=self.speaker_embedding,
            gpt_cond_latent=self.gpt_cond_latent,
            temperature=0.1,
        )
        return out["wav"]
import nltk
def split_text_into_chunks(text: str, max_chars: int):
    """
    Splits a long text into smaller chunks based on sentence boundaries,
    ensuring no chunk exceeds the max_chars limit.

    Requires the NLTK 'punkt' tokenizer data (nltk.download('punkt')).
    """
    sentences = nltk.sent_tokenize(text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) + 1 <= max_chars:
            current_chunk += sentence + " "
        else:
            if current_chunk:
                chunks.append(current_chunk.strip())
            if len(sentence) > max_chars:
                # The sentence itself is too long: split it on word boundaries
                words = sentence.split()
                temp_chunk = ""
                for word in words:
                    if len(temp_chunk) + len(word) + 1 <= max_chars:
                        temp_chunk += word + " "
                    else:
                        chunks.append(temp_chunk.strip())
                        temp_chunk = word + " "
                if temp_chunk:
                    chunks.append(temp_chunk.strip())
                current_chunk = ""  # already flushed above; avoid re-emitting it
            else:
                current_chunk = sentence + " "
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks
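
# A quick hedged example of the chunker (not part of the original commit); the
# sample text and character limit are illustrative only. Requires the NLTK
# 'punkt' data: nltk.download('punkt').
if __name__ == "__main__":
    sample = (
        "This is a short sentence. "
        "This second sentence is intentionally much longer so that it has to be "
        "wrapped across more than one chunk when the character limit is small."
    )
    for n, piece in enumerate(split_text_into_chunks(sample, max_chars=60), start=1):
        print(n, len(piece), piece)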