Deploy TTS code on AWS

from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="OmarSamir/EGTTS-V0.1",
    repo_type="model",
    local_dir="./model/EGTTS-V0.1",
    local_dir_use_symlinks=False,
)
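
# Optional sanity check (a minimal sketch, not from the original scripts): the loading
# code further below expects config.json and vocab.json inside ./model/EGTTS-V0.1, so
# verify the snapshot actually contains them before moving on.
import os

for required in ("config.json", "vocab.json"):
    path = os.path.join("./model/EGTTS-V0.1", required)
    if not os.path.isfile(path):
        raise FileNotFoundError(f"Expected {path} after snapshot_download; re-check the download.")
print("EGTTS-V0.1 snapshot looks complete.")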
from TTS.api import TTS
import torch

# --- 1. Server Startup: Load Model ---
use_gpu = torch.cuda.is_available()
print(f"GPU Available: {use_gpu}")

# Initialize the model directly on the GPU if available
tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=use_gpu)
print("TTS model loaded.")

# --- 2. Server Startup: Pre-calculate Speaker Latents ---
SPEAKER_AUDIO_PATH = "calm_anan_1.wav"  # Make sure this path is correct
print(f"Pre-calculating speaker latents from: {SPEAKER_AUDIO_PATH}")
try:
    gpt_cond_latent, speaker_embedding = tts.synthesizer.tts_model.get_conditioning_latents(
        audio_path=[SPEAKER_AUDIO_PATH]
    )
    print("Speaker latents calculated and stored successfully.")
except Exception as e:
    print(f"Error calculating speaker latents: {e}")
    gpt_cond_latent, speaker_embedding = None, None

# --- 3. Inside Your API Endpoint (Handling a Request) ---
if gpt_cond_latent is not None:
    text1 = """This is the first sentence. It will use the pre-calculated voice.
This is the second sentence. It will use the same voice as the first one.
This is the third sentence. Again, same voice.
This is the fourth sentence. Still the same voice.
This is the fifth sentence. Still the same voice.
This is the sixth sentence. Still the same voice.
This is the seventh sentence. Still the same voice.
This is the eighth sentence. Still the same voice.
This is the ninth sentence. Still the same voice.
"""
    print(f"Synthesizing: '{text1}'")

    # Call .inference() on the underlying model object directly.
    # This bypasses the high-level wrapper's checks and passes our parameters straight through.
    out = tts.synthesizer.tts_model.inference(
        text=text1,
        language="en",
        speaker_embedding=speaker_embedding,
        gpt_cond_latent=gpt_cond_latent,
    )

    # .inference() returns a dictionary; the audio is in the 'wav' key.
    wav_output_1 = out["wav"]
    print("Synthesis complete!")

    # For testing, save the output to a file (XTTS outputs 24 kHz audio).
    import soundfile as sf
    sf.write("output_test.wav", wav_output_1, 24000)
    print("Test audio saved to output_test.wav")
else:
    print("Could not proceed with synthesis because speaker latents failed to compute.")
import os
import torch
import soundfile as sf
import io
import warnings
import logging
import numpy as np
import nltk
from typing import Literal
from fastapi import FastAPI
from fastapi.responses import StreamingResponse, JSONResponse
from pydantic import BaseModel
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.api import TTS
# --- Suppress Harmless Warnings and Logs ---
# 1. Suppress UserWarnings and FutureWarnings from various libraries.
#    This handles the torchaudio warnings and many of the ones from transformers.
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

# 2. Set the logging level for the 'transformers' library to ERROR.
#    This hides informational messages (like the one about GPT2InferenceModel)
#    without suppressing actual errors.
logging.getLogger("transformers").setLevel(logging.ERROR)
# --- Application Setup ---
app = FastAPI()
# --- Global Variables for Models and Speaker Latents ---
model_ar = None
gpt_cond_latent_ar = None
speaker_embedding_ar = None
tts_en = None
gpt_cond_latent_en = None
speaker_embedding_en = None
# --- Text Splitting Helper Function ---
def split_text_into_chunks(text: str, max_chars: int, language: str):
    """Split text into chunks of at most max_chars, preferring sentence boundaries."""
    sentences = nltk.sent_tokenize(text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) + 1 <= max_chars:
            current_chunk += sentence + " "
        else:
            if current_chunk:
                chunks.append(current_chunk.strip())
            if len(sentence) > max_chars:
                # The sentence alone exceeds the limit: fall back to splitting on words.
                words = sentence.split()
                temp_chunk = ""
                for word in words:
                    if len(temp_chunk) + len(word) + 1 <= max_chars:
                        temp_chunk += word + " "
                    else:
                        chunks.append(temp_chunk.strip())
                        temp_chunk = word + " "
                if temp_chunk:
                    chunks.append(temp_chunk.strip())
                # Reset so the already-flushed text is not appended again on the next iteration.
                current_chunk = ""
            else:
                current_chunk = sentence + " "
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks
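
# Illustrative only: a quick check of the chunker above (a minimal sketch, not part of the
# request path). It runs only when this module is executed directly, e.g. `python app.py`.
# The sample text and the 60-character limit are arbitrary; nltk.sent_tokenize needs the
# 'punkt' tokenizer data, which the download call below fetches if it is missing.
if __name__ == "__main__":
    nltk.download("punkt", quiet=True)  # newer NLTK releases may also need "punkt_tab"
    demo_text = (
        "This is the first sentence. Here is a second, slightly longer sentence. "
        "And this final sentence is long enough that it should end up in a chunk of its own."
    )
    for piece in split_text_into_chunks(demo_text, max_chars=60, language="en"):
        print(f"[{len(piece):3d} chars] {piece}")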
# --- Model Loading ---
@app.on_event("startup")
def load_models():
    global model_ar, gpt_cond_latent_ar, speaker_embedding_ar
    global tts_en, gpt_cond_latent_en, speaker_embedding_en

    use_gpu = torch.cuda.is_available()
    print(f"GPU Available: {use_gpu}")

    print("Server starting up: Loading the ARABIC TTS model...")
    try:
        CONFIG_FILE_PATH_AR = './model/EGTTS-V0.1/config.json'
        VOCAB_FILE_PATH_AR = './model/EGTTS-V0.1/vocab.json'
        MODEL_PATH_AR = './model/EGTTS-V0.1/'
        SPEAKER_AUDIO_PATH_AR = 'calm_anan_1.wav'

        config_ar = XttsConfig()
        config_ar.load_json(CONFIG_FILE_PATH_AR)
        model_ar = Xtts.init_from_config(config_ar)
        model_ar.load_checkpoint(config_ar, checkpoint_dir=MODEL_PATH_AR, use_deepspeed=False, vocab_path=VOCAB_FILE_PATH_AR)
        if use_gpu:
            model_ar.cuda()

        print("Computing ARABIC speaker characteristics...")
        gpt_cond_latent_ar, speaker_embedding_ar = model_ar.get_conditioning_latents(audio_path=[SPEAKER_AUDIO_PATH_AR])
        print("ARABIC model loaded successfully.")
    except Exception as e:
        print(f"FATAL ERROR: Could not load the ARABIC model. Error: {e}")
        model_ar = None

    print("Server starting up: Loading the ENGLISH TTS model...")
    try:
        tts_en = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=use_gpu)
        SPEAKER_AUDIO_PATH_EN = "calm_anan_1.wav"
        print("Computing ENGLISH speaker characteristics...")
        gpt_cond_latent_en, speaker_embedding_en = tts_en.synthesizer.tts_model.get_conditioning_latents(audio_path=[SPEAKER_AUDIO_PATH_EN])
        print("ENGLISH model loaded successfully.")
    except Exception as e:
        print(f"FATAL ERROR: Could not load the ENGLISH model. Error: {e}")
        tts_en = None

# --- Pydantic Model for Request Body ---
class SynthesisRequest(BaseModel):
    text: str
    language: Literal["ar", "en"]
# --- The Unified API Endpoint ---
@app.post("/synthesize")
async def synthesize(request: SynthesisRequest):
    if request.language == "ar" and model_ar is None:
        return JSONResponse(content={"error": "The Arabic model is not loaded."}, status_code=503)
    if request.language == "en" and tts_en is None:
        return JSONResponse(content={"error": "The English model is not loaded."}, status_code=503)

    try:
        # Per-language character limit for the text splitter.
        if request.language == "ar":
            char_limit = 140
        else:
            char_limit = 220
        text_chunks = split_text_into_chunks(request.text, char_limit, request.language)
        print(f"Text split into {len(text_chunks)} chunks.")

        all_audio_chunks = []
        silence_duration_ms = 300
        # 300 ms of silence at 24 kHz, inserted between chunks.
        silence_samples = np.zeros(int(24000 * silence_duration_ms / 1000), dtype=np.float32)

        for i, chunk in enumerate(text_chunks):
            print(f"Synthesizing chunk {i+1}/{len(text_chunks)}: '{chunk}'")
            out = None
            if request.language == "ar":
                out = model_ar.inference(chunk, "ar", gpt_cond_latent_ar, speaker_embedding_ar, temperature=0.1)
            elif request.language == "en":
                out = tts_en.synthesizer.tts_model.inference(text=chunk, language="en", speaker_embedding=speaker_embedding_en, gpt_cond_latent=gpt_cond_latent_en)
            all_audio_chunks.append(out["wav"])
            if i < len(text_chunks) - 1:
                all_audio_chunks.append(silence_samples)

        final_audio = np.concatenate(all_audio_chunks)
        buffer = io.BytesIO()
        sf.write(buffer, final_audio, 24000, format='WAV')
        buffer.seek(0)
        return StreamingResponse(buffer, media_type="audio/wav")
    except Exception as e:
        print(f"An error occurred during audio generation: {e}")
        return JSONResponse(content={"error": "Failed to generate audio"}, status_code=500)
# ssh -i "SalmaAI.pem" -L 5000:localhost:5000 ubuntu@ec2-18-193-226-85.eu-central-1.compute.amazonaws.com
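
# A minimal client sketch for the /synthesize endpoint above (illustrative, not part of
# the deployment). It assumes the FastAPI app is served on port 5000 to match the SSH
# tunnel above, e.g. via `uvicorn app:app --host 0.0.0.0 --port 5000` (the module name
# `app` is an assumption). Requires the `requests` package.
import requests

payload = {"text": "Hello from the unified TTS endpoint.", "language": "en"}
response = requests.post("http://localhost:5000/synthesize", json=payload, timeout=600)
response.raise_for_status()

# The endpoint returns a 24 kHz WAV stream; save it for listening.
with open("synthesized.wav", "wb") as f:
    f.write(response.content)
print("Saved synthesized.wav")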
import os
import torch
import torchaudio
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
CONFIG_FILE_PATH = './model/EGTTS-V0.1/config.json'
VOCAB_FILE_PATH = './model/EGTTS-V0.1/vocab.json'
MODEL_PATH = './model/EGTTS-V0.1/'
print("Loading model...")
config = XttsConfig()
config.load_json(CONFIG_FILE_PATH)
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir=MODEL_PATH, use_deepspeed=False, vocab_path=VOCAB_FILE_PATH)
# move model to GPU if available
if torch.cuda.is_available():
    model.cuda()
    print("Model moved to GPU.")
# compute speaker latents
SPEAKER_AUDIO_PATH = 'calm_anan_1.wav'
print("Computing speaker latents...")
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=[SPEAKER_AUDIO_PATH])
text = """
انا عنان مؤسس شرع العلوم وانا هنا عشان اساعدك تتعلم اي حاجة عايز تتعلمها فالعلوم
انا شرع العلوم موقع تعليمي بيقدم كورسات مجانية في مجالات متعددة زي البرمجة، التصميم، التسويق، وغيرها
كل اللي عليك تعمله تزور الموقع وتختار الكورس اللي يناسبك وتبدأ تتعلم على طول من غير اي تكلفة
تحب تتعلم ايه النهاردة؟
اي اسئلة عندك انا هنا عشان اساعدك
اي استفسار انا تحت امرك
اسال زي ما انت عايز
في اي مجال تحب تتعلم اكتر؟
"""
print("Inference...")
out = model.inference(
    text,
    "ar",
    gpt_cond_latent,
    speaker_embedding,
    temperature=0.1,
)
AUDIO_OUTPUT_PATH = "output_audio.wav"
import soundfile as sf
sf.write(AUDIO_OUTPUT_PATH, out["wav"], 24000)
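
# Optional check (illustrative only): confirm the generated file is readable and report its length.
data, sr = sf.read(AUDIO_OUTPUT_PATH)
print(f"Wrote {AUDIO_OUTPUT_PATH}: {len(data) / sr:.1f} s at {sr} Hz")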