Deploys TTS code on AWS

parent 9319d253
from pydantic import BaseModel
class TTSConfig(BaseModel):
    """Holds configuration for a single TTS model."""
    language: str
    model_name_or_path: str
    speaker_wav: str
    config_path: str | None = None
    vocab_path: str | None = None


ARABIC_MODEL_CONFIG = TTSConfig(
    language="ar",
    model_name_or_path="./model/EGTTS-V0.1/",
    speaker_wav="calm_anan_1.wav",
    config_path="./model/EGTTS-V0.1/config.json",
    vocab_path="./model/EGTTS-V0.1/vocab.json",
)

ENGLISH_MODEL_CONFIG = TTSConfig(
    language="en",
    model_name_or_path="tts_models/multilingual/multi-dataset/xtts_v2",
    speaker_wav="calm_anan_1.wav",
)

SUPPORTED_MODELS = {
    "ar": ARABIC_MODEL_CONFIG,
    "en": ENGLISH_MODEL_CONFIG,
}
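
# A tiny hedged illustration (not part of the original commit) of how a request's
# language code resolves to a TTSConfig through SUPPORTED_MODELS; the example
# language value is illustrative only.
if __name__ == "__main__":
    requested_language = "ar"
    cfg = SUPPORTED_MODELS.get(requested_language)
    if cfg is None:
        raise ValueError(f"Unsupported language: {requested_language!r}")
    print(cfg.model_name_or_path, cfg.speaker_wav)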
from huggingface_hub import snapshot_download
snapshot_download(
    repo_id="OmarSamir/EGTTS-V0.1",
    repo_type="model",
    local_dir="./model/EGTTS-V0.1",
    local_dir_use_symlinks=False,
)
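
# A small hedged check (not part of the original commit) that the files the rest
# of this repo expects are present after the download; the filenames come from the
# config paths used elsewhere in this commit.
import os

for name in ("config.json", "vocab.json"):
    path = os.path.join("./model/EGTTS-V0.1", name)
    print(f"{path}: {'found' if os.path.exists(path) else 'MISSING'}")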
from TTS.api import TTS
import torch
# --- 1. Server Startup: Load Model ---
use_gpu = torch.cuda.is_available()
print(f"GPU Available: {use_gpu}")
# Initialize the model directly on the GPU if available
tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=use_gpu)
print("TTS model loaded.")
# --- 2. Server Startup: Pre-calculate Speaker Latents ---
SPEAKER_AUDIO_PATH = "calm_anan_1.wav" # Make sure this path is correct
print(f"Pre-calculating speaker latents from: {SPEAKER_AUDIO_PATH}")
try:
    gpt_cond_latent, speaker_embedding = tts.synthesizer.tts_model.get_conditioning_latents(
        audio_path=[SPEAKER_AUDIO_PATH]
    )
    print("Speaker latents calculated and stored successfully.")
except Exception as e:
    print(f"Error calculating speaker latents: {e}")
    gpt_cond_latent, speaker_embedding = None, None
# --- 3. Inside Your API Endpoint (Handling a Request) ---
if gpt_cond_latent is not None:
text1 = """This is the first sentence. It will use the pre-calculated voice.
This is the second sentence. It will use the same voice as the first one.
This is the third sentence. Again, same voice.
This is the fourth sentence. Still the same voice.
This is the fifth sentence. still the same voice.
This is the sixth sentence. still the same voice.
This is the seventh sentence. still the same voice.
This is the eighth sentence. still the same voice.
This is the ninth sentence. still the same voice.
"""
print(f"Synthesizing: '{text1}'")
# Call the .inference() method on the actual model object
# This bypasses all the simple checks and uses our advanced parameters directly.
out = tts.synthesizer.tts_model.inference( # <--- CHANGE 1: Call .inference()
text=text1,
language="en",
speaker_embedding=speaker_embedding,
gpt_cond_latent=gpt_cond_latent
)
# The .inference() method returns a dictionary, the audio is in the 'wav' key
wav_output_1 = out['wav'] # <--- CHANGE 2: Extract the audio from the dictionary
print("Synthesis complete!")
# For testing, save the output to a file
import soundfile as sf
sf.write("output_test.wav", wav_output_1, 24000)
print("Test audio saved to output_test.wav")
else:
print("Could not proceed with synthesis because speaker latents failed to compute.")
import torch
import soundfile as sf
import io
import warnings
import logging
import numpy as np
from fastapi import FastAPI
from fastapi.responses import StreamingResponse, JSONResponse
from config import SUPPORTED_MODELS
from schemas import SynthesisRequest
from tts_service import TTSModel
from utils import split_text_into_chunks
# --- Suppress Warnings ---
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
logging.getLogger("transformers").setLevel(logging.ERROR)
# --- Application Setup ---
app = FastAPI()
# Application state: the loaded TTS models, keyed by language code
models = {}
# --- Model Loading on Startup ---
@app.on_event("startup")
def load_all_models():
    use_gpu = torch.cuda.is_available()
    print(f"GPU Available: {use_gpu}")
    for lang, config in SUPPORTED_MODELS.items():
        model = TTSModel(config, use_gpu=use_gpu)
        model.load()
        models[lang] = model
# --- API Endpoint ---
@app.post("/synthesize")
async def synthesize(request: SynthesisRequest):
    # Select the correct model from the application state
    model = models.get(request.language)
    if not model or not model.is_loaded:
        return JSONResponse(
            content={"error": f"The model for language '{request.language}' is not available."},
            status_code=503,
        )
    try:
        # Set character limits with a safety buffer
        char_limit = 140 if request.language == "ar" else 220
        text_chunks = split_text_into_chunks(request.text, char_limit)
        print(f"Text split into {len(text_chunks)} chunks.")

        all_audio_chunks = []
        # 300 ms of silence (at 24 kHz) inserted between chunks
        silence_samples = np.zeros(int(24000 * 300 / 1000), dtype=np.float32)
        for i, chunk in enumerate(text_chunks):
            print(f"Synthesizing chunk {i+1}/{len(text_chunks)}: '{chunk}'")
            audio_chunk = model.synthesize_chunk(chunk)
            all_audio_chunks.append(audio_chunk)
            if i < len(text_chunks) - 1:
                all_audio_chunks.append(silence_samples)

        final_audio = np.concatenate(all_audio_chunks)
        buffer = io.BytesIO()
        sf.write(buffer, final_audio, 24000, format='WAV')
        buffer.seek(0)
        return StreamingResponse(buffer, media_type="audio/wav")
    except Exception as e:
        print(f"An error occurred during audio generation: {e}")
        return JSONResponse(content={"error": "Failed to generate audio"}, status_code=500)
import os
import torch
import soundfile as sf
import io
import warnings
import logging
import numpy as np
import nltk
from typing import Literal
from fastapi import FastAPI
from fastapi.responses import StreamingResponse, JSONResponse
from pydantic import BaseModel
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.api import TTS
# --- NEW & IMPROVED: Suppress Harmless Warnings and Logs ---
# 1. Suppress UserWarnings and FutureWarnings from various libraries.
# This handles the torchaudio warnings and many from transformers.
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
# 2. Set the logging level for the 'transformers' library to ERROR.
# This will hide the informational messages (like the one about GPT2InferenceModel)
# without suppressing actual errors. This is the most effective way to clean the log.
logging.getLogger("transformers").setLevel(logging.ERROR)
# --- Application Setup ---
app = FastAPI()
# --- Global Variables for Models and Speaker Latents ---
model_ar = None
gpt_cond_latent_ar = None
speaker_embedding_ar = None
tts_en = None
gpt_cond_latent_en = None
speaker_embedding_en = None
# --- Text Splitting Helper Function (Unchanged) ---
def split_text_into_chunks(text: str, max_chars: int, language: str):
    # Requires the NLTK 'punkt' tokenizer data: nltk.download('punkt')
    sentences = nltk.sent_tokenize(text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) + 1 <= max_chars:
            current_chunk += sentence + " "
        else:
            if current_chunk:
                chunks.append(current_chunk.strip())
            if len(sentence) > max_chars:
                # The sentence itself is too long: split it on word boundaries
                words = sentence.split()
                temp_chunk = ""
                for word in words:
                    if len(temp_chunk) + len(word) + 1 <= max_chars:
                        temp_chunk += word + " "
                    else:
                        chunks.append(temp_chunk.strip())
                        temp_chunk = word + " "
                if temp_chunk:
                    chunks.append(temp_chunk.strip())
                current_chunk = ""  # already flushed above; avoid re-emitting it
            else:
                current_chunk = sentence + " "
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks
# --- Model Loading (Unchanged) ---
@app.on_event("startup")
def load_models():
global model_ar, gpt_cond_latent_ar, speaker_embedding_ar
global tts_en, gpt_cond_latent_en, speaker_embedding_en
use_gpu = torch.cuda.is_available()
print(f"GPU Available: {use_gpu}")
print("Server starting up: Loading the ARABIC TTS model...")
try:
CONFIG_FILE_PATH_AR = './model/EGTTS-V0.1/config.json'
VOCAB_FILE_PATH_AR = './model/EGTTS-V0.1/vocab.json'
MODEL_PATH_AR = './model/EGTTS-V0.1/'
SPEAKER_AUDIO_PATH_AR = 'calm_anan_1.wav'
config_ar = XttsConfig()
config_ar.load_json(CONFIG_FILE_PATH_AR)
model_ar = Xtts.init_from_config(config_ar)
model_ar.load_checkpoint(config_ar, checkpoint_dir=MODEL_PATH_AR, use_deepspeed=False, vocab_path=VOCAB_FILE_PATH_AR)
if use_gpu: model_ar.cuda()
print("Computing ARABIC speaker characteristics...")
gpt_cond_latent_ar, speaker_embedding_ar = model_ar.get_conditioning_latents(audio_path=[SPEAKER_AUDIO_PATH_AR])
print("ARABIC model loaded successfully.")
except Exception as e:
print(f"FATAL ERROR: Could not load the ARABIC model. Error: {e}")
model_ar = None
print("Server starting up: Loading the ENGLISH TTS model...")
try:
tts_en = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=use_gpu)
SPEAKER_AUDIO_PATH_EN = "calm_anan_1.wav"
print("Computing ENGLISH speaker characteristics...")
gpt_cond_latent_en, speaker_embedding_en = tts_en.synthesizer.tts_model.get_conditioning_latents(audio_path=[SPEAKER_AUDIO_PATH_EN])
print("ENGLISH model loaded successfully.")
except Exception as e:
print(f"FATAL ERROR: Could not load the ENGLISH model. Error: {e}")
tts_en = None
# --- Pydantic Model for Request Body (Unchanged) ---
class SynthesisRequest(BaseModel):
    text: str
    language: Literal["ar", "en"]
# --- The Unified API Endpoint (Unchanged) ---
@app.post("/synthesize")
async def synthesize(request: SynthesisRequest):
if request.language == "ar" and model_ar is None:
return JSONResponse(content={"error": "The Arabic model is not loaded."}, status_code=503)
if request.language == "en" and tts_en is None:
return JSONResponse(content={"error": "The English model is not loaded."}, status_code=503)
try:
if request.language == "ar":
char_limit = 140
else:
char_limit = 220
text_chunks = split_text_into_chunks(request.text, char_limit, request.language)
print(f"Text split into {len(text_chunks)} chunks.")
all_audio_chunks = []
silence_duration_ms = 300
silence_samples = np.zeros(int(24000 * silence_duration_ms / 1000), dtype=np.float32)
for i, chunk in enumerate(text_chunks):
print(f"Synthesizing chunk {i+1}/{len(text_chunks)}: '{chunk}'")
out = None
if request.language == "ar":
out = model_ar.inference(chunk, "ar", gpt_cond_latent_ar, speaker_embedding_ar, temperature=0.1)
elif request.language == "en":
out = tts_en.synthesizer.tts_model.inference(text=chunk, language="en", speaker_embedding=speaker_embedding_en, gpt_cond_latent=gpt_cond_latent_en)
all_audio_chunks.append(out["wav"])
if i < len(text_chunks) - 1:
all_audio_chunks.append(silence_samples)
final_audio = np.concatenate(all_audio_chunks)
buffer = io.BytesIO()
sf.write(buffer, final_audio, 24000, format='WAV')
buffer.seek(0)
return StreamingResponse(buffer, media_type="audio/wav")
except Exception as e:
print(f"An error occurred during audio generation: {e}")
return JSONResponse(content={"error": "Failed to generate audio"}, status_code=500)
# ssh -i "SalmaAI.pem" -L 5000:localhost:5000 ubuntu@ec2-18-193-226-85.eu-central-1.compute.amazonaws.com
from pydantic import BaseModel
from typing import Literal
class SynthesisRequest(BaseModel):
    text: str
    language: Literal["ar", "en"]
import os
import torch
import torchaudio
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
CONFIG_FILE_PATH = './model/EGTTS-V0.1/config.json'
VOCAB_FILE_PATH = './model/EGTTS-V0.1/vocab.json'
MODEL_PATH = './model/EGTTS-V0.1/'
print("Loading model...")
config = XttsConfig()
config.load_json(CONFIG_FILE_PATH)
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir=MODEL_PATH, use_deepspeed=False, vocab_path=VOCAB_FILE_PATH)
# move model to GPU if available
if torch.cuda.is_available():
    model.cuda()
    print("Model moved to GPU.")
# compute speaker latents
SPEAKER_AUDIO_PATH = 'calm_anan_1.wav'
print("Computing speaker latents...")
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=[SPEAKER_AUDIO_PATH])
text = """
انا عنان مؤسس شرع العلوم وانا هنا عشان اساعدك تتعلم اي حاجة عايز تتعلمها فالعلوم
انا شرع العلوم موقع تعليمي بيقدم كورسات مجانية في مجالات متعددة زي البرمجة، التصميم، التسويق، وغيرها
كل اللي عليك تعمله تزور الموقع وتختار الكورس اللي يناسبك وتبدأ تتعلم على طول من غير اي تكلفة
تحب تتعلم ايه النهاردة؟
اي اسئلة عندك انا هنا عشان اساعدك
اي استفسار انا تحت امرك
اسال زي ما انت عايز
في اي مجال تحب تتعلم اكتر؟
"""
print("Inference...")
out = model.inference(
    text,
    "ar",
    gpt_cond_latent,
    speaker_embedding,
    temperature=0.1,
)
AUDIO_OUTPUT_PATH = "output_audio.wav"
import soundfile as sf
sf.write(AUDIO_OUTPUT_PATH, out["wav"], 24000)
import torch
from TTS.api import TTS
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from config import TTSConfig
class TTSModel:
    """
    A class that encapsulates a Coqui TTS model, handling loading,
    speaker latent calculation, and inference. This is the core OOP abstraction.
    """
    def __init__(self, config: TTSConfig, use_gpu: bool = False):
        self.config = config
        self.use_gpu = use_gpu
        self.model = None
        self.gpt_cond_latent = None
        self.speaker_embedding = None
        self.is_loaded = False

    def load(self):
        """Loads the model and computes speaker latents."""
        print(f"Loading model for language: '{self.config.language}'...")
        try:
            # Handle the two different ways of loading models
            if self.config.language == "ar":
                # Local, fine-tuned model
                conf = XttsConfig()
                conf.load_json(self.config.config_path)
                self.model = Xtts.init_from_config(conf)
                self.model.load_checkpoint(
                    conf,
                    checkpoint_dir=self.config.model_name_or_path,
                    vocab_path=self.config.vocab_path,
                    use_deepspeed=False,
                )
                if self.use_gpu:
                    self.model.cuda()
            else:
                # High-level API model; keep only the underlying XTTS model object
                api_model = TTS(model_name=self.config.model_name_or_path, gpu=self.use_gpu)
                self.model = api_model.synthesizer.tts_model

            # Calculate speaker latents using the underlying model's method
            self.gpt_cond_latent, self.speaker_embedding = self.model.get_conditioning_latents(
                audio_path=[self.config.speaker_wav]
            )
            self.is_loaded = True
            print(f"Model for '{self.config.language}' loaded successfully.")
        except Exception as e:
            print(f"FATAL ERROR: Could not load model for '{self.config.language}'. Error: {e}")
            self.is_loaded = False

    def synthesize_chunk(self, text: str):
        """Runs inference on a single text chunk."""
        if not self.is_loaded:
            raise RuntimeError(f"Model for language '{self.config.language}' is not loaded.")
        out = self.model.inference(
            text=text,
            language=self.config.language,
            speaker_embedding=self.speaker_embedding,
            gpt_cond_latent=self.gpt_cond_latent,
            temperature=0.1,
        )
        return out["wav"]
import nltk
def split_text_into_chunks(text: str, max_chars: int):
    """
    Splits a long text into smaller chunks based on sentence boundaries,
    ensuring no chunk exceeds the max_chars limit.

    Requires the NLTK 'punkt' tokenizer data (nltk.download('punkt')).
    """
    sentences = nltk.sent_tokenize(text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) + 1 <= max_chars:
            current_chunk += sentence + " "
        else:
            if current_chunk:
                chunks.append(current_chunk.strip())
            if len(sentence) > max_chars:
                # The sentence itself is too long: split it on word boundaries
                words = sentence.split()
                temp_chunk = ""
                for word in words:
                    if len(temp_chunk) + len(word) + 1 <= max_chars:
                        temp_chunk += word + " "
                    else:
                        chunks.append(temp_chunk.strip())
                        temp_chunk = word + " "
                if temp_chunk:
                    chunks.append(temp_chunk.strip())
                current_chunk = ""  # already flushed above; avoid re-emitting it
            else:
                current_chunk = sentence + " "
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks
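
# A quick hedged example of the chunker (not part of the original commit); the
# sample text and character limit are illustrative only. Requires the NLTK
# 'punkt' data: nltk.download('punkt').
if __name__ == "__main__":
    sample = (
        "This is a short sentence. "
        "This second sentence is intentionally much longer so that it has to be "
        "wrapped across more than one chunk when the character limit is small."
    )
    for n, piece in enumerate(split_text_into_chunks(sample, max_chars=60), start=1):
        print(n, len(piece), piece)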