Deploy TTS code on AWS

from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="OmarSamir/EGTTS-V0.1",
    repo_type="model",
    local_dir="./model/EGTTS-V0.1",
    local_dir_use_symlinks=False,
)
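
# Optional sanity check (a minimal sketch, not from the original scripts): the loading
# code further below expects config.json and vocab.json inside ./model/EGTTS-V0.1, so
# verify the snapshot actually contains them before moving on.
import os

for required in ("config.json", "vocab.json"):
    path = os.path.join("./model/EGTTS-V0.1", required)
    if not os.path.isfile(path):
        raise FileNotFoundError(f"Expected {path} after snapshot_download; re-check the download.")
print("EGTTS-V0.1 snapshot looks complete.")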
from TTS.api import TTS
import torch

# --- 1. Server Startup: Load Model ---
use_gpu = torch.cuda.is_available()
print(f"GPU Available: {use_gpu}")

# Initialize the model directly on the GPU if available
tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=use_gpu)
print("TTS model loaded.")

# --- 2. Server Startup: Pre-calculate Speaker Latents ---
SPEAKER_AUDIO_PATH = "calm_anan_1.wav"  # Make sure this path is correct
print(f"Pre-calculating speaker latents from: {SPEAKER_AUDIO_PATH}")
try:
    gpt_cond_latent, speaker_embedding = tts.synthesizer.tts_model.get_conditioning_latents(
        audio_path=[SPEAKER_AUDIO_PATH]
    )
    print("Speaker latents calculated and stored successfully.")
except Exception as e:
    print(f"Error calculating speaker latents: {e}")
    gpt_cond_latent, speaker_embedding = None, None

# --- 3. Inside Your API Endpoint (Handling a Request) ---
if gpt_cond_latent is not None:
    text1 = """This is the first sentence. It will use the pre-calculated voice.
This is the second sentence. It will use the same voice as the first one.
This is the third sentence. Again, same voice.
This is the fourth sentence. Still the same voice.
This is the fifth sentence. Still the same voice.
This is the sixth sentence. Still the same voice.
This is the seventh sentence. Still the same voice.
This is the eighth sentence. Still the same voice.
This is the ninth sentence. Still the same voice.
"""
    print(f"Synthesizing: '{text1}'")

    # Call .inference() on the underlying model object directly.
    # This bypasses the high-level wrapper's checks and passes our parameters straight through.
    out = tts.synthesizer.tts_model.inference(
        text=text1,
        language="en",
        speaker_embedding=speaker_embedding,
        gpt_cond_latent=gpt_cond_latent,
    )

    # .inference() returns a dictionary; the audio is in the 'wav' key.
    wav_output_1 = out["wav"]
    print("Synthesis complete!")

    # For testing, save the output to a file (XTTS outputs 24 kHz audio).
    import soundfile as sf
    sf.write("output_test.wav", wav_output_1, 24000)
    print("Test audio saved to output_test.wav")
else:
    print("Could not proceed with synthesis because speaker latents failed to compute.")
import os
import torch
import soundfile as sf
import io
import warnings
import logging
import numpy as np
import nltk
from typing import Literal
from fastapi import FastAPI
from fastapi.responses import StreamingResponse, JSONResponse
from pydantic import BaseModel
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.api import TTS
# --- Suppress Harmless Warnings and Logs ---
# 1. Suppress UserWarnings and FutureWarnings from various libraries.
#    This handles the torchaudio warnings and many of the ones from transformers.
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

# 2. Set the logging level for the 'transformers' library to ERROR.
#    This hides informational messages (like the one about GPT2InferenceModel)
#    without suppressing actual errors.
logging.getLogger("transformers").setLevel(logging.ERROR)
# --- Application Setup ---
app = FastAPI()
# --- Global Variables for Models and Speaker Latents ---
model_ar = None
gpt_cond_latent_ar = None
speaker_embedding_ar = None
tts_en = None
gpt_cond_latent_en = None
speaker_embedding_en = None
# --- Text Splitting Helper Function ---
def split_text_into_chunks(text: str, max_chars: int, language: str):
    """Split text into chunks of at most max_chars, preferring sentence boundaries."""
    sentences = nltk.sent_tokenize(text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) + 1 <= max_chars:
            current_chunk += sentence + " "
        else:
            if current_chunk:
                chunks.append(current_chunk.strip())
            if len(sentence) > max_chars:
                # The sentence alone exceeds the limit: fall back to splitting on words.
                words = sentence.split()
                temp_chunk = ""
                for word in words:
                    if len(temp_chunk) + len(word) + 1 <= max_chars:
                        temp_chunk += word + " "
                    else:
                        chunks.append(temp_chunk.strip())
                        temp_chunk = word + " "
                if temp_chunk:
                    chunks.append(temp_chunk.strip())
                # Reset so the already-flushed text is not appended again on the next iteration.
                current_chunk = ""
            else:
                current_chunk = sentence + " "
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks
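
# Illustrative only: a quick check of the chunker above (a minimal sketch, not part of the
# request path). It runs only when this module is executed directly, e.g. `python app.py`.
# The sample text and the 60-character limit are arbitrary; nltk.sent_tokenize needs the
# 'punkt' tokenizer data, which the download call below fetches if it is missing.
if __name__ == "__main__":
    nltk.download("punkt", quiet=True)  # newer NLTK releases may also need "punkt_tab"
    demo_text = (
        "This is the first sentence. Here is a second, slightly longer sentence. "
        "And this final sentence is long enough that it should end up in a chunk of its own."
    )
    for piece in split_text_into_chunks(demo_text, max_chars=60, language="en"):
        print(f"[{len(piece):3d} chars] {piece}")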
# --- Model Loading ---
@app.on_event("startup")
def load_models():
    global model_ar, gpt_cond_latent_ar, speaker_embedding_ar
    global tts_en, gpt_cond_latent_en, speaker_embedding_en

    use_gpu = torch.cuda.is_available()
    print(f"GPU Available: {use_gpu}")

    print("Server starting up: Loading the ARABIC TTS model...")
    try:
        CONFIG_FILE_PATH_AR = './model/EGTTS-V0.1/config.json'
        VOCAB_FILE_PATH_AR = './model/EGTTS-V0.1/vocab.json'
        MODEL_PATH_AR = './model/EGTTS-V0.1/'
        SPEAKER_AUDIO_PATH_AR = 'calm_anan_1.wav'

        config_ar = XttsConfig()
        config_ar.load_json(CONFIG_FILE_PATH_AR)
        model_ar = Xtts.init_from_config(config_ar)
        model_ar.load_checkpoint(config_ar, checkpoint_dir=MODEL_PATH_AR, use_deepspeed=False, vocab_path=VOCAB_FILE_PATH_AR)
        if use_gpu:
            model_ar.cuda()

        print("Computing ARABIC speaker characteristics...")
        gpt_cond_latent_ar, speaker_embedding_ar = model_ar.get_conditioning_latents(audio_path=[SPEAKER_AUDIO_PATH_AR])
        print("ARABIC model loaded successfully.")
    except Exception as e:
        print(f"FATAL ERROR: Could not load the ARABIC model. Error: {e}")
        model_ar = None

    print("Server starting up: Loading the ENGLISH TTS model...")
    try:
        tts_en = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=use_gpu)
        SPEAKER_AUDIO_PATH_EN = "calm_anan_1.wav"
        print("Computing ENGLISH speaker characteristics...")
        gpt_cond_latent_en, speaker_embedding_en = tts_en.synthesizer.tts_model.get_conditioning_latents(audio_path=[SPEAKER_AUDIO_PATH_EN])
        print("ENGLISH model loaded successfully.")
    except Exception as e:
        print(f"FATAL ERROR: Could not load the ENGLISH model. Error: {e}")
        tts_en = None

# --- Pydantic Model for Request Body ---
class SynthesisRequest(BaseModel):
    text: str
    language: Literal["ar", "en"]
# --- The Unified API Endpoint ---
@app.post("/synthesize")
async def synthesize(request: SynthesisRequest):
    if request.language == "ar" and model_ar is None:
        return JSONResponse(content={"error": "The Arabic model is not loaded."}, status_code=503)
    if request.language == "en" and tts_en is None:
        return JSONResponse(content={"error": "The English model is not loaded."}, status_code=503)

    try:
        # Per-language character limit for the text splitter.
        if request.language == "ar":
            char_limit = 140
        else:
            char_limit = 220
        text_chunks = split_text_into_chunks(request.text, char_limit, request.language)
        print(f"Text split into {len(text_chunks)} chunks.")

        all_audio_chunks = []
        silence_duration_ms = 300
        # 300 ms of silence at 24 kHz, inserted between chunks.
        silence_samples = np.zeros(int(24000 * silence_duration_ms / 1000), dtype=np.float32)

        for i, chunk in enumerate(text_chunks):
            print(f"Synthesizing chunk {i+1}/{len(text_chunks)}: '{chunk}'")
            out = None
            if request.language == "ar":
                out = model_ar.inference(chunk, "ar", gpt_cond_latent_ar, speaker_embedding_ar, temperature=0.1)
            elif request.language == "en":
                out = tts_en.synthesizer.tts_model.inference(text=chunk, language="en", speaker_embedding=speaker_embedding_en, gpt_cond_latent=gpt_cond_latent_en)
            all_audio_chunks.append(out["wav"])
            if i < len(text_chunks) - 1:
                all_audio_chunks.append(silence_samples)

        final_audio = np.concatenate(all_audio_chunks)
        buffer = io.BytesIO()
        sf.write(buffer, final_audio, 24000, format='WAV')
        buffer.seek(0)
        return StreamingResponse(buffer, media_type="audio/wav")
    except Exception as e:
        print(f"An error occurred during audio generation: {e}")
        return JSONResponse(content={"error": "Failed to generate audio"}, status_code=500)
# ssh -i "SalmaAI.pem" -L 5000:localhost:5000 ubuntu@ec2-18-193-226-85.eu-central-1.compute.amazonaws.com
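
# A minimal client sketch for the /synthesize endpoint above (illustrative, not part of
# the deployment). It assumes the FastAPI app is served on port 5000 to match the SSH
# tunnel above, e.g. via `uvicorn app:app --host 0.0.0.0 --port 5000` (the module name
# `app` is an assumption). Requires the `requests` package.
import requests

payload = {"text": "Hello from the unified TTS endpoint.", "language": "en"}
response = requests.post("http://localhost:5000/synthesize", json=payload, timeout=600)
response.raise_for_status()

# The endpoint returns a 24 kHz WAV stream; save it for listening.
with open("synthesized.wav", "wb") as f:
    f.write(response.content)
print("Saved synthesized.wav")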
import os
import torch
import torchaudio
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
CONFIG_FILE_PATH = './model/EGTTS-V0.1/config.json'
VOCAB_FILE_PATH = './model/EGTTS-V0.1/vocab.json'
MODEL_PATH = './model/EGTTS-V0.1/'
print("Loading model...")
config = XttsConfig()
config.load_json(CONFIG_FILE_PATH)
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir=MODEL_PATH, use_deepspeed=False, vocab_path=VOCAB_FILE_PATH)
# move model to GPU if available
if torch.cuda.is_available():
    model.cuda()
    print("Model moved to GPU.")
# compute speaker latents
SPEAKER_AUDIO_PATH = 'calm_anan_1.wav'
print("Computing speaker latents...")
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=[SPEAKER_AUDIO_PATH])
text = """
انا عنان مؤسس شرع العلوم وانا هنا عشان اساعدك تتعلم اي حاجة عايز تتعلمها فالعلوم
انا شرع العلوم موقع تعليمي بيقدم كورسات مجانية في مجالات متعددة زي البرمجة، التصميم، التسويق، وغيرها
كل اللي عليك تعمله تزور الموقع وتختار الكورس اللي يناسبك وتبدأ تتعلم على طول من غير اي تكلفة
تحب تتعلم ايه النهاردة؟
اي اسئلة عندك انا هنا عشان اساعدك
اي استفسار انا تحت امرك
اسال زي ما انت عايز
في اي مجال تحب تتعلم اكتر؟
"""
print("Inference...")
out = model.inference(
    text,
    "ar",
    gpt_cond_latent,
    speaker_embedding,
    temperature=0.1,
)
AUDIO_OUTPUT_PATH = "output_audio.wav"
import soundfile as sf
sf.write(AUDIO_OUTPUT_PATH, out["wav"], 24000)
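
# Optional check (illustrative only): confirm the generated file is readable and report its length.
data, sr = sf.read(AUDIO_OUTPUT_PATH)
print(f"Wrote {AUDIO_OUTPUT_PATH}: {len(data) / sr:.1f} s at {sr} Hz")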