Salma Mohammed Hamed / AI Tutor - Commits

Commit 6c10aac1
Authored Oct 26, 2025 by SalmaMohammedHamedMustafa
Commit message: mcq AI builder
Parent: 8d42c50c

Showing 7 changed files with 675 additions and 91 deletions (+675, -91)
self_hosted_env/voice_agent/main.py                        +71,  -1
self_hosted_env/voice_agent/process_pdf_pipline.py         +114, -89
self_hosted_env/voice_agent/services/agent_service.py      +130, -0
self_hosted_env/voice_agent/services/pgvector_service.py   +57,  -1
self_hosted_env/voice_agent/setup_mcq_table.py             +66,  -0
self_hosted_env/voice_agent/start.sh                       +2,   -0
test_cases/msq_test.py                                     +235, -0
self_hosted_env/voice_agent/main.py (view file @ 6c10aac1)

Changes: `logger` is added to the fastapi import, `import logging` is added, a `logger` is created inside create_app(), and two new MCQ endpoints are registered after the curriculum-processing endpoint.

import os
import shutil
from fastapi import FastAPI, UploadFile, File, Form, HTTPException, Request, BackgroundTasks, logger
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse, Response
from fastapi.staticfiles import StaticFiles
...

@@ -12,6 +12,7 @@ from pathlib import Path
import tempfile
import json
import pandas as pd
import logging
from curriculum_structure import convert_json_to_db_format
from process_pdf_pipline import run_full_pipeline
...

@@ -96,6 +97,7 @@ async def lifespan(app: FastAPI):
def create_app() -> FastAPI:
    # Connect the lifespan manager to your FastAPI app instance
    app = FastAPI(title="Unified Chat API with Local Agent", lifespan=lifespan)
    logger = logging.getLogger("uvicorn.error")
    # Fixed CORS configuration for CapRover
    app.add_middleware(
...

@@ -336,6 +338,74 @@ def create_app() -> FastAPI:
        return {"status": "processing_started", "message": "The curriculum is being processed in the background."}

    @app.post("/mcq/generate")
    async def generate_mcqs_handler(
        request: Request,
        grade: int = Form(...),
        subject: str = Form(...),
        unit: str = Form(...),
        concept: str = Form(...),
        count: int = Form(5),
        is_arabic: bool = Form(False)
    ):
        """
        Generates and stores a new set of MCQs for a specific topic.
        """
        container = request.app.state.container
        try:
            generated_questions = container.agent_service.generate_and_store_mcqs(
                grade=grade,
                subject=subject,
                unit=unit,
                concept=concept,
                num_questions=count,
                is_arabic=is_arabic
            )
            return {
                "status": "success",
                "message": f"Successfully generated and stored {len(generated_questions)} MCQs.",
                "questions": generated_questions
            }
        except HTTPException as e:
            raise e  # Re-raise FastAPI specific exceptions
        except Exception as e:
            logger.error(f"Error in generate_mcqs_handler: {e}")
            raise HTTPException(status_code=500, detail=str(e))

    @app.get("/mcq")
    async def get_mcqs_handler(
        request: Request,
        grade: int,
        subject: str,
        unit: str,
        concept: str,
        is_arabic: bool,
        # Make limit optional. If not provided, it will be None.
        limit: Optional[int] = None
    ):
        """
        Retrieves existing MCQs for a specific topic and language from the database.
        If no limit is provided, retrieves all questions.
        """
        container = request.app.state.container
        try:
            questions = container.agent_service.pgvector.get_mcqs(
                grade=grade,
                subject=subject,
                unit=unit,
                concept=concept,
                is_arabic=is_arabic,
                limit=limit  # Pass the limit (which could be None)
            )
            return {
                "status": "success",
                "count": len(questions),
                "questions": questions
            }
        except Exception as e:
            logger.error(f"Error in get_mcqs_handler: {e}")
            raise HTTPException(status_code=500, detail=str(e))

    @app.options("/get-audio-response")
    async def audio_response_options():
        """Handle preflight CORS requests for audio response endpoint"""
        ...
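For reference, a successful response from POST /mcq/generate follows the shape the handler above returns; the question fields mirror what agent_service stores. The values below are placeholders, not real data:

# Placeholder values; the real text comes from the LLM and the curriculum context.
{
    "status": "success",
    "message": "Successfully generated and stored 3 MCQs.",
    "questions": [
        {
            "grade": 4, "is_arabic": False, "subject": "Science",
            "unit": "...", "concept": "...",
            "question_text": "...",
            "correct_answer": "...",
            "wrong_answer_1": "...", "wrong_answer_2": "...", "wrong_answer_3": "..."
        }
    ]
}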
self_hosted_env/voice_agent/process_pdf_pipline.py (view file @ 6c10aac1)

@@ -19,7 +19,7 @@ import json
import numpy as np
from openai import OpenAI
from typing import List, Dict, Union, Any, Optional, Tuple
from pydantic import BaseModel, Field, ValidationError
import google.generativeai as genai
from google.generativeai import types
import csv
...

(`ValidationError` is newly imported from pydantic.)

@@ -27,7 +27,7 @@ import logging
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# change level=logging.DEBUG for more verbose output during debugging
# =========================
# 1. Initialization and Setup
...

@@ -432,9 +432,6 @@ Output must be ONLY a valid JSON object (no markdown, no extra text) conforming
        )
        initial_json_string = resp_initial.text.strip()
        initial_data = json.loads(initial_json_string)
        book_structure = BookStructure(**initial_data)
        logging.info("✅ Gemini initial structure extraction successful.")
...

Removed from that hunk (the internal bookkeeping fields are no longer injected before validation):
        # Add internal fields
        initial_data['_page_count'] = pdf_total_pages
        initial_data['_source_file'] = "N/A"  # Placeholder

@@ -707,23 +704,54 @@ class EmbeddingProcessor:
            logging.error(f"❌ Error generating embedding for text: '{text[:100]}...' - {str(e)}", exc_info=True)
            return [0.0] * Config.OPENAI_EMBEDDING_DIMENSION

Removed, the per-character Arabic detector:

    def detect_arabic_text(self, text: str) -> bool:
        """Simple detection of Arabic text based on character ranges."""
        if not text or pd.isna(text):
            return False
        text = str(text)
        arabic_chars = 0
        total_chars = 0
        for char in text:
            if char.strip():
                total_chars += 1
                if ('\u0600' <= char <= '\u06FF') or ('\u0750' <= char <= '\u077F') or \
                   ('\u08A0' <= char <= '\u08FF') or ('\uFB50' <= char <= '\uFDFF') or \
                   ('\uFE70' <= char <= '\uFEFF'):
                    arabic_chars += 1
        return total_chars > 0 and (arabic_chars / total_chars) > Config.ARABIC_CHAR_THRESHOLD

Added in its place, a hierarchical page-to-context lookup:

    def find_context_for_page(self, page_num: int, structured_data: Dict[str, Any]) -> Optional[Dict[str, str]]:
        """
        Robust hierarchical lookup. It first tries to find a specific Lesson,
        then falls back to finding a Concept, and finally a Unit.
        """
        # --- Level 1: Try to find an exact LESSON match ---
        for unit_data in structured_data.get("units", []):
            unit_name = unit_data.get("unit_name", "Unknown Unit")
            for concept_data in unit_data.get("concepts", []):
                concept_name = concept_data.get("concept_name", "Unknown Concept")
                for lesson_data in concept_data.get("lessons", []):
                    pages = lesson_data.get("pages")
                    if pages and pages.get('start_page', 0) <= page_num <= pages.get('end_page', 0):
                        return {
                            'Unit': unit_name,
                            'Concept': concept_name,
                            'Lesson': lesson_data.get("lesson_name", "Unknown Lesson")
                        }

        # --- Level 2: If no lesson matched, try a CONCEPT match ---
        # This perfectly handles your scenario where Concept ranges are good but Lessons are bad.
        for unit_data in structured_data.get("units", []):
            unit_name = unit_data.get("unit_name", "Unknown Unit")
            for concept_data in unit_data.get("concepts", []):
                pages = concept_data.get("pages")
                # We check if the page is within the CONCEPT'S overall range
                if pages and pages.get('start_page', 0) <= page_num <= pages.get('end_page', 0):
                    return {
                        'Unit': unit_name,
                        'Concept': concept_data.get("concept_name", "Unknown Concept"),
                        # We label it as general content for this concept since it didn't match a specific lesson
                        'Lesson': f"General content for {concept_data.get('concept_name', 'Concept')}"
                    }

        # --- Level 3: If still nothing, try a UNIT match ---
        for unit_data in structured_data.get("units", []):
            pages = unit_data.get("pages")
            if pages and pages.get('start_page', 0) <= page_num <= pages.get('end_page', 0):
                return {
                    'Unit': unit_data.get("unit_name", "Unknown Unit"),
                    'Concept': "General Unit Content",
                    'Lesson': f"General content for {unit_data.get('unit_name', 'Unit')}"
                }

        return None  # Page completely outside any known structure
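To make the three-level fallback concrete, here is a small illustrative walk-through. The structure dict and page numbers are invented for the example, and `processor` is assumed to be an EmbeddingProcessor instance; only find_context_for_page itself comes from the file above:

# Illustrative only: one unit (pages 1-30), one concept (pages 1-15), one lesson (pages 1-5).
sample_structure = {
    "units": [{
        "unit_name": "Unit 1",
        "pages": {"start_page": 1, "end_page": 30},
        "concepts": [{
            "concept_name": "Concept 1.1",
            "pages": {"start_page": 1, "end_page": 15},
            "lessons": [{
                "lesson_name": "Lesson 1",
                "pages": {"start_page": 1, "end_page": 5},
            }],
        }],
    }]
}

processor.find_context_for_page(3, sample_structure)
# Level 1 hit  -> {'Unit': 'Unit 1', 'Concept': 'Concept 1.1', 'Lesson': 'Lesson 1'}
processor.find_context_for_page(10, sample_structure)
# Level 2 hit  -> Lesson becomes "General content for Concept 1.1"
processor.find_context_for_page(25, sample_structure)
# Level 3 hit  -> Concept becomes "General Unit Content"
processor.find_context_for_page(99, sample_structure)
# -> None (page outside any known structure)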
@@ -733,75 +761,70 @@ class EmbeddingProcessor:

The lesson-based embedding routine is removed and replaced by a page-based, gapless version.

Removed:

    def process_structured_data_for_embeddings(
        self,
        structured_data: Dict[str, Any],
        page_texts: Dict[int, str],
        ...
        output_csv_path: str
    ):
        """
        Takes Gemini-extracted data and generates embeddings.
        This version has a more robust repair logic for missing page numbers.
        """
        if not structured_data or not structured_data.get("units"):
            logging.warning("❌ No structured units found for embedding.")
            return

        logging.info(f"Generating robust embeddings for grade {grade}, {lang} content...")
        output_rows = []
        is_arabic = (lang == "arabic")
        total_pages = max(page_texts.keys()) if page_texts else 0
        last_known_page = 0

        for unit_data in tqdm(structured_data.get("units", []), desc="Processing Units for Embeddings"):
            unit_name = unit_data.get("unit_name", "Unknown Unit")
            for concept_data in unit_data.get("concepts", []):
                concept_name = concept_data.get("concept_name", "Unknown Concept")
                for lesson_data in concept_data.get("lessons", []):
                    lesson_name = lesson_data.get("lesson_name", "Unknown Lesson")
                    pages = lesson_data.get("pages")
                    start_page, end_page = 0, 0
                    if pages and pages.get('start_page', 0) > 0 and pages.get('end_page', 0) > 0:
                        # This is the "happy path": Gemini gave us valid pages
                        start_page, end_page = pages['start_page'], pages['end_page']
                    else:
                        # --- THIS IS THE NEW, SMARTER REPAIR LOGIC ---
                        # Suggest a start page based on the last known page
                        suggested_start = last_known_page + 1
                        # If our suggestion is already past the end of the book, we can't continue.
                        if suggested_start > total_pages:
                            logging.warning(f" -> Skipping Lesson '{lesson_name}': Suggested start ({suggested_start}) is beyond total pages ({total_pages}).")
                            continue  # Move to the next lesson
                        start_page = suggested_start
                        # Ensure the end page is at least the start page, and not past the end of the book.
                        end_page = min(start_page + 4, total_pages)
                        logging.warning(f" -> Repairing Lesson '{lesson_name}': Applying default pages [{start_page}-{end_page}]")

                    # Final safety check, although the logic above should prevent this.
                    if start_page > end_page:
                        logging.error(f" -> CRITICAL SKIP for Lesson '{lesson_name}': Invalid final page range [{start_page}-{end_page}]")
                        continue

                    # Update the tracker for the next iteration with a valid page number
                    last_known_page = end_page

                    lesson_full_text = " ".join([page_texts.get(p, "") for p in range(start_page, end_page + 1)])
                    for chunk_idx, chunk_text in enumerate(self.chunk_text(lesson_full_text, is_arabic)):
                        if not chunk_text:
                            continue
                        output_rows.append({
                            'Grade': grade,
                            'Subject': subject,
                            'Unit': unit_name,
                            'Concept': concept_name,
                            'Lesson': lesson_name,
                            'From page': start_page,
                            'To page': end_page,
                            'Chunk index': chunk_idx,
                            'Chunk text': chunk_text,
                            'Is Arabic': is_arabic,
                            'Embedding': json.dumps(self.get_embedding(chunk_text))
                        })

        df = pd.DataFrame(output_rows)
        df.to_csv(output_csv_path, index=False, quoting=csv.QUOTE_MINIMAL, encoding="utf-8-sig")
        logging.info(f"✅ Embeddings saved to: {output_csv_path} ({len(output_rows)} chunks generated)")

Added:

    def process_all_pages_for_embeddings_gapless(
        self,
        structured_data: Dict[str, Any],
        page_texts: Dict[int, str],
        ...
        output_csv_path: str
    ):
        """
        Processes EVERY page from the OCR output to guarantee 100% content coverage.
        It uses the Gemini structure as a lookup to label pages, and applies the last
        known label to any pages that fall in gaps within the structure.
        """
        if not page_texts:
            logging.warning("❌ No page texts available to process for embeddings.")
            return

        logging.info(f"🚀 Starting GAPLESS embedding generation for Grade {grade}, Subject {subject}.")
        output_rows = []
        is_arabic = (lang == "arabic")

        # Initialize a fallback context for pages that are not in the structure (gaps)
        last_known_context = {
            'Unit': 'Uncategorized',
            'Concept': 'Uncategorized',
            'Lesson': 'Uncategorized'
        }

        # The main loop iterates through ALL pages, guaranteeing no gaps.
        for page_num in tqdm(sorted(page_texts.keys()), desc="Embedding All Pages"):
            page_text = page_texts[page_num]
            if not page_text.strip():
                continue  # Skip empty pages

            # Find the context for the current page from Gemini's structure
            context = self.find_context_for_page(page_num, structured_data)
            if context:
                # If found, this is our new "last known" good context
                last_known_context = context
                current_context = context
            else:
                # If not found (page is in a gap), use the last valid context we saw.
                logging.warning(f" -> Page {page_num} not in structure. Applying last known context: '{last_known_context['Lesson']}'")
                current_context = last_known_context

            # Chunk the text of the single page
            for chunk_idx, chunk_text in enumerate(self.chunk_text(page_text, is_arabic)):
                if not chunk_text:
                    continue
                output_rows.append({
                    'Grade': grade,
                    'Subject': subject,
                    'Unit': current_context['Unit'],
                    'Concept': current_context['Concept'],
                    'Lesson': current_context['Lesson'],
                    'From page': page_num,  # Metadata is now per-page
                    'To page': page_num,
                    'Chunk index': chunk_idx,
                    'Chunk text': chunk_text,
                    'Is Arabic': is_arabic,
                    'Embedding': json.dumps(self.get_embedding(chunk_text))
                })

        df = pd.DataFrame(output_rows)
        df.to_csv(output_csv_path, index=False, quoting=csv.QUOTE_MINIMAL, encoding="utf-8-sig")
        logging.info(f"✅ Gapless embeddings saved to: {output_csv_path} ({len(output_rows)} chunks generated from {len(page_texts)} pages)")
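The loop above relies on EmbeddingProcessor.chunk_text and get_embedding, which are not part of this diff. As a rough mental model only, and explicitly not the repository's actual implementation, a page-level chunker could look like this:

# Hypothetical sketch, NOT the project's chunk_text: split a page into
# fixed-size word windows so each chunk stays within embedding input limits.
def chunk_text_sketch(text: str, max_words: int = 200) -> list[str]:
    words = text.split()
    return [
        " ".join(words[i:i + max_words])
        for i in range(0, len(words), max_words)
    ]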
# process_pdf_pipline.py
# =========================
# 8. Main Pipeline Function
# =========================
def repair_and_enrich_structure(gemini_output: Dict, lang: str, grade: str, total_pages: int) -> Dict[str, Any]:
    logging.warning("🔧 Sanitizing and repairing Gemini's output...")
...

(The "8. Main Pipeline Function" banner now precedes repair_and_enrich_structure; it previously sat just before run_full_pipeline.)

@@ -831,16 +854,15 @@ def repair_and_enrich_structure(gemini_output: Dict, lang: str, grade: str, tota
    return gemini_output

def run_full_pipeline(pdf_path: str, grade: int, subject: str, output_json_path: str, output_embeddings_csv_path: str, remove_lessons: bool = False):
    logging.info(f"\n--- Starting Pipeline for {pdf_path} (Grade: {grade}, Subject: {subject}) ---")
    gemini_raw_output = {}
    try:
        page_texts, lang, _, tracked_titles = process_pdf_to_text(pdf_path)
        if not page_texts:
            logging.critical("❌ CRITICAL: No text could be extracted from the PDF. Aborting.")
            return
        pdf_total_pages = max(page_texts.keys())
        toc_contents = extract_toc_pages_from_first_n(page_texts, lang)
...

(New in this hunk: gemini_raw_output is initialized, and the empty page_texts case now logs a critical message before returning instead of returning silently.)

@@ -871,7 +893,9 @@ def run_full_pipeline(pdf_path: str, grade: int, subject: str, output_json_path:
        if openai_client:
            embedding_processor = EmbeddingProcessor(client=openai_client)
            # --- THIS IS THE FINAL, ROBUST CALL ---
            embedding_processor.process_all_pages_for_embeddings_gapless(
                gemini_structured_data,
                page_texts,
                lang,
...

(The pipeline previously called process_structured_data_for_embeddings here.)

@@ -879,7 +903,8 @@ def run_full_pipeline(pdf_path: str, grade: int, subject: str, output_json_path:
                subject,
                output_embeddings_csv_path
            )
    except Exception as e:
        logging.critical(f"Pipeline error: {e}", exc_info=True)

    logging.info(f"\n--- Pipeline finished for {pdf_path} ---")
(no newline at end of file)
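For orientation, a call into the pipeline with the signature shown above might look like the sketch below. The paths and grade/subject values are illustrative only, not taken from the repository:

# Illustrative invocation; file paths and values are hypothetical.
run_full_pipeline(
    pdf_path="books/grade4_science.pdf",
    grade=4,
    subject="Science",
    output_json_path="out/grade4_science_structure.json",
    output_embeddings_csv_path="out/grade4_science_embeddings.csv",
    remove_lessons=False,
)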
self_hosted_env/voice_agent/services/agent_service.py (view file @ 6c10aac1)

@@ -3,6 +3,7 @@ import os
from typing import List, Dict, Optional
from fastapi import HTTPException
import sys
import json
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from core import StudentNationality, Models
...

(`import json` is the newly added line.)

@@ -94,3 +95,132 @@ class AgentService:
        except Exception as e:
            logger.error(f"Error closing connection pools: {e}")

    def generate_and_store_mcqs(self, grade: int, subject: str, unit: str, concept: str, is_arabic: bool, num_questions: int = 5) -> List[Dict]:
        """
        Generates NEW, UNIQUE MCQs for a topic by first retrieving existing ones
        and instructing the AI to avoid generating duplicates.
        """
        if not self.pgvector:
            raise HTTPException(status_code=503, detail="Vector service is not available for context retrieval.")

        # === STEP 1: RETRIEVE EXISTING QUESTIONS ===
        logger.info(f"Checking for existing questions for: {grade}/{subject}/{unit}/{concept}")
        existing_questions = self.pgvector.get_mcqs(
            grade, subject, unit, concept, is_arabic,
            limit=None  # Fetch ALL existing questions
        )

        existing_questions_text = "No existing questions found."
        if existing_questions:
            # Format the existing questions into a simple list for the prompt
            q_list = [f"- {q['question_text']}" for q in existing_questions]
            existing_questions_text = "\n".join(q_list)
            logger.info(f"Found {len(existing_questions)} existing questions. Will instruct AI to generate different ones.")

        # === STEP 2: RETRIEVE CURRICULUM CONTEXT ===
        search_query = f"summary of {concept} in {unit}"
        query_embedding = self.openai_service.generate_embedding(search_query)
        context_chunks = self.pgvector.search_filtered_nearest(query_embedding, grade, subject, is_arabic, limit=10)
        if not context_chunks:
            raise HTTPException(status_code=404, detail="No curriculum context found for this topic in the specified language.")
        full_context = "\n---\n".join([chunk['chunk_text'] for chunk in context_chunks])

        # === STEP 3: CREATE THE ADVANCED, AWARE PROMPT ===
        if is_arabic:
            # Arabic prompt: same structure as the English prompt below. It lists the existing
            # questions, supplies the retrieved curriculum context, and asks for {num_questions}
            # new, completely different MCQs returned as a JSON array with the same keys.
            prompt = f"""
أنت خبير في تطوير المناهج ومهمتك إنشاء أسئلة اختيار من متعدد جديدة ومختلفة.

هذه هي الأسئلة الموجودة حاليًا في قاعدة البيانات حول المفهوم "{concept}":
--- الأسئلة الحالية ---
{existing_questions_text}
--- نهاية الأسئلة الحالية ---

اعتمادًا فقط على السياق التالي من المنهج:
--- السياق ---
{full_context}
--- نهاية السياق ---

يرجى توليد {num_questions} من أسئلة الاختيار من متعدد **الجديدة والمختلفة تمامًا** عن الأسئلة الموجودة أعلاه.
يجب أن تكون كل الأسئلة قابلة للإجابة مباشرة من السياق المقدم.
يجب أن يكون ردك مصفوفة JSON صحيحة. كل كائن يجب أن يحتوي على المفاتيح التالية:
- "question_text": نص السؤال.
- "correct_answer": الإجابة الصحيحة.
- "wrong_answer_1": إجابة خاطئة.
- "wrong_answer_2": إجابة خاطئة.
- "wrong_answer_3": إجابة خاطئة.
لا تكتب أي نص أو شرح خارج مصفوفة الـ JSON.
"""
        else:
            prompt = f"""
You are an expert curriculum developer tasked with creating new and unique multiple-choice questions.

Here are the questions that ALREADY EXIST in the database for the concept "{concept}":
--- EXISTING QUESTIONS ---
{existing_questions_text}
--- END EXISTING QUESTIONS ---

Based ONLY on the following context from the curriculum:
--- CONTEXT ---
{full_context}
--- END CONTEXT ---

Please generate {num_questions} NEW and COMPLETELY DIFFERENT multiple-choice questions from the list of existing ones above.
Each question must be answerable directly from the provided context. The questions and all answers MUST be in English.
Your response MUST be a valid JSON array of objects with these keys:
- "question_text"
- "correct_answer"
- "wrong_answer_1"
- "wrong_answer_2"
- "wrong_answer_3"
Do not include any text outside of the JSON array.
"""

        # === STEP 4 & 5: CALL LLM, PARSE, and STORE (No changes here) ===
        try:
            # ... (The entire try/except block for calling the LLM remains exactly the same)
            response = self.openai_service.client.chat.completions.create(
                model=Models.chat,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.5,  # Slightly higher temp for more creativity
                response_format={"type": "json_object"}
            )
            response_content = response.choices[0].message.content
            json_response = json.loads(response_content)

            generated_questions = []
            for key, value in json_response.items():
                if isinstance(value, list):
                    generated_questions = value
                    break
            if not generated_questions:
                raise ValueError("LLM did not return a list of questions in the JSON response.")
        except (json.JSONDecodeError, ValueError, KeyError) as e:
            logger.error(f"Failed to parse MCQ response from LLM: {e}\nRaw Response: {response_content}")
            raise HTTPException(status_code=500, detail="Failed to generate or parse MCQs from AI.")

        mcqs_to_store = []
        for q in generated_questions:
            mcqs_to_store.append({
                "grade": grade,
                "is_arabic": is_arabic,
                "subject": subject,
                "unit": unit,
                "concept": concept,
                "question_text": q["question_text"],
                "correct_answer": q["correct_answer"],
                "wrong_answer_1": q["wrong_answer_1"],
                "wrong_answer_2": q["wrong_answer_2"],
                "wrong_answer_3": q["wrong_answer_3"],
            })

        self.pgvector.insert_mcqs(mcqs_to_store)
        return mcqs_to_store
(no newline at end of file)
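Because the chat call uses response_format={"type": "json_object"}, the model returns an object rather than a bare array, so the parsing loop above takes the first list-valued key it finds. A toy illustration; the wrapper key name "questions" is only an example, since the code accepts any key whose value is a list:

# Toy illustration of the unwrapping step; values are placeholders.
json_response = {
    "questions": [
        {"question_text": "...", "correct_answer": "...",
         "wrong_answer_1": "...", "wrong_answer_2": "...", "wrong_answer_3": "..."}
    ]
}
generated_questions = []
for key, value in json_response.items():
    if isinstance(value, list):
        generated_questions = value  # the list of question dicts
        break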
self_hosted_env/voice_agent/services/pgvector_service.py (view file @ 6c10aac1)

@@ -523,4 +523,60 @@ class PGVectorService:
            print("=" * 50)
        except Exception as e:
            print(f"❌ Database verification failed: {e}")

    def insert_mcqs(self, mcq_list: List[Dict]):
        """
        Inserts a batch of MCQs, now including the language flag.
        """
        if not mcq_list:
            return
        with self.pool_handler.get_connection() as conn:
            with conn.cursor() as cur:
                # --- UPDATED QUERY ---
                insert_query = """
                    INSERT INTO mcq_questions (
                        grade, is_arabic, subject, unit, concept, question_text,
                        correct_answer, wrong_answer_1, wrong_answer_2, wrong_answer_3
                    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
                """
                # --- UPDATED DATA PREPARATION ---
                data_to_insert = [
                    (
                        q['grade'], q['is_arabic'], q['subject'], q['unit'], q['concept'],
                        q['question_text'], q['correct_answer'],
                        q['wrong_answer_1'], q['wrong_answer_2'], q['wrong_answer_3']
                    )
                    for q in mcq_list
                ]
                cur.executemany(insert_query, data_to_insert)
                conn.commit()
        logger.info(f"Successfully inserted {len(mcq_list)} MCQs into the database.")

    def get_mcqs(self, grade: int, subject: str, unit: str, concept: str, is_arabic: bool, limit: Optional[int] = 10) -> List[Dict]:
        """
        Retrieves MCQs for a specific topic and language.
        If limit is None, it retrieves all matching questions.
        """
        with self.pool_handler.get_connection() as conn:
            with conn.cursor(cursor_factory=RealDictCursor) as cur:
                # Dynamically build the query based on the limit
                query = """
                    SELECT id, question_text, correct_answer, wrong_answer_1, wrong_answer_2, wrong_answer_3
                    FROM mcq_questions
                    WHERE grade = %s AND subject = %s AND unit = %s AND concept = %s AND is_arabic = %s
                    ORDER BY created_at DESC
                """
                params = (grade, subject, unit, concept, is_arabic)
                if limit is not None:
                    query += " LIMIT %s;"
                    params += (limit,)
                else:
                    query += ";"
                cur.execute(query, params)
                return cur.fetchall()
(no newline at end of file)
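The limit handling produces one of two final statements: with a numeric limit the query ends in "ORDER BY created_at DESC LIMIT %s;" and the limit is appended to the parameter tuple, with limit=None it simply ends in ";" and every matching row comes back. A call might look like the sketch below; the topic values are illustrative and pgvector_service is assumed to be a PGVectorService instance:

# Illustrative call; each returned row is a RealDictCursor dict with the selected columns.
questions = pgvector_service.get_mcqs(
    grade=4, subject="Science",
    unit="Unit 1", concept="Concept 1.1",
    is_arabic=False, limit=5,
)
# e.g. questions[0]["question_text"], questions[0]["correct_answer"], ...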
self_hosted_env/voice_agent/setup_mcq_table.py (new file, 0 → 100644, view file @ 6c10aac1)

import psycopg2
import os
from dotenv import load_dotenv

load_dotenv()

def setup_mcq_table(drop_existing_table: bool = False):
    """
    Sets up the mcq_questions table in the database.
    """
    try:
        conn = psycopg2.connect(
            host=os.getenv("POSTGRES_HOST", "localhost"),
            port=os.getenv("POSTGRES_PORT", "5432"),
            user=os.getenv("POSTGRES_USER"),
            password=os.getenv("POSTGRES_PASSWORD"),
            dbname=os.getenv("POSTGRES_DB")
        )
        conn.autocommit = True
        with conn.cursor() as cur:
            if drop_existing_table:
                print("Dropping existing mcq_questions table...")
                cur.execute("DROP TABLE IF EXISTS mcq_questions CASCADE;")
                print("Table dropped.")

            print("Creating mcq_questions table...")
            # --- THIS IS THE UPDATED TABLE SCHEMA ---
            cur.execute("""
                CREATE TABLE IF NOT EXISTS mcq_questions (
                    id SERIAL PRIMARY KEY,
                    grade INTEGER NOT NULL,
                    is_arabic BOOLEAN NOT NULL, -- <-- ADDED THIS LINE
                    subject TEXT NOT NULL,
                    unit TEXT NOT NULL,
                    concept TEXT NOT NULL,
                    question_text TEXT NOT NULL,
                    correct_answer TEXT NOT NULL,
                    wrong_answer_1 TEXT NOT NULL,
                    wrong_answer_2 TEXT NOT NULL,
                    wrong_answer_3 TEXT NOT NULL,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                );
            """)

            print("Creating indexes on mcq_questions table...")
            # --- THIS IS THE UPDATED INDEX ---
            cur.execute("""
                CREATE INDEX IF NOT EXISTS idx_mcq_topic
                ON mcq_questions(grade, is_arabic, subject, unit, concept); -- <-- ADDED is_arabic
            """)

            print("MCQ table setup complete.")
    except Exception as e:
        print(f"An error occurred during MCQ table setup: {e}")
    finally:
        if 'conn' in locals() and conn:
            conn.close()
            print("Database connection closed.")

if __name__ == "__main__":
    # To apply the changes, it's best to drop and recreate the table.
    # Be careful if you have existing data you want to keep!
    print("Creating MCQ table...")
    setup_mcq_table(drop_existing_table=False)
(no newline at end of file)
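As the closing comment notes, a schema change such as the new is_arabic column only reaches an already-existing table if it is dropped and recreated; the CREATE TABLE IF NOT EXISTS above will not alter it. When losing the stored questions is acceptable, the drop flag can be passed explicitly:

# Destructive: drops mcq_questions (and any stored questions) before recreating it with the new schema.
setup_mcq_table(drop_existing_table=True)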
self_hosted_env/voice_agent/start.sh (view file @ 6c10aac1)

The two new lines run the MCQ table setup during container startup.

@@ -10,6 +10,8 @@ python insert_csv_embeddings.py
echo "Database setup complete."
python curriculum_structure.py
echo "Curriculum structure setup complete."
python setup_mcq_table.py
echo "MCQ table setup complete."
sleep 5
# Start the web server and keep it as the main process
...
test_cases/msq_test.py (new file, 0 → 100644, view file @ 6c10aac1)

"""
======================================================================
MCQ API Cookbook & Test Script
======================================================================

Purpose:
This script serves as both a live integration test and a practical guide ("cookbook")
for using the Multiple-Choice Question (MCQ) generation and retrieval API endpoints.

It demonstrates how to:
1. Generate and store new MCQs for a specific curriculum topic.
2. Retrieve existing MCQs from the database for that same topic.

----------------------------------------------------------------------
API Endpoints Guide
----------------------------------------------------------------------
There are two main endpoints for the MCQ feature:

1. Generate Questions (POST /mcq/generate)
------------------------------------------
This is the "creator" endpoint. It uses an AI model to generate a new set of questions
based on the curriculum content stored in the vector database. It then saves these
new questions to the `mcq_questions` table for future use.

- Method: POST
- URL: [BASE_URL]/mcq/generate
- Data Format: Must be sent as `application/x-www-form-urlencoded` (form data).

Parameters (Form Data):
- grade (int, required): The grade level of the curriculum (e.g., 4).
- subject (str, required): The subject of the curriculum (e.g., "Science").
- unit (str, required): The exact name of the unit.
- concept (str, required): The exact name of the concept.
- is_arabic (bool, required): Set to `true` for Arabic curriculum, `false` for English.
- count (int, optional, default=5): The number of new questions to generate.

Example Usage (using cURL):
curl -X POST [BASE_URL]/mcq/generate \
  -F "grade=4" \
  -F "subject=Science" \
  -F "unit=الوحدة الأولى: الأنظمة الحية" \
  -F "concept=المفهوم الأول: التكيف والبقاء" \
  -F "is_arabic=true" \
  -F "count=3"

2. Retrieve Questions (GET /mcq)
---------------------------------
This is the "reader" endpoint. It quickly and cheaply retrieves questions that have
already been generated and stored in the database. It does NOT call the AI model.

- Method: GET
- URL: [BASE_URL]/mcq

Parameters (URL Query Parameters):
- grade (int, required): The grade level.
- subject (str, required): The subject.
- unit (str, required): The unit name.
- concept (str, required): The concept name.
- is_arabic (bool, required): `true` for Arabic, `false` for English.
- limit (int, optional, default=None): The maximum number of questions to retrieve.
  If omitted, it will retrieve ALL questions for that topic.

Example Usage (using cURL):
# Get the 5 most recent questions for a topic
curl "[BASE_URL]/mcq?grade=4&subject=Science&unit=...&concept=...&is_arabic=true&limit=5"
# Get ALL questions for a topic
curl "[BASE_URL]/mcq?grade=4&subject=Science&unit=...&concept=...&is_arabic=true"

----------------------------------------------------------------------
How to Run This Script
----------------------------------------------------------------------
1. Ensure your FastAPI server is running.
2. Make sure the BASE_URL variable below is set to your server's address.
3. Run the script from your terminal: python3 msq_test.py
"""
import requests
import json
import time
from typing import Optional

# The base URL of your API server.
BASE_URL = "https://voice-agent.caprover.al-arcade.com"


def test_mcq_generation(grade: int, subject: str, unit: str, concept: str, is_arabic: bool, count: int):
    """
    Tests the POST /mcq/generate endpoint.
    """
    endpoint = f"{BASE_URL}/mcq/generate"
    payload = {
        "grade": grade,
        "subject": subject,
        "unit": unit,
        "concept": concept,
        "is_arabic": is_arabic,
        "count": count,
    }
    print(f">> Attempting to GENERATE {count} new questions for:")
    print(f"   Topic: Grade {grade} {subject} -> {unit} -> {concept}")
    print(f"   Language: {'Arabic' if is_arabic else 'English'}")
    try:
        response = requests.post(endpoint, data=payload, timeout=120)
        if response.status_code == 200:
            print(f"SUCCESS: API returned status code {response.status_code}")
            data = response.json()
            print(f"   Message: {data.get('message')}")
            if 'questions' in data and data['questions']:
                print("\n--- Details of Generated Questions ---")
                for i, q in enumerate(data['questions'], 1):
                    print(f"   {i}. Question: {q['question_text']}")
                    print(f"      Correct: {q['correct_answer']}")
                    print(f"      Wrong 1: {q['wrong_answer_1']}")
                    print(f"      Wrong 2: {q['wrong_answer_2']}")
                    print(f"      Wrong 3: {q['wrong_answer_3']}\n")
            return True
        else:
            print(f"FAILED: API returned status code {response.status_code}")
            try:
                error_data = response.json()
                print(f"   Error Detail: {error_data.get('detail', 'No detail provided.')}")
            except json.JSONDecodeError:
                print(f"   Response was not valid JSON: {response.text}")
            return False
    except requests.exceptions.RequestException as e:
        print(f"FAILED: An error occurred while making the request: {e}")
        return False


def test_mcq_retrieval(grade: int, subject: str, unit: str, concept: str, is_arabic: bool, limit: Optional[int]):
    """
    Tests the GET /mcq endpoint with detailed output.
    """
    endpoint = f"{BASE_URL}/mcq"
    params = {
        "grade": grade,
        "subject": subject,
        "unit": unit,
        "concept": concept,
        "is_arabic": is_arabic,
    }
    if limit is not None:
        params["limit"] = limit
    limit_str = f"up to {limit}" if limit is not None else "ALL"
    print(f">> Attempting to RETRIEVE {limit_str} stored questions for the same topic...")
    try:
        response = requests.get(endpoint, params=params, timeout=30)
        if response.status_code == 200:
            print(f"SUCCESS: API returned status code {response.status_code}")
            data = response.json()
            print(f"   Found {data.get('count')} stored questions in the database.")
            if 'questions' in data and data['questions']:
                print("\n--- Details of Retrieved Questions ---")
                for i, q in enumerate(data['questions'], 1):
                    print(f"   {i}. Question: {q['question_text']}")
                    print(f"      Correct: {q['correct_answer']}")
                    print(f"      Wrong 1: {q['wrong_answer_1']}")
                    print(f"      Wrong 2: {q['wrong_answer_2']}")
                    print(f"      Wrong 3: {q['wrong_answer_3']}\n")
            elif data.get('count') == 0:
                print("   (This is expected if this is the first time generating questions for this topic)")
            return True
        else:
            print(f"FAILED: API returned status code {response.status_code}")
            try:
                error_data = response.json()
                print(f"   Error Detail: {error_data.get('detail', 'No detail provided.')}")
            except json.JSONDecodeError:
                print(f"   Response was not valid JSON: {response.text}")
            return False
    except requests.exceptions.RequestException as e:
        print(f"FAILED: An error occurred while making the request: {e}")
        return False


if __name__ == "__main__":
    print("\n" + "=" * 50)
    print("STARTING TEST 1: ARABIC MCQ GENERATION & RETRIEVAL")
    print("=" * 50)
    # IMPORTANT: Use actual Unit/Concept names from your database for the best results.
    arabic_test_data = {
        "grade": 4,
        "subject": "Science",
        "unit": "الوحدة الأولى: الأنظمة الحية",
        "concept": "المفهوم الأول: التكيف والبقاء",
        "is_arabic": True,
        "count": 3
    }
    generation_successful = test_mcq_generation(**arabic_test_data)
    if generation_successful:
        print("-" * 25)
        time.sleep(2)
        test_mcq_retrieval(limit=None, **{k: v for k, v in arabic_test_data.items() if k != 'count'})

    print("\n" + "=" * 50)
    print("STARTING TEST 2: ENGLISH MCQ GENERATION & RETRIEVAL")
    print("=" * 50)
    english_test_data = {
        "grade": 5,
        "subject": "Science",
        "unit": "Unit 1: Matter and Energy in Ecosystems",
        "concept": "Concept 1.1: Properties of Matter",
        "is_arabic": False,
        "count": 2
    }
    generation_successful = test_mcq_generation(**english_test_data)
    if generation_successful:
        print("-" * 25)
        time.sleep(2)
        test_mcq_retrieval(limit=None, **{k: v for k, v in english_test_data.items() if k != 'count'})

    print("\n" + "=" * 50)
    print("All tests complete.")
    print("=" * 50)
(no newline at end of file)