mcq AI builder

6c10aac1 · SalmaMohammedHamedMustafa · 8d42c50c · 6c10aac1 · 6c10aac1 · 6c10aac1
Commit 6c10aac1 authored Oct 26, 2025 by SalmaMohammedHamedMustafa
7 changed files
--- a/self_hosted_env/voice_agent/main.py
+++ b/self_hosted_env/voice_agent/main.py
 import os
 import shutil
-from fastapi import FastAPI, UploadFile, File, Form, HTTPException, Request, BackgroundTasks
+from fastapi import FastAPI, UploadFile, File, Form, HTTPException, Request, BackgroundTasks, logger
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import FileResponse, Response
 from fastapi.staticfiles import StaticFiles
@@ -12,6 +12,7 @@ from pathlib import Path
 import tempfile
 import json
 import pandas as pd
+import logging
 from curriculum_structure import convert_json_to_db_format
 from process_pdf_pipline import run_full_pipeline

@@ -96,6 +97,7 @@ async def lifespan(app: FastAPI):
 def create_app() -> FastAPI:
    # Connect the lifespan manager to your FastAPI app instance
    app = FastAPI(title="Unified Chat API with Local Agent", lifespan=lifespan)
+    logger = logging.getLogger("uvicorn.error")
    
    # Fixed CORS configuration for CapRover
    app.add_middleware(
@@ -336,6 +338,74 @@ def create_app() -> FastAPI:
        return {"status": "processing_started", "message": "The curriculum is being processed in the background."}


+    @app.post("/mcq/generate")
+    async def generate_mcqs_handler(
+        request: Request,
+        grade: int = Form(...),
+        subject: str = Form(...),
+        unit: str = Form(...),
+        concept: str = Form(...),
+        # At least one question must be requested; smaller values are rejected
+        # by FastAPI's validation before any AI call is made.
+        count: int = Form(5, ge=1),
+        is_arabic: bool = Form(False)
+    ):
+        """
+        Generates and stores a new set of MCQs for a specific topic.
+
+        Form fields:
+            grade, subject, unit, concept: identify the curriculum topic.
+            count: number of questions to generate (default 5, minimum 1).
+            is_arabic: generate for the Arabic curriculum when true (default false).
+
+        Returns:
+            A dict with "status", a human-readable "message" and the list of
+            stored "questions".
+
+        Raises:
+            HTTPException: passed through unchanged when raised by the agent
+                service; any other failure is reported as a 500.
+        """
+        container = request.app.state.container
+        # NOTE(review): generate_and_store_mcqs is a synchronous call made from an
+        # async handler, so it appears to block the event loop while it runs --
+        # confirm whether it should be moved to a worker thread.
+        try:
+            generated_questions = container.agent_service.generate_and_store_mcqs(
+                grade=grade,
+                subject=subject,
+                unit=unit,
+                concept=concept,
+                num_questions=count,
+                is_arabic=is_arabic
+            )
+            return {
+                "status": "success",
+                "message": f"Successfully generated and stored {len(generated_questions)} MCQs.",
+                "questions": generated_questions
+            }
+        except HTTPException:
+            # Re-raise FastAPI specific exceptions with their original traceback.
+            raise
+        except Exception as e:
+            # logger.exception keeps the traceback in the server log.
+            logger.exception(f"Error in generate_mcqs_handler: {e}")
+            raise HTTPException(status_code=500, detail=str(e)) from e
+
+    @app.get("/mcq")
+    async def get_mcqs_handler(
+        request: Request,
+        grade: int,
+        subject: str,
+        unit: str,
+        concept: str,
+        is_arabic: bool,
+        # Make limit optional. If not provided, it will be None.
+        limit: Optional[int] = None
+    ):
+        """
+        Retrieves existing MCQs for a specific topic and language from the database.
+        If no limit is provided, retrieves all questions.
+
+        Query parameters:
+            grade, subject, unit, concept, is_arabic: identify the topic and language.
+            limit: optional maximum number of questions; must not be negative.
+
+        Returns:
+            A dict with "status", the "count" of questions and the "questions" list.
+
+        Raises:
+            HTTPException: 422 for a negative limit, 500 for any unexpected failure;
+                an HTTPException raised further down is passed through unchanged.
+        """
+        # A negative LIMIT is rejected by the database; report it as a client
+        # error instead of letting it surface as a 500.
+        if limit is not None and limit < 0:
+            raise HTTPException(status_code=422, detail="limit must be a non-negative integer.")
+
+        container = request.app.state.container
+        try:
+            questions = container.agent_service.pgvector.get_mcqs(
+                grade=grade,
+                subject=subject,
+                unit=unit,
+                concept=concept,
+                is_arabic=is_arabic,
+                limit=limit # Pass the limit (which could be None)
+            )
+            return {
+                "status": "success",
+                "count": len(questions),
+                "questions": questions
+            }
+        except HTTPException:
+            # Same policy as generate_mcqs_handler: never mask an explicit HTTP error as a 500.
+            raise
+        except Exception as e:
+            logger.exception(f"Error in get_mcqs_handler: {e}")
+            raise HTTPException(status_code=500, detail=str(e)) from e
+            
    @app.options("/get-audio-response")
    async def audio_response_options():
        """Handle preflight CORS requests for audio response endpoint"""

--- a/self_hosted_env/voice_agent/process_pdf_pipline.py
+++ b/self_hosted_env/voice_agent/process_pdf_pipline.py
--- a/self_hosted_env/voice_agent/services/agent_service.py
+++ b/self_hosted_env/voice_agent/services/agent_service.py
@@ -3,6 +3,7 @@ import os
 from typing import List, Dict, Optional
 from fastapi import HTTPException
 import sys
+import json
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

 from core import StudentNationality, Models
@@ -94,3 +95,132 @@ class AgentService:
            except Exception as e:
                logger.error(f"Error closing connection pools: {e}")

+
+    def generate_and_store_mcqs(
+        self, grade: int, subject: str, unit: str, concept: str, is_arabic: bool, num_questions: int = 5
+    ) -> List[Dict]:
+        """
+        Generates NEW, UNIQUE MCQs for a topic by first retrieving existing ones
+        and instructing the AI to avoid generating duplicates.
+
+        Args:
+            grade, subject, unit, concept: identify the curriculum topic.
+            is_arabic: selects the Arabic prompt and Arabic curriculum chunks.
+            num_questions: how many questions the AI is asked to produce.
+
+        Returns:
+            The list of question dicts that were inserted into the database.
+
+        Raises:
+            HTTPException: 503 when the vector service is missing, 404 when no
+                curriculum context is found, 500 when the AI reply cannot be
+                parsed into complete questions.
+        """
+        if not self.pgvector:
+            raise HTTPException(status_code=503, detail="Vector service is not available for context retrieval.")
+
+        # === STEP 1: RETRIEVE EXISTING QUESTIONS ===
+        logger.info(f"Checking for existing questions for: {grade}/{subject}/{unit}/{concept}")
+        existing_questions = self.pgvector.get_mcqs(
+            grade, subject, unit, concept, is_arabic, limit=None # Fetch ALL existing questions
+        )
+
+        existing_questions_text = "No existing questions found."
+        if existing_questions:
+            # Format the existing questions into a simple list for the prompt
+            existing_questions_text = "\n".join(
+                f"- {q['question_text']}" for q in existing_questions
+            )
+            logger.info(f"Found {len(existing_questions)} existing questions. Will instruct AI to generate different ones.")
+
+        # === STEP 2: RETRIEVE CURRICULUM CONTEXT ===
+        search_query = f"summary of {concept} in {unit}"
+        query_embedding = self.openai_service.generate_embedding(search_query)
+
+        context_chunks = self.pgvector.search_filtered_nearest(
+            query_embedding, grade, subject, is_arabic, limit=10
+        )
+
+        if not context_chunks:
+            raise HTTPException(status_code=404, detail="No curriculum context found for this topic in the specified language.")
+
+        full_context = "\n---\n".join([chunk['chunk_text'] for chunk in context_chunks])
+
+        # === STEP 3: CREATE THE ADVANCED, AWARE PROMPT ===
+        if is_arabic:
+            prompt = f"""
+            أنت خبير في تطوير المناهج ومهمتك إنشاء أسئلة اختيار من متعدد جديدة ومختلفة.
+            
+            هذه هي الأسئلة الموجودة حاليًا في قاعدة البيانات حول المفهوم "{concept}":
+            --- الأسئلة الحالية ---
+            {existing_questions_text}
+            --- نهاية الأسئلة الحالية ---
+
+            اعتمادًا فقط على السياق التالي من المنهج:
+            --- السياق ---
+            {full_context}
+            --- نهاية السياق ---
+            
+            يرجى توليد {num_questions} من أسئلة الاختيار من متعدد **الجديدة والمختلفة تمامًا** عن الأسئلة الموجودة أعلاه.
+            يجب أن تكون كل الأسئلة قابلة للإجابة مباشرة من السياق المقدم.
+            
+            يجب أن يكون ردك مصفوفة JSON صحيحة. كل كائن يجب أن يحتوي على المفاتيح التالية:
+            - "question_text": نص السؤال.
+            - "correct_answer": الإجابة الصحيحة.
+            - "wrong_answer_1": إجابة خاطئة.
+            - "wrong_answer_2": إجابة خاطئة.
+            - "wrong_answer_3": إجابة خاطئة.
+            
+            لا تكتب أي نص أو شرح خارج مصفوفة الـ JSON.
+            """
+        else:
+            prompt = f"""
+            You are an expert curriculum developer tasked with creating new and unique multiple-choice questions.
+
+            Here are the questions that ALREADY EXIST in the database for the concept "{concept}":
+            --- EXISTING QUESTIONS ---
+            {existing_questions_text}
+            --- END EXISTING QUESTIONS ---
+
+            Based ONLY on the following context from the curriculum:
+            --- CONTEXT ---
+            {full_context}
+            --- END CONTEXT ---
+            
+            Please generate {num_questions} NEW and COMPLETELY DIFFERENT multiple-choice questions from the list of existing ones above.
+            Each question must be answerable directly from the provided context. The questions and all answers MUST be in English.
+            
+            Your response MUST be a valid JSON array of objects with these keys:
+            - "question_text"
+            - "correct_answer"
+            - "wrong_answer_1"
+            - "wrong_answer_2"
+            - "wrong_answer_3"
+            
+            Do not include any text outside of the JSON array.
+            """
+
+        # === STEP 4: CALL THE LLM AND VALIDATE ITS JSON REPLY ===
+        required_keys = (
+            "question_text", "correct_answer",
+            "wrong_answer_1", "wrong_answer_2", "wrong_answer_3",
+        )
+        # Bound up front so the error log below can always reference it, even when
+        # the failure happens before the API call has returned any content.
+        response_content = None
+        try:
+            response = self.openai_service.client.chat.completions.create(
+                model=Models.chat,
+                messages=[{"role": "user", "content": prompt}],
+                temperature=0.5, # Slightly higher temp for more creativity
+                response_format={"type": "json_object"}
+            )
+            response_content = response.choices[0].message.content
+            json_response = json.loads(response_content)
+
+            # The reply is requested as a JSON object, so the question array is
+            # expected under some key: take the first list value. A bare top-level
+            # array is accepted too instead of failing on a missing .items().
+            if isinstance(json_response, list):
+                generated_questions = json_response
+            elif isinstance(json_response, dict):
+                generated_questions = next(
+                    (value for value in json_response.values() if isinstance(value, list)),
+                    [],
+                )
+            else:
+                generated_questions = []
+
+            if not generated_questions:
+                raise ValueError("LLM did not return a list of questions in the JSON response.")
+
+            # Reject incomplete questions here so nothing partial reaches the database.
+            for q in generated_questions:
+                if not isinstance(q, dict):
+                    raise ValueError("LLM returned a question that is not a JSON object.")
+                missing_keys = [key for key in required_keys if key not in q]
+                if missing_keys:
+                    raise ValueError(f"LLM question is missing required keys: {missing_keys}")
+
+        except (json.JSONDecodeError, ValueError, KeyError, TypeError) as e:
+            # TypeError covers a reply whose content is None (json.loads(None)).
+            logger.error(f"Failed to parse MCQ response from LLM: {e}\nRaw Response: {response_content}")
+            raise HTTPException(status_code=500, detail="Failed to generate or parse MCQs from AI.") from e
+
+        # === STEP 5: STORE THE VALIDATED QUESTIONS ===
+        mcqs_to_store = [
+            {
+                "grade": grade, "is_arabic": is_arabic, "subject": subject,
+                "unit": unit, "concept": concept, "question_text": q["question_text"],
+                "correct_answer": q["correct_answer"], "wrong_answer_1": q["wrong_answer_1"],
+                "wrong_answer_2": q["wrong_answer_2"], "wrong_answer_3": q["wrong_answer_3"],
+            }
+            for q in generated_questions
+        ]
+
+        self.pgvector.insert_mcqs(mcqs_to_store)
+
+        return mcqs_to_store
\ No newline at end of file
--- a/self_hosted_env/voice_agent/services/pgvector_service.py
+++ b/self_hosted_env/voice_agent/services/pgvector_service.py
@@ -524,3 +524,59 @@ class PGVectorService:

        except Exception as e:
            print(f"❌ Database verification failed: {e}")
+
+    def insert_mcqs(self, mcq_list: List[Dict]):
+        """
+        Bulk-insert MCQ dicts into the mcq_questions table.
+
+        Every dict must provide the keys grade, is_arabic, subject, unit, concept,
+        question_text, correct_answer and wrong_answer_1..3. An empty list is a
+        no-op. The whole batch is sent with one executemany call and committed once.
+        """
+        if not mcq_list:
+            return
+
+        # Order matches the column list of the INSERT statement below.
+        column_order = (
+            'grade', 'is_arabic', 'subject', 'unit', 'concept',
+            'question_text', 'correct_answer', 'wrong_answer_1',
+            'wrong_answer_2', 'wrong_answer_3',
+        )
+
+        with self.pool_handler.get_connection() as conn:
+            with conn.cursor() as cur:
+                insert_query = """
+                    INSERT INTO mcq_questions (
+                        grade, is_arabic, subject, unit, concept, question_text, 
+                        correct_answer, wrong_answer_1, wrong_answer_2, wrong_answer_3
+                    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
+                """
+                # One parameter tuple per question, in column order.
+                rows = [
+                    tuple(mcq[column] for column in column_order)
+                    for mcq in mcq_list
+                ]
+
+                cur.executemany(insert_query, rows)
+                conn.commit()
+                logger.info(f"Successfully inserted {len(mcq_list)} MCQs into the database.")
+
+    def get_mcqs(self, grade: int, subject: str, unit: str, concept: str, is_arabic: bool, limit: Optional[int] = 10) -> List[Dict]:
+        """
+        Retrieves MCQs for a specific topic and language, newest first.
+        If limit is None, it retrieves all matching questions.
+
+        Args:
+            grade, subject, unit, concept, is_arabic: exact-match filters.
+            limit: maximum number of rows to return, or None for no cap.
+
+        Returns:
+            The rows fetched through a RealDictCursor, each exposing id,
+            question_text, correct_answer and wrong_answer_1..3.
+        """
+        with self.pool_handler.get_connection() as conn:
+            with conn.cursor(cursor_factory=RealDictCursor) as cur:
+
+                # Questions stored by one insert_mcqs batch are written in a single
+                # transaction and therefore share the same created_at default, so
+                # id is used as a tiebreaker to keep the order (and which rows a
+                # LIMIT keeps) deterministic.
+                query = """
+                    SELECT id, question_text, correct_answer, wrong_answer_1, wrong_answer_2, wrong_answer_3
+                    FROM mcq_questions
+                    WHERE grade = %s AND subject = %s AND unit = %s AND concept = %s AND is_arabic = %s
+                    ORDER BY created_at DESC, id DESC
+                """
+
+                params = (grade, subject, unit, concept, is_arabic)
+
+                # Append the LIMIT clause only when a cap was requested.
+                if limit is not None:
+                    query += " LIMIT %s"
+                    params += (limit,)
+                query += ";"
+
+                cur.execute(query, params)
+                return cur.fetchall()
\ No newline at end of file
--- a/self_hosted_env/voice_agent/setup_mcq_table.py
+++ b/self_hosted_env/voice_agent/setup_mcq_table.py
+import psycopg2
+import os
+from dotenv import load_dotenv
+
+load_dotenv()
+
+def setup_mcq_table(drop_existing_table: bool = False):
+    """
+    Create the mcq_questions table and its topic index if they are missing.
+
+    Connection settings are read from the POSTGRES_* environment variables.
+    When drop_existing_table is true the table is dropped first. Any error is
+    printed rather than raised, and the connection is closed in every case.
+    """
+    # Bound before the try so the finally clause can test it directly.
+    conn = None
+    try:
+        connection_settings = {
+            "host": os.getenv("POSTGRES_HOST", "localhost"),
+            "port": os.getenv("POSTGRES_PORT", "5432"),
+            "user": os.getenv("POSTGRES_USER"),
+            "password": os.getenv("POSTGRES_PASSWORD"),
+            "dbname": os.getenv("POSTGRES_DB"),
+        }
+        conn = psycopg2.connect(**connection_settings)
+        # Each statement below takes effect immediately; no explicit commit is issued.
+        conn.autocommit = True
+
+        with conn.cursor() as cur:
+            create_table_sql = """
+                CREATE TABLE IF NOT EXISTS mcq_questions (
+                    id SERIAL PRIMARY KEY,
+                    grade INTEGER NOT NULL,
+                    is_arabic BOOLEAN NOT NULL, -- <-- ADDED THIS LINE
+                    subject TEXT NOT NULL,
+                    unit TEXT NOT NULL,
+                    concept TEXT NOT NULL,
+                    question_text TEXT NOT NULL,
+                    correct_answer TEXT NOT NULL,
+                    wrong_answer_1 TEXT NOT NULL,
+                    wrong_answer_2 TEXT NOT NULL,
+                    wrong_answer_3 TEXT NOT NULL,
+                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+                );
+            """
+            create_index_sql = """
+                CREATE INDEX IF NOT EXISTS idx_mcq_topic 
+                ON mcq_questions(grade, is_arabic, subject, unit, concept); -- <-- ADDED is_arabic
+            """
+
+            if drop_existing_table:
+                print("Dropping existing mcq_questions table...")
+                cur.execute("DROP TABLE IF EXISTS mcq_questions CASCADE;")
+                print("Table dropped.")
+
+            print("Creating mcq_questions table...")
+            cur.execute(create_table_sql)
+
+            print("Creating indexes on mcq_questions table...")
+            cur.execute(create_index_sql)
+
+            print("MCQ table setup complete.")
+
+    except Exception as e:
+        print(f"An error occurred during MCQ table setup: {e}")
+    finally:
+        if conn is not None:
+            conn.close()
+            print("Database connection closed.")
+
+if __name__ == "__main__":
+    # Non-destructive run: an existing mcq_questions table and its rows are kept,
+    # and the table is only created when it is missing. Because the DDL uses
+    # CREATE TABLE IF NOT EXISTS, schema changes are NOT applied to a table that
+    # already exists; pass drop_existing_table=True to rebuild it.
+    # Be careful: dropping the table deletes every stored MCQ.
+    print("Creating MCQ table...")
+    setup_mcq_table(drop_existing_table=False)
\ No newline at end of file
--- a/self_hosted_env/voice_agent/start.sh
+++ b/self_hosted_env/voice_agent/start.sh
@@ -10,6 +10,8 @@ python insert_csv_embeddings.py
 echo "Database setup complete."
 python curriculum_structure.py
 echo "Curriculum structure setup complete."
+python setup_mcq_table.py
+echo "MCQ table setup complete."

 sleep 5
 # Start the web server and keep it as the main process

--- a/test_cases/msq_test.py
+++ b/test_cases/msq_test.py
+"""
+======================================================================
+MCQ API Cookbook & Test Script
+======================================================================
+
+Purpose:
+This script serves as both a live integration test and a practical guide ("cookbook") 
+for using the Multiple-Choice Question (MCQ) generation and retrieval API endpoints.
+
+It demonstrates how to:
+  1. Generate and store new MCQs for a specific curriculum topic.
+  2. Retrieve existing MCQs from the database for that same topic.
+
+----------------------------------------------------------------------
+API Endpoints Guide
+----------------------------------------------------------------------
+
+There are two main endpoints for the MCQ feature:
+
+1. Generate Questions (POST /mcq/generate)
+------------------------------------------
+This is the "creator" endpoint. It uses an AI model to generate a new set of questions 
+based on the curriculum content stored in the vector database. It then saves these 
+new questions to the `mcq_questions` table for future use.
+
+- Method: POST
+- URL: [BASE_URL]/mcq/generate
+- Data Format: Must be sent as form data (`application/x-www-form-urlencoded`, or `multipart/form-data` as produced by cURL's `-F` flag in the example below), not as a JSON body.
+
+Parameters (Form Data):
+  - grade (int, required): The grade level of the curriculum (e.g., 4).
+  - subject (str, required): The subject of the curriculum (e.g., "Science").
+  - unit (str, required): The exact name of the unit.
+  - concept (str, required): The exact name of the concept.
+  - is_arabic (bool, optional, default=false): Set to `true` for Arabic curriculum, `false` for English.
+  - count (int, optional, default=5): The number of new questions to generate.
+
+Example Usage (using cURL):
+  curl -X POST [BASE_URL]/mcq/generate \
+  -F "grade=4" \
+  -F "subject=Science" \
+  -F "unit=الوحدة الأولى: الأنظمة الحية" \
+  -F "concept=المفهوم الأول: التكيف والبقاء" \
+  -F "is_arabic=true" \
+  -F "count=3"
+
+
+2. Retrieve Questions (GET /mcq)
+---------------------------------
+This is the "reader" endpoint. It quickly and cheaply retrieves questions that have 
+already been generated and stored in the database. It does NOT call the AI model.
+
+- Method: GET
+- URL: [BASE_URL]/mcq
+
+Parameters (URL Query Parameters):
+  - grade (int, required): The grade level.
+  - subject (str, required): The subject.
+  - unit (str, required): The unit name.
+  - concept (str, required): The concept name.
+  - is_arabic (bool, required): `true` for Arabic, `false` for English.
+  - limit (int, optional, default=None): The maximum number of questions to retrieve. 
+                                         If omitted, it will retrieve ALL questions for that topic.
+
+Example Usage (using cURL):
+  # Get the 5 most recent questions for a topic
+  curl "[BASE_URL]/mcq?grade=4&subject=Science&unit=...&concept=...&is_arabic=true&limit=5"
+
+  # Get ALL questions for a topic
+  curl "[BASE_URL]/mcq?grade=4&subject=Science&unit=...&concept=...&is_arabic=true"
+
+
+----------------------------------------------------------------------
+How to Run This Script
+----------------------------------------------------------------------
+1. Ensure your FastAPI server is running.
+2. Make sure the BASE_URL variable below is set to your server's address.
+3. Run the script from your terminal: python3 msq_test.py
+"""
+
+import requests
+import json
+import time
+from typing import Optional
+
+# The base URL of your API server.
+BASE_URL = "https://voice-agent.caprover.al-arcade.com"
+
+def test_mcq_generation(grade: int, subject: str, unit: str, concept: str, is_arabic: bool, count: int):
+    """
+    Call POST /mcq/generate with the given topic and print the outcome.
+
+    Returns True when the API answers with status 200, and False for any other
+    status code or when the request itself fails.
+    """
+    endpoint = f"{BASE_URL}/mcq/generate"
+
+    form_fields = dict(
+        grade=grade,
+        subject=subject,
+        unit=unit,
+        concept=concept,
+        is_arabic=is_arabic,
+        count=count,
+    )
+    language = 'Arabic' if is_arabic else 'English'
+
+    print(f">> Attempting to GENERATE {count} new questions for:")
+    print(f"   Topic: Grade {grade} {subject} -> {unit} -> {concept}")
+    print(f"   Language: {language}")
+
+    try:
+        # Generation calls the AI model, hence the long timeout.
+        response = requests.post(endpoint, data=form_fields, timeout=120)
+
+        # Failure path first: report the error detail and stop.
+        if response.status_code != 200:
+            print(f"FAILED: API returned status code {response.status_code}")
+            try:
+                error_data = response.json()
+                print(f"   Error Detail: {error_data.get('detail', 'No detail provided.')}")
+            except json.JSONDecodeError:
+                print(f"   Response was not valid JSON: {response.text}")
+            return False
+
+        print(f"SUCCESS: API returned status code {response.status_code}")
+        data = response.json()
+        print(f"   Message: {data.get('message')}")
+
+        questions = data.get('questions')
+        if questions:
+            print("\n   --- Details of Generated Questions ---")
+            for number, question in enumerate(questions, 1):
+                print(f"   {number}. Question: {question['question_text']}")
+                print(f"      Correct:   {question['correct_answer']}")
+                print(f"      Wrong 1:   {question['wrong_answer_1']}")
+                print(f"      Wrong 2:   {question['wrong_answer_2']}")
+                print(f"      Wrong 3:   {question['wrong_answer_3']}\n")
+        return True
+
+    except requests.exceptions.RequestException as e:
+        print(f"FAILED: An error occurred while making the request: {e}")
+        return False
+
+
+def test_mcq_retrieval(grade: int, subject: str, unit: str, concept: str, is_arabic: bool, limit: Optional[int]):
+    """
+    Call GET /mcq for the given topic and print the stored questions.
+
+    The limit query parameter is only sent when it is not None. Returns True
+    when the API answers with status 200, and False for any other status code
+    or when the request itself fails.
+    """
+    endpoint = f"{BASE_URL}/mcq"
+
+    query_params = dict(
+        grade=grade,
+        subject=subject,
+        unit=unit,
+        concept=concept,
+        is_arabic=is_arabic,
+    )
+    if limit is None:
+        limit_str = "ALL"
+    else:
+        query_params["limit"] = limit
+        limit_str = f"up to {limit}"
+
+    print(f">> Attempting to RETRIEVE {limit_str} stored questions for the same topic...")
+
+    try:
+        response = requests.get(endpoint, params=query_params, timeout=30)
+
+        # Failure path first: report the error detail and stop.
+        if response.status_code != 200:
+            print(f"FAILED: API returned status code {response.status_code}")
+            try:
+                error_data = response.json()
+                print(f"   Error Detail: {error_data.get('detail', 'No detail provided.')}")
+            except json.JSONDecodeError:
+                print(f"   Response was not valid JSON: {response.text}")
+            return False
+
+        print(f"SUCCESS: API returned status code {response.status_code}")
+        data = response.json()
+        print(f"   Found {data.get('count')} stored questions in the database.")
+
+        questions = data.get('questions')
+        if questions:
+            print("\n   --- Details of Retrieved Questions ---")
+            for number, question in enumerate(questions, 1):
+                print(f"   {number}. Question: {question['question_text']}")
+                print(f"      Correct:   {question['correct_answer']}")
+                print(f"      Wrong 1:   {question['wrong_answer_1']}")
+                print(f"      Wrong 2:   {question['wrong_answer_2']}")
+                print(f"      Wrong 3:   {question['wrong_answer_3']}\n")
+        elif data.get('count') == 0:
+            print("   (This is expected if this is the first time generating questions for this topic)")
+        return True
+
+    except requests.exceptions.RequestException as e:
+        print(f"FAILED: An error occurred while making the request: {e}")
+        return False
+
+
+if __name__ == "__main__":
+    banner = "="*50
+
+    # IMPORTANT: Use actual Unit/Concept names from your database for the best results.
+    # Each scenario is (banner title, arguments for test_mcq_generation).
+    scenarios = [
+        (
+            "STARTING TEST 1: ARABIC MCQ GENERATION & RETRIEVAL",
+            {
+                "grade": 4,
+                "subject": "Science",
+                "unit": "الوحدة الأولى: الأنظمة الحية",
+                "concept": "المفهوم الأول: التكيف والبقاء",
+                "is_arabic": True,
+                "count": 3
+            },
+        ),
+        (
+            "STARTING TEST 2: ENGLISH MCQ GENERATION & RETRIEVAL",
+            {
+                "grade": 5,
+                "subject": "Science",
+                "unit": "Unit 1: Matter and Energy in Ecosystems",
+                "concept": "Concept 1.1: Properties of Matter",
+                "is_arabic": False,
+                "count": 2
+            },
+        ),
+    ]
+
+    for title, test_data in scenarios:
+        print("\n" + banner)
+        print(title)
+        print(banner)
+
+        # Retrieval is only attempted after a successful generation.
+        if test_mcq_generation(**test_data):
+            print("-" * 25)
+            time.sleep(2)
+            # The retrieval endpoint takes no 'count'; drop it and fetch everything.
+            retrieval_args = {k: v for k, v in test_data.items() if k != 'count'}
+            test_mcq_retrieval(limit=None, **retrieval_args)
+
+    print("\n" + banner)
+    print("All tests complete.")
+    print(banner)
\ No newline at end of file