mcq AI builder

parent 8d42c50c
import os
import shutil
from fastapi import FastAPI, UploadFile, File, Form, HTTPException, Request, BackgroundTasks, logger
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse, Response
from fastapi.staticfiles import StaticFiles
@@ -12,6 +12,7 @@ from pathlib import Path
import tempfile
import json
import pandas as pd
import logging
from curriculum_structure import convert_json_to_db_format
from process_pdf_pipline import run_full_pipeline
@@ -96,6 +97,7 @@ async def lifespan(app: FastAPI):
def create_app() -> FastAPI:
# Connect the lifespan manager to your FastAPI app instance
app = FastAPI(title="Unified Chat API with Local Agent", lifespan=lifespan)
logger = logging.getLogger("uvicorn.error")
# Fixed CORS configuration for CapRover
app.add_middleware(
@@ -336,6 +338,74 @@ def create_app() -> FastAPI:
return {"status": "processing_started", "message": "The curriculum is being processed in the background."} return {"status": "processing_started", "message": "The curriculum is being processed in the background."}
@app.post("/mcq/generate")
async def generate_mcqs_handler(
request: Request,
grade: int = Form(...),
subject: str = Form(...),
unit: str = Form(...),
concept: str = Form(...),
count: int = Form(5),
is_arabic: bool = Form(False)
):
"""
Generates and stores a new set of MCQs for a specific topic.
"""
container = request.app.state.container
try:
generated_questions = container.agent_service.generate_and_store_mcqs(
grade=grade,
subject=subject,
unit=unit,
concept=concept,
num_questions=count,
is_arabic=is_arabic
)
return {
"status": "success",
"message": f"Successfully generated and stored {len(generated_questions)} MCQs.",
"questions": generated_questions
}
except HTTPException as e:
raise e # Re-raise FastAPI specific exceptions
except Exception as e:
logger.error(f"Error in generate_mcqs_handler: {e}")
raise HTTPException(status_code=500, detail=str(e))
@app.get("/mcq")
async def get_mcqs_handler(
request: Request,
grade: int,
subject: str,
unit: str,
concept: str,
is_arabic: bool,
# Make limit optional. If not provided, it will be None.
limit: Optional[int] = None
):
"""
Retrieves existing MCQs for a specific topic and language from the database.
If no limit is provided, retrieves all questions.
"""
container = request.app.state.container
try:
questions = container.agent_service.pgvector.get_mcqs(
grade=grade,
subject=subject,
unit=unit,
concept=concept,
is_arabic=is_arabic,
limit=limit # Pass the limit (which could be None)
)
return {
"status": "success",
"count": len(questions),
"questions": questions
}
except Exception as e:
logger.error(f"Error in get_mcqs_handler: {e}")
raise HTTPException(status_code=500, detail=str(e))
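# For quick reference, a minimal sketch of how a client might call the two endpoints above
# (BASE_URL is a placeholder; the msq_test.py cookbook added later in this commit exercises
# them in full):
#
#   import requests
#   requests.post(f"{BASE_URL}/mcq/generate", data={
#       "grade": 4, "subject": "Science", "unit": "...", "concept": "...",
#       "is_arabic": True, "count": 3})   # form-encoded, as the Form(...) parameters expect
#   requests.get(f"{BASE_URL}/mcq", params={
#       "grade": 4, "subject": "Science", "unit": "...", "concept": "...",
#       "is_arabic": True, "limit": 5})   # query parameters; omit "limit" to fetch all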
@app.options("/get-audio-response") @app.options("/get-audio-response")
async def audio_response_options(): async def audio_response_options():
"""Handle preflight CORS requests for audio response endpoint""" """Handle preflight CORS requests for audio response endpoint"""
...
@@ -19,7 +19,7 @@ import json
import numpy as np
from openai import OpenAI
from typing import List, Dict, Union, Any, Optional, Tuple
from pydantic import BaseModel, Field, ValidationError
import google.generativeai as genai
from google.generativeai import types
import csv
@@ -27,7 +27,7 @@ import logging
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# change level=logging.DEBUG for more verbose output during debugging
# =========================
# 1. Initialization and Setup
@@ -432,9 +432,6 @@ Output must be ONLY a valid JSON object (no markdown, no extra text) conforming
)
initial_json_string = resp_initial.text.strip()
initial_data = json.loads(initial_json_string)
book_structure = BookStructure(**initial_data)
logging.info("✅ Gemini initial structure extraction successful.")
@@ -707,23 +704,54 @@ class EmbeddingProcessor:
logging.error(f"❌ Error generating embedding for text: '{text[:100]}...' - {str(e)}", exc_info=True) logging.error(f"❌ Error generating embedding for text: '{text[:100]}...' - {str(e)}", exc_info=True)
return [0.0] * Config.OPENAI_EMBEDDING_DIMENSION return [0.0] * Config.OPENAI_EMBEDDING_DIMENSION
    def find_context_for_page(self, page_num: int, structured_data: Dict[str, Any]) -> Optional[Dict[str, str]]:
        """
        Robust hierarchical lookup. It first tries to find a specific Lesson,
        then falls back to finding a Concept, and finally a Unit.
        """
        # --- Level 1: Try to find an exact LESSON match ---
        for unit_data in structured_data.get("units", []):
            unit_name = unit_data.get("unit_name", "Unknown Unit")
            for concept_data in unit_data.get("concepts", []):
                concept_name = concept_data.get("concept_name", "Unknown Concept")
                for lesson_data in concept_data.get("lessons", []):
                    pages = lesson_data.get("pages")
                    if pages and pages.get('start_page', 0) <= page_num <= pages.get('end_page', 0):
                        return {
                            'Unit': unit_name,
                            'Concept': concept_name,
                            'Lesson': lesson_data.get("lesson_name", "Unknown Lesson")
                        }

        # --- Level 2: If no lesson matched, try a CONCEPT match ---
        # This handles the common case where concept page ranges are good but lesson ranges are bad.
        for unit_data in structured_data.get("units", []):
            unit_name = unit_data.get("unit_name", "Unknown Unit")
            for concept_data in unit_data.get("concepts", []):
                pages = concept_data.get("pages")
                # Check whether the page falls within the CONCEPT's overall range
                if pages and pages.get('start_page', 0) <= page_num <= pages.get('end_page', 0):
                    return {
                        'Unit': unit_name,
                        'Concept': concept_data.get("concept_name", "Unknown Concept"),
                        # Labelled as general content for this concept since it did not match a specific lesson
                        'Lesson': f"General content for {concept_data.get('concept_name', 'Concept')}"
                    }

        # --- Level 3: If still nothing, try a UNIT match ---
        for unit_data in structured_data.get("units", []):
            pages = unit_data.get("pages")
            if pages and pages.get('start_page', 0) <= page_num <= pages.get('end_page', 0):
                return {
                    'Unit': unit_data.get("unit_name", "Unknown Unit"),
                    'Concept': "General Unit Content",
                    'Lesson': f"General content for {unit_data.get('unit_name', 'Unit')}"
                }

        return None  # Page completely outside any known structure
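    # Illustrative only (made-up unit/concept/lesson names): given a structure shaped like
    # the lookups above, e.g.
    #   {"units": [{"unit_name": "Unit 1", "pages": {"start_page": 1, "end_page": 40},
    #               "concepts": [{"concept_name": "Concept 1.1",
    #                             "pages": {"start_page": 1, "end_page": 20},
    #                             "lessons": [{"lesson_name": "Lesson 1",
    #                                          "pages": {"start_page": 1, "end_page": 8}}]}]}]},
    # find_context_for_page(5, structure) matches at Level 1 and returns
    #   {'Unit': 'Unit 1', 'Concept': 'Concept 1.1', 'Lesson': 'Lesson 1'},
    # while page 15 (inside the concept range but outside every lesson range) falls through
    # to Level 2 and is labelled 'General content for Concept 1.1'.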
    def process_all_pages_for_embeddings_gapless(
        self,
        structured_data: Dict[str, Any],
        page_texts: Dict[int, str],
@@ -733,75 +761,70 @@ class EmbeddingProcessor:
        output_csv_path: str
    ):
        """
        Processes EVERY page from the OCR output to guarantee 100% content coverage.
        It uses the Gemini structure as a lookup to label pages, and applies the last
        known label to any pages that fall in gaps within the structure.
        """
        if not page_texts:
            logging.warning("❌ No page texts available to process for embeddings.")
            return

        logging.info(f"🚀 Starting GAPLESS embedding generation for Grade {grade}, Subject {subject}.")
        output_rows = []
        is_arabic = (lang == "arabic")

        # Initialize a fallback context for pages that are not in the structure (gaps)
        last_known_context = {
            'Unit': 'Uncategorized',
            'Concept': 'Uncategorized',
            'Lesson': 'Uncategorized'
        }

        # The main loop iterates through ALL pages, guaranteeing no gaps.
        for page_num in tqdm(sorted(page_texts.keys()), desc="Embedding All Pages"):
            page_text = page_texts[page_num]
            if not page_text.strip():
                continue  # Skip empty pages

            # Find the context for the current page from Gemini's structure
            context = self.find_context_for_page(page_num, structured_data)

            if context:
                # If found, this is our new "last known" good context
                last_known_context = context
                current_context = context
            else:
                # If not found (page is in a gap), use the last valid context we saw.
                logging.warning(f" -> Page {page_num} not in structure. Applying last known context: '{last_known_context['Lesson']}'")
                current_context = last_known_context

            # Chunk the text of the single page
            for chunk_idx, chunk_text in enumerate(self.chunk_text(page_text, is_arabic)):
                if not chunk_text:
                    continue

                output_rows.append({
                    'Grade': grade,
                    'Subject': subject,
                    'Unit': current_context['Unit'],
                    'Concept': current_context['Concept'],
                    'Lesson': current_context['Lesson'],
                    'From page': page_num,  # Metadata is now per-page
                    'To page': page_num,
                    'Chunk index': chunk_idx,
                    'Chunk text': chunk_text,
                    'Is Arabic': is_arabic,
                    'Embedding': json.dumps(self.get_embedding(chunk_text))
                })

        df = pd.DataFrame(output_rows)
        df.to_csv(output_csv_path, index=False, quoting=csv.QUOTE_MINIMAL, encoding="utf-8-sig")
        logging.info(f"✅ Gapless embeddings saved to: {output_csv_path} ({len(output_rows)} chunks generated from {len(page_texts)} pages)")
# =========================
# 8. Main Pipeline Function
# =========================
def repair_and_enrich_structure(gemini_output: Dict, lang: str, grade: str, total_pages: int) -> Dict[str, Any]:
logging.warning("🔧 Sanitizing and repairing Gemini's output...")
@@ -831,16 +854,15 @@ def repair_and_enrich_structure(gemini_output: Dict, lang: str, grade: str, tota
return gemini_output
def run_full_pipeline(pdf_path: str, grade: int, subject: str, output_json_path: str, output_embeddings_csv_path: str, remove_lessons: bool = False):
logging.info(f"\n--- Starting Pipeline for {pdf_path} (Grade: {grade}, Subject: {subject}) ---")
gemini_raw_output = {}
try:
page_texts, lang, _, tracked_titles = process_pdf_to_text(pdf_path)
if not page_texts:
logging.critical("❌ CRITICAL: No text could be extracted from the PDF. Aborting.")
return
pdf_total_pages = max(page_texts.keys())
toc_contents = extract_toc_pages_from_first_n(page_texts, lang)
@@ -871,7 +893,9 @@ def run_full_pipeline(pdf_path: str, grade: int, subject: str, output_json_path:
if openai_client:
embedding_processor = EmbeddingProcessor(client=openai_client)
# --- THIS IS THE FINAL, ROBUST CALL ---
embedding_processor.process_all_pages_for_embeddings_gapless(
gemini_structured_data,
page_texts,
lang,
@@ -879,7 +903,8 @@ def run_full_pipeline(pdf_path: str, grade: int, subject: str, output_json_path:
subject,
output_embeddings_csv_path
)
except Exception as e:
logging.critical(f"Pipeline error: {e}", exc_info=True)
logging.info(f"\n--- Pipeline finished for {pdf_path} ---")
@@ -3,6 +3,7 @@ import os
from typing import List, Dict, Optional
from fastapi import HTTPException
import sys
import json
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from core import StudentNationality, Models
@@ -94,3 +95,132 @@ class AgentService:
except Exception as e:
logger.error(f"Error closing connection pools: {e}")
def generate_and_store_mcqs(
self, grade: int, subject: str, unit: str, concept: str, is_arabic: bool, num_questions: int = 5
) -> List[Dict]:
"""
Generates NEW, UNIQUE MCQs for a topic by first retrieving existing ones
and instructing the AI to avoid generating duplicates.
"""
if not self.pgvector:
raise HTTPException(status_code=503, detail="Vector service is not available for context retrieval.")
# === STEP 1: RETRIEVE EXISTING QUESTIONS ===
logger.info(f"Checking for existing questions for: {grade}/{subject}/{unit}/{concept}")
existing_questions = self.pgvector.get_mcqs(
grade, subject, unit, concept, is_arabic, limit=None # Fetch ALL existing questions
)
existing_questions_text = "No existing questions found."
if existing_questions:
# Format the existing questions into a simple list for the prompt
q_list = [f"- {q['question_text']}" for q in existing_questions]
existing_questions_text = "\n".join(q_list)
logger.info(f"Found {len(existing_questions)} existing questions. Will instruct AI to generate different ones.")
# === STEP 2: RETRIEVE CURRICULUM CONTEXT ===
search_query = f"summary of {concept} in {unit}"
query_embedding = self.openai_service.generate_embedding(search_query)
context_chunks = self.pgvector.search_filtered_nearest(
query_embedding, grade, subject, is_arabic, limit=10
)
if not context_chunks:
raise HTTPException(status_code=404, detail="No curriculum context found for this topic in the specified language.")
full_context = "\n---\n".join([chunk['chunk_text'] for chunk in context_chunks])
# === STEP 3: CREATE THE ADVANCED, AWARE PROMPT ===
if is_arabic:
prompt = f"""
أنت خبير في تطوير المناهج ومهمتك إنشاء أسئلة اختيار من متعدد جديدة ومختلفة.
هذه هي الأسئلة الموجودة حاليًا في قاعدة البيانات حول المفهوم "{concept}":
--- الأسئلة الحالية ---
{existing_questions_text}
--- نهاية الأسئلة الحالية ---
اعتمادًا فقط على السياق التالي من المنهج:
--- السياق ---
{full_context}
--- نهاية السياق ---
يرجى توليد {num_questions} من أسئلة الاختيار من متعدد **الجديدة والمختلفة تمامًا** عن الأسئلة الموجودة أعلاه.
يجب أن تكون كل الأسئلة قابلة للإجابة مباشرة من السياق المقدم.
يجب أن يكون ردك مصفوفة JSON صحيحة. كل كائن يجب أن يحتوي على المفاتيح التالية:
- "question_text": نص السؤال.
- "correct_answer": الإجابة الصحيحة.
- "wrong_answer_1": إجابة خاطئة.
- "wrong_answer_2": إجابة خاطئة.
- "wrong_answer_3": إجابة خاطئة.
لا تكتب أي نص أو شرح خارج مصفوفة الـ JSON.
"""
else:
prompt = f"""
You are an expert curriculum developer tasked with creating new and unique multiple-choice questions.
Here are the questions that ALREADY EXIST in the database for the concept "{concept}":
--- EXISTING QUESTIONS ---
{existing_questions_text}
--- END EXISTING QUESTIONS ---
Based ONLY on the following context from the curriculum:
--- CONTEXT ---
{full_context}
--- END CONTEXT ---
Please generate {num_questions} NEW and COMPLETELY DIFFERENT multiple-choice questions from the list of existing ones above.
Each question must be answerable directly from the provided context. The questions and all answers MUST be in English.
Your response MUST be a valid JSON array of objects with these keys:
- "question_text"
- "correct_answer"
- "wrong_answer_1"
- "wrong_answer_2"
- "wrong_answer_3"
Do not include any text outside of the JSON array.
"""
# === STEP 4 & 5: CALL LLM, PARSE, and STORE (No changes here) ===
try:
# ... (The entire try/except block for calling the LLM remains exactly the same)
response = self.openai_service.client.chat.completions.create(
model=Models.chat,
messages=[{"role": "user", "content": prompt}],
temperature=0.5, # Slightly higher temp for more creativity
response_format={"type": "json_object"}
)
response_content = response.choices[0].message.content
json_response = json.loads(response_content)
generated_questions = []
for key, value in json_response.items():
if isinstance(value, list):
generated_questions = value
break
if not generated_questions:
raise ValueError("LLM did not return a list of questions in the JSON response.")
except (json.JSONDecodeError, ValueError, KeyError) as e:
logger.error(f"Failed to parse MCQ response from LLM: {e}\nRaw Response: {response_content}")
raise HTTPException(status_code=500, detail="Failed to generate or parse MCQs from AI.")
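# For reference, the parsing loop above accepts any JSON object that wraps the generated
# questions in a list under a single key (the key name itself is not checked), e.g.
# (illustrative values only):
#   {"questions": [{"question_text": "...", "correct_answer": "...",
#                   "wrong_answer_1": "...", "wrong_answer_2": "...", "wrong_answer_3": "..."}]}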
mcqs_to_store = []
for q in generated_questions:
mcqs_to_store.append({
"grade": grade, "is_arabic": is_arabic, "subject": subject,
"unit": unit, "concept": concept, "question_text": q["question_text"],
"correct_answer": q["correct_answer"], "wrong_answer_1": q["wrong_answer_1"],
"wrong_answer_2": q["wrong_answer_2"], "wrong_answer_3": q["wrong_answer_3"],
})
self.pgvector.insert_mcqs(mcqs_to_store)
return mcqs_to_store
@@ -523,4 +523,60 @@ class PGVectorService:
print("="*50) print("="*50)
except Exception as e: except Exception as e:
print(f"❌ Database verification failed: {e}") print(f"❌ Database verification failed: {e}")
def insert_mcqs(self, mcq_list: List[Dict]):
"""
Inserts a batch of MCQs, now including the language flag.
"""
if not mcq_list:
return
with self.pool_handler.get_connection() as conn:
with conn.cursor() as cur:
# --- UPDATED QUERY ---
insert_query = """
INSERT INTO mcq_questions (
grade, is_arabic, subject, unit, concept, question_text,
correct_answer, wrong_answer_1, wrong_answer_2, wrong_answer_3
) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
"""
# --- UPDATED DATA PREPARATION ---
data_to_insert = [
(
q['grade'], q['is_arabic'], q['subject'], q['unit'], q['concept'],
q['question_text'], q['correct_answer'], q['wrong_answer_1'],
q['wrong_answer_2'], q['wrong_answer_3']
) for q in mcq_list
]
cur.executemany(insert_query, data_to_insert)
conn.commit()
logger.info(f"Successfully inserted {len(mcq_list)} MCQs into the database.")
def get_mcqs(self, grade: int, subject: str, unit: str, concept: str, is_arabic: bool, limit: Optional[int] = 10) -> List[Dict]:
"""
Retrieves MCQs for a specific topic and language.
If limit is None, it retrieves all matching questions.
"""
with self.pool_handler.get_connection() as conn:
with conn.cursor(cursor_factory=RealDictCursor) as cur:
# Dynamically build the query based on the limit
query = """
SELECT id, question_text, correct_answer, wrong_answer_1, wrong_answer_2, wrong_answer_3
FROM mcq_questions
WHERE grade = %s AND subject = %s AND unit = %s AND concept = %s AND is_arabic = %s
ORDER BY created_at DESC
"""
params = (grade, subject, unit, concept, is_arabic)
if limit is not None:
query += " LIMIT %s;"
params += (limit,)
else:
query += ";"
cur.execute(query, params)
return cur.fetchall()
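# Note: get_mcqs assumes RealDictCursor (and a module-level logger) are already available
# in this file; if not, the cursor factory would come from psycopg2:
#   from psycopg2.extras import RealDictCursor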
import psycopg2
import os
from dotenv import load_dotenv
load_dotenv()
def setup_mcq_table(drop_existing_table: bool = False):
"""
Sets up the mcq_questions table in the database.
"""
try:
conn = psycopg2.connect(
host=os.getenv("POSTGRES_HOST", "localhost"),
port=os.getenv("POSTGRES_PORT", "5432"),
user=os.getenv("POSTGRES_USER"),
password=os.getenv("POSTGRES_PASSWORD"),
dbname=os.getenv("POSTGRES_DB")
)
conn.autocommit = True
with conn.cursor() as cur:
if drop_existing_table:
print("Dropping existing mcq_questions table...")
cur.execute("DROP TABLE IF EXISTS mcq_questions CASCADE;")
print("Table dropped.")
print("Creating mcq_questions table...")
# --- THIS IS THE UPDATED TABLE SCHEMA ---
cur.execute("""
CREATE TABLE IF NOT EXISTS mcq_questions (
id SERIAL PRIMARY KEY,
grade INTEGER NOT NULL,
is_arabic BOOLEAN NOT NULL, -- <-- ADDED THIS LINE
subject TEXT NOT NULL,
unit TEXT NOT NULL,
concept TEXT NOT NULL,
question_text TEXT NOT NULL,
correct_answer TEXT NOT NULL,
wrong_answer_1 TEXT NOT NULL,
wrong_answer_2 TEXT NOT NULL,
wrong_answer_3 TEXT NOT NULL,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
""")
print("Creating indexes on mcq_questions table...")
# --- THIS IS THE UPDATED INDEX ---
cur.execute("""
CREATE INDEX IF NOT EXISTS idx_mcq_topic
ON mcq_questions(grade, is_arabic, subject, unit, concept); -- <-- ADDED is_arabic
""")
print("MCQ table setup complete.")
except Exception as e:
print(f"An error occurred during MCQ table setup: {e}")
finally:
if 'conn' in locals() and conn:
conn.close()
print("Database connection closed.")
if __name__ == "__main__":
# To apply the changes, it's best to drop and recreate the table.
# Be careful if you have existing data you want to keep!
print("Creating MCQ table...")
setup_mcq_table(drop_existing_table=False)
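# Illustrative only: to rebuild the table from scratch (this deletes any stored questions),
# call setup_mcq_table(drop_existing_table=True) instead.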
@@ -10,6 +10,8 @@ python insert_csv_embeddings.py
echo "Database setup complete." echo "Database setup complete."
python curriculum_structure.py python curriculum_structure.py
echo "Curriculum structure setup complete." echo "Curriculum structure setup complete."
python setup_mcq_table.py
echo "MCQ table setup complete."
sleep 5
# Start the web server and keep it as the main process
...
"""
======================================================================
MCQ API Cookbook & Test Script
======================================================================
Purpose:
This script serves as both a live integration test and a practical guide ("cookbook")
for using the Multiple-Choice Question (MCQ) generation and retrieval API endpoints.
It demonstrates how to:
1. Generate and store new MCQs for a specific curriculum topic.
2. Retrieve existing MCQs from the database for that same topic.
----------------------------------------------------------------------
API Endpoints Guide
----------------------------------------------------------------------
There are two main endpoints for the MCQ feature:
1. Generate Questions (POST /mcq/generate)
------------------------------------------
This is the "creator" endpoint. It uses an AI model to generate a new set of questions
based on the curriculum content stored in the vector database. It then saves these
new questions to the `mcq_questions` table for future use.
- Method: POST
- URL: [BASE_URL]/mcq/generate
- Data Format: Must be sent as `application/x-www-form-urlencoded` (form data).
Parameters (Form Data):
- grade (int, required): The grade level of the curriculum (e.g., 4).
- subject (str, required): The subject of the curriculum (e.g., "Science").
- unit (str, required): The exact name of the unit.
- concept (str, required): The exact name of the concept.
- is_arabic (bool, required): Set to `true` for Arabic curriculum, `false` for English.
- count (int, optional, default=5): The number of new questions to generate.
Example Usage (using cURL):
curl -X POST [BASE_URL]/mcq/generate \
-F "grade=4" \
-F "subject=Science" \
-F "unit=الوحدة الأولى: الأنظمة الحية" \
-F "concept=المفهوم الأول: التكيف والبقاء" \
-F "is_arabic=true" \
-F "count=3"
2. Retrieve Questions (GET /mcq)
---------------------------------
This is the "reader" endpoint. It quickly and cheaply retrieves questions that have
already been generated and stored in the database. It does NOT call the AI model.
- Method: GET
- URL: [BASE_URL]/mcq
Parameters (URL Query Parameters):
- grade (int, required): The grade level.
- subject (str, required): The subject.
- unit (str, required): The unit name.
- concept (str, required): The concept name.
- is_arabic (bool, required): `true` for Arabic, `false` for English.
- limit (int, optional, default=None): The maximum number of questions to retrieve.
If omitted, it will retrieve ALL questions for that topic.
Example Usage (using cURL):
# Get the 5 most recent questions for a topic
curl "[BASE_URL]/mcq?grade=4&subject=Science&unit=...&concept=...&is_arabic=true&limit=5"
# Get ALL questions for a topic
curl "[BASE_URL]/mcq?grade=4&subject=Science&unit=...&concept=...&is_arabic=true"
----------------------------------------------------------------------
How to Run This Script
----------------------------------------------------------------------
1. Ensure your FastAPI server is running.
2. Make sure the BASE_URL variable below is set to your server's address.
3. Run the script from your terminal: python3 msq_test.py
"""
import requests
import json
import time
from typing import Optional
# The base URL of your API server.
BASE_URL = "https://voice-agent.caprover.al-arcade.com"
def test_mcq_generation(grade: int, subject: str, unit: str, concept: str, is_arabic: bool, count: int):
"""
Tests the POST /mcq/generate endpoint.
"""
endpoint = f"{BASE_URL}/mcq/generate"
payload = {
"grade": grade,
"subject": subject,
"unit": unit,
"concept": concept,
"is_arabic": is_arabic,
"count": count,
}
print(f">> Attempting to GENERATE {count} new questions for:")
print(f" Topic: Grade {grade} {subject} -> {unit} -> {concept}")
print(f" Language: {'Arabic' if is_arabic else 'English'}")
try:
response = requests.post(endpoint, data=payload, timeout=120)
if response.status_code == 200:
print(f"SUCCESS: API returned status code {response.status_code}")
data = response.json()
print(f" Message: {data.get('message')}")
if 'questions' in data and data['questions']:
print("\n --- Details of Generated Questions ---")
for i, q in enumerate(data['questions'], 1):
print(f" {i}. Question: {q['question_text']}")
print(f" Correct: {q['correct_answer']}")
print(f" Wrong 1: {q['wrong_answer_1']}")
print(f" Wrong 2: {q['wrong_answer_2']}")
print(f" Wrong 3: {q['wrong_answer_3']}\n")
return True
else:
print(f"FAILED: API returned status code {response.status_code}")
try:
error_data = response.json()
print(f" Error Detail: {error_data.get('detail', 'No detail provided.')}")
except json.JSONDecodeError:
print(f" Response was not valid JSON: {response.text}")
return False
except requests.exceptions.RequestException as e:
print(f"FAILED: An error occurred while making the request: {e}")
return False
def test_mcq_retrieval(grade: int, subject: str, unit: str, concept: str, is_arabic: bool, limit: Optional[int]):
"""
Tests the GET /mcq endpoint with detailed output.
"""
endpoint = f"{BASE_URL}/mcq"
params = {
"grade": grade,
"subject": subject,
"unit": unit,
"concept": concept,
"is_arabic": is_arabic,
}
if limit is not None:
params["limit"] = limit
limit_str = f"up to {limit}" if limit is not None else "ALL"
print(f">> Attempting to RETRIEVE {limit_str} stored questions for the same topic...")
try:
response = requests.get(endpoint, params=params, timeout=30)
if response.status_code == 200:
print(f"SUCCESS: API returned status code {response.status_code}")
data = response.json()
print(f" Found {data.get('count')} stored questions in the database.")
if 'questions' in data and data['questions']:
print("\n --- Details of Retrieved Questions ---")
for i, q in enumerate(data['questions'], 1):
print(f" {i}. Question: {q['question_text']}")
print(f" Correct: {q['correct_answer']}")
print(f" Wrong 1: {q['wrong_answer_1']}")
print(f" Wrong 2: {q['wrong_answer_2']}")
print(f" Wrong 3: {q['wrong_answer_3']}\n")
elif data.get('count') == 0:
print(" (This is expected if this is the first time generating questions for this topic)")
return True
else:
print(f"FAILED: API returned status code {response.status_code}")
try:
error_data = response.json()
print(f" Error Detail: {error_data.get('detail', 'No detail provided.')}")
except json.JSONDecodeError:
print(f" Response was not valid JSON: {response.text}")
return False
except requests.exceptions.RequestException as e:
print(f"FAILED: An error occurred while making the request: {e}")
return False
if __name__ == "__main__":
print("\n" + "="*50)
print("STARTING TEST 1: ARABIC MCQ GENERATION & RETRIEVAL")
print("="*50)
# IMPORTANT: Use actual Unit/Concept names from your database for the best results.
arabic_test_data = {
"grade": 4,
"subject": "Science",
"unit": "الوحدة الأولى: الأنظمة الحية",
"concept": "المفهوم الأول: التكيف والبقاء",
"is_arabic": True,
"count": 3
}
generation_successful = test_mcq_generation(**arabic_test_data)
if generation_successful:
print("-" * 25)
time.sleep(2)
test_mcq_retrieval(limit=None, **{k:v for k,v in arabic_test_data.items() if k != 'count'})
print("\n" + "="*50)
print("STARTING TEST 2: ENGLISH MCQ GENERATION & RETRIEVAL")
print("="*50)
english_test_data = {
"grade": 5,
"subject": "Science",
"unit": "Unit 1: Matter and Energy in Ecosystems",
"concept": "Concept 1.1: Properties of Matter",
"is_arabic": False,
"count": 2
}
generation_successful = test_mcq_generation(**english_test_data)
if generation_successful:
print("-" * 25)
time.sleep(2)
test_mcq_retrieval(limit=None, **{k:v for k,v in english_test_data.items() if k != 'count'})
print("\n" + "="*50)
print("All tests complete.")
print("="*50)