Commit 67972c15 authored by salma's avatar salma

faster and cheaper mcq generation

parent ff35d000
......@@ -49,54 +49,100 @@ class MCQService:
is_arabic: bool, num_questions: int = 5
) -> List[Dict]:
"""
Generates NEW, UNIQUE MCQs with balanced difficulty and Bloom's taxonomy levels.
Each returned question includes:
- difficulty_level: 1–10
- blooms_level: One of ["remember", "understand", "apply", "analysis", "evaluate", "create"]
Generates NEW, UNIQUE MCQs using a 'Generate-Filter-Retry' loop with Vector Semantic Deduplication.
This ensures zero redundancy without sending conversation history to the LLM.
"""
if not self.pgvector:
raise HTTPException(status_code=503, detail="Vector service is not available for context retrieval.")
logger.info(f"Checking for existing questions for: {curriculum}/{grade}/{subject}/{unit}/{concept}")
existing_questions = self.pgvector.get_mcqs(
curriculum, grade, subject, unit, concept, is_arabic, limit=None
)
existing_questions_text = "No existing questions found."
if existing_questions:
q_list = [f"- {q['question_text']}" for q in existing_questions]
existing_questions_text = "\n".join(q_list)
# 1. Retrieve a pool of textbook context chunks (fetched once, rotated in loop)
context_chunks = self._retrieve_context_pool(grade, subject, unit, concept, is_arabic)
accepted_mcqs = []
attempts = 0
max_attempts = 3 # Safety brake
# 2. Loop until we have enough UNIQUE questions
while len(accepted_mcqs) < num_questions and attempts < max_attempts:
attempts += 1
questions_needed = num_questions - len(accepted_mcqs)
# Ask for slightly more than needed to buffer against duplicates being filtered out
# (e.g., if we need 1, ask for 2. If we need 5, ask for 7)
ask_count = questions_needed + 2
# A. Context Rotation: Pick random chunks to ensure variety in every loop
# This prevents the LLM from fixating on the same part of the text.
if len(context_chunks) >= 3:
selected_chunks = random.sample(context_chunks, min(len(context_chunks), 4))
else:
selected_chunks = context_chunks
current_context_text = "\n---\n".join([c['chunk_text'] for c in selected_chunks])
# B. Generate Raw Questions from LLM
# (Uses your detailed prompts regarding Bloom's and Difficulty)
raw_candidates = self._generate_candidate_batch(
current_context_text, ask_count, grade, is_arabic
)
# C. Filter & Embed: Checks DB for semantic duplicates
valid_batch = self._filter_and_process_questions(
raw_candidates, accepted_mcqs, curriculum, grade, subject, unit, concept, is_arabic
)
# --- STEP 2: CONTEXT RETRIEVAL ---
accepted_mcqs.extend(valid_batch)
# 3. Store Final Result
if accepted_mcqs:
# Slice to exact number requested (in case we generated extras)
final_set = accepted_mcqs[:num_questions]
self.pgvector.insert_mcqs(final_set)
return final_set
# If we failed to generate enough
if not accepted_mcqs:
logger.warning(f"Could not generate unique questions for {concept} after {attempts} attempts.")
return []
return accepted_mcqs
def _retrieve_context_pool(self, grade, subject, unit, concept, is_arabic):
"""Fetches a larger pool of relevant textbook content to allow for rotation."""
# NOTE(review): this hunk is a diff rendering with removed (-) and added (+)
# lines interleaved and the markers lost. The old-side lines are flagged
# below and must be dropped when reconstructing the post-commit source.
search_query = f"summary of {concept} in {unit} for {subject}"
# NOTE(review): old-side line (superseded by `query_emb` just below):
query_embedding = self.openai_service.generate_embedding(search_query)
query_emb = self.openai_service.generate_embedding(search_query)
try:
# NOTE(review): old-side lines — grade parse used to abort with HTTP 400:
grade_for_search = self._extract_grade_integer(grade)
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
# NOTE(review): old-side call (limit=10; hard 404 when no chunks found):
context_chunks = self.pgvector.search_filtered_nearest(
query_embedding, grade_for_search, subject, is_arabic, limit=10
# NOTE(review): new-side lines — an unparsable grade now degrades to 0:
grade_int = self._extract_grade_integer(grade)
except ValueError:
grade_int = 0
# Fetch 15 chunks (instead of 10) so we can rotate through them
chunks = self.pgvector.search_filtered_nearest(
query_emb, grade_int, subject, is_arabic, limit=15
)
# NOTE(review): old-side lines — the 404 and the pre-joined context string
# are gone in the new flow (joining now happens per-loop in the caller):
if not context_chunks:
raise HTTPException(status_code=404, detail="No curriculum context found for this topic.")
full_context = "\n---\n".join([chunk['chunk_text'] for chunk in context_chunks])
if not chunks:
# Fallback text if DB is empty
return [{"chunk_text": f"Topic summary: {concept} in {unit}"}]
return chunks
# --- STEP 3: PROMPT CONSTRUCTION ---
def _generate_candidate_batch(self, context_text: str, count: int, grade: str, is_arabic: bool) -> List[Dict]:
"""
Calls LLM to generate questions using the detailed, original prompts
to ensure difficulty balance and Bloom's taxonomy coverage.
"""
if is_arabic:
prompt = f"""
أنت خبير في تطوير المناهج التعليمية، ومهمتك هي إنشاء **أسئلة اختيار من متعدد جديدة بالكامل**.
هذه هي الأسئلة الموجودة بالفعل لمفهوم "{concept}":
--- الأسئلة الموجودة ---
{existing_questions_text}
--- نهاية الأسئلة الموجودة ---
استنادًا فقط إلى المعلومات التالية:
استنادًا فقط إلى المعلومات النصية التالية:
--- السياق ---
{full_context}
{context_text}
--- نهاية السياق ---
قم بإنشاء {num_questions} سؤالًا جديدًا تمامًا من نوع الاختيار من متعدد (MCQ)، **مختلفة كليًا عن الأسئلة الموجودة أعلاه**.
قم بإنشاء {count} سؤالًا جديدًا تمامًا من نوع الاختيار من متعدد (MCQ).
⚠️ **مهم جدًا**:
يجب أن تشمل الأسئلة مستويات متنوعة من الصعوبة وفق التوزيع التالي:
......@@ -127,19 +173,14 @@ class MCQService:
"""
else:
prompt = f"""
You are an expert curriculum developer. Your task is to generate **entirely new multiple-choice questions (MCQs)** that do NOT overlap with any existing ones.
Here are the questions that ALREADY EXIST for the concept "{concept}":
--- EXISTING QUESTIONS ---
{existing_questions_text}
--- END EXISTING QUESTIONS ---
You are an expert curriculum developer. Your task is to generate **entirely new multiple-choice questions (MCQs)**.
Based ONLY on the following context:
--- CONTEXT ---
{full_context}
{context_text}
--- END CONTEXT ---
Generate {num_questions} NEW and COMPLETELY DIFFERENT multiple-choice questions.
Generate {count} NEW multiple-choice questions.
⚠️ **Important Requirements**:
- Distribute difficulty levels approximately as follows:
......@@ -171,7 +212,6 @@ class MCQService:
Do not include any text outside the JSON array.
"""
# --- STEP 4: CALL LLM ---
try:
response = self.openai_service.client.chat.completions.create(
model=Models.chat,
......@@ -182,59 +222,77 @@ class MCQService:
response_content = response.choices[0].message.content
json_response = json.loads(response_content)
# +++ THIS IS THE NEW, MORE ROBUST PARSING LOGIC +++
generated_questions = []
# Robust Parsing Logic
if isinstance(json_response, list):
# Case 1: The root of the JSON is already a list of questions.
generated_questions = json_response
return json_response
elif isinstance(json_response, dict):
# Case 2: The root is a dictionary.
# First, try to find a list within the dictionary's values.
# Try to find a list inside values
found_list = next((v for v in json_response.values() if isinstance(v, list)), None)
if found_list:
generated_questions = found_list
# If no list is found, maybe the dictionary ITSELF is the single question.
elif "question_text" in json_response:
generated_questions = [json_response] # Wrap the single object in a list.
if found_list: return found_list
# Maybe the dict itself is the question
if "question_text" in json_response: return [json_response]
if not generated_questions:
# If we still have nothing, the format is truly unknown.
raise ValueError("LLM response did not contain a recognizable question list or object.")
except (json.JSONDecodeError, ValueError, KeyError, StopIteration) as e:
logger.error(f"Failed to parse MCQ response from LLM: {e}\nRaw Response: {response_content}")
raise HTTPException(status_code=500, detail="Failed to generate or parse MCQs from AI.")
# --- STEP 5: STORE ---
mcqs_to_store = []
for q in generated_questions:
mcqs_to_store.append({
"curriculum": curriculum,
"grade": grade,
"subject": subject,
"unit": unit,
"concept": concept,
"is_arabic": is_arabic,
"difficulty_level": q.get("difficulty_level"),
"blooms_level": q.get("blooms_level"),
"question_text": q.get("question_text"),
"question_type": q.get("question_type", "multiple_choice"),
"correct_answer": q.get("correct_answer"),
"wrong_answer_1": q.get("wrong_answer_1"),
"wrong_answer_2": q.get("wrong_answer_2"),
"wrong_answer_3": q.get("wrong_answer_3"),
"wrong_answer_4": q.get("wrong_answer_4"),
"hint": q.get("hint"),
"question_image_url": q.get("question_image_url", ""),
"correct_image_url": q.get("correct_image_url", ""),
"wrong_image_url_1": q.get("wrong_image_url_1", ""),
"wrong_image_url_2": q.get("wrong_image_url_2", ""),
"wrong_image_url_3": q.get("wrong_image_url_3", ""),
"wrong_image_url_4": q.get("wrong_image_url_4", ""),
})
self.pgvector.insert_mcqs(mcqs_to_store)
return mcqs_to_store
return []
except Exception as e:
logger.error(f"Failed to generate batch from LLM: {e}")
return []
def _filter_and_process_questions(
self, candidates: List[Dict], current_batch: List[Dict],
curriculum, grade, subject, unit, concept, is_arabic
) -> List[Dict]:
"""
1. Generates Embedding for new question.
2. Checks DB if similar vector exists.
3. Returns only unique questions with embedding attached.
"""
valid = []
for q in candidates:
q_text = q.get("question_text", "").strip()
if not q_text: continue
# 1. Local check (prevent duplicates within the current generation loop)
# Check against questions accepted in previous loops + valid ones in this loop
if any(x['question_text'] == q_text for x in current_batch + valid):
continue
try:
# 2. Generate Embedding (This creates the vector)
q_vector = self.openai_service.generate_embedding(q_text)
# 3. Check DB for semantic duplicate using the vector
is_duplicate = self.pgvector.check_similarity_existence(
vector=q_vector,
curriculum=curriculum,
concept=concept,
threshold=0.92 # 92% similarity means it's a duplicate
)
if is_duplicate:
logger.info(f"Vector duplicate filtered: {q_text[:30]}...")
continue
# 4. Add Metadata & Vector to the object for storage
q.update({
"curriculum": curriculum, "grade": grade, "subject": subject,
"unit": unit, "concept": concept, "is_arabic": is_arabic,
"embedding": q_vector, # Store vector!
"question_type": q.get("question_type", "multiple_choice"),
"question_image_url": q.get("question_image_url", ""),
"correct_image_url": q.get("correct_image_url", ""),
"wrong_image_url_1": q.get("wrong_image_url_1", ""),
"wrong_image_url_2": q.get("wrong_image_url_2", ""),
"wrong_image_url_3": q.get("wrong_image_url_3", ""),
"wrong_image_url_4": q.get("wrong_image_url_4", ""),
})
valid.append(q)
except Exception as e:
logger.error(f"Vector check failed for question '{q_text[:20]}': {e}")
continue
return valid
......@@ -297,7 +355,6 @@ class MCQService:
logger.warning(f"All questions for '{concept}' have been asked recently. Re-using full list.")
unasked_mcqs = all_mcqs
# --- THIS IS THE ROBUST TWO-STEP SELECTION LOGIC ---
# 4. STEP 1 (Filter with AI): Get a SUBSET of relevant questions.
relevant_question_texts = []
......
......@@ -543,45 +543,7 @@ class PGVectorService:
except Exception as e:
print(f"❌ Database verification failed: {e}")
def insert_mcqs(self, mcq_list: List[Dict]):
"""
Batch-insert MCQ rows (pre-embedding schema, including blooms_level).

NOTE(review): this is the OLD version being deleted by this commit; it is
superseded by the vector-aware insert_mcqs later in the diff, which adds
the `embedding` column. Missing dict keys are inserted as NULL.
"""
if not mcq_list:
return
with self.pool_handler.get_connection() as conn:
with conn.cursor() as cur:
# Parameterized batch insert; one %s placeholder per column.
insert_query = """
INSERT INTO mcq_questions (
curriculum, grade, subject, unit, concept, question_text,
question_type, difficulty_level, blooms_level, is_arabic, correct_answer,
wrong_answer_1, wrong_answer_2, wrong_answer_3, wrong_answer_4,
question_image_url, correct_image_url, wrong_image_url_1,
wrong_image_url_2, wrong_image_url_3, wrong_image_url_4, hint
) VALUES (
%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
%s, %s, %s, %s, %s, %s, %s
);
"""
# Tuple order must match the column list above exactly.
data_to_insert = [
(
q.get('curriculum'), q.get('grade'), q.get('subject'), q.get('unit'), q.get('concept'),
q.get('question_text'), q.get('question_type'), q.get('difficulty_level'),
q.get('blooms_level'),  # blooms taxonomy level column
q.get('is_arabic'), q.get('correct_answer'), q.get('wrong_answer_1'),
q.get('wrong_answer_2'), q.get('wrong_answer_3'), q.get('wrong_answer_4'),
q.get('question_image_url'), q.get('correct_image_url'), q.get('wrong_image_url_1'),
q.get('wrong_image_url_2'), q.get('wrong_image_url_3'), q.get('wrong_image_url_4'),
q.get('hint')
) for q in mcq_list
]
cur.executemany(insert_query, data_to_insert)
conn.commit()
logger.info(f"Successfully inserted {len(mcq_list)} MCQs into the database.")
def get_mcqs(self, curriculum: str, grade: str, subject: str, unit: str, concept: str, is_arabic: bool, limit: Optional[int] = 10) -> List[Dict]:
"""
......@@ -668,4 +630,87 @@ class PGVectorService:
AND u->>'name' = %s
ORDER BY 1;
""", (curriculum, int(grade), subject, unit))
return [row[0] for row in cur.fetchall() if row[0]]
\ No newline at end of file
return [row[0] for row in cur.fetchall() if row[0]]
def check_similarity_existence(
    self,
    vector: List[float],
    curriculum: str,
    concept: str,
    threshold: float = 0.92
) -> bool:
    """
    Return True when the DB already holds a question whose embedding is
    within `threshold` cosine similarity of `vector` for this
    curriculum/concept pair.

    An empty/falsy vector is never a match. Any DB error is logged and
    reported as "no match" so generation can proceed best-effort.
    """
    if not vector:
        return False

    # pgvector's <=> operator yields cosine DISTANCE (1 - similarity),
    # so "similarity > threshold" translates to "distance < 1 - threshold".
    distance_cutoff = 1.0 - threshold

    try:
        with self.pool_handler.get_connection() as conn:
            with conn.cursor() as cur:
                # Equality filters on curriculum/concept narrow the scan
                # before the vector comparison runs.
                cur.execute(
                    """
                    SELECT 1
                    FROM mcq_questions
                    WHERE curriculum = %s
                    AND concept = %s
                    AND embedding <=> %s < %s
                    LIMIT 1;
                    """,
                    (curriculum, concept, str(vector), distance_cutoff)
                )
                row = cur.fetchone()
        return row is not None
    except Exception as e:
        logger.error(f"Error checking duplicate: {e}")
        return False
def insert_mcqs(self, mcq_list: List[Dict]):
    """
    Batch-insert MCQ rows together with their embedding vectors.

    Missing dict keys are inserted as NULL. A falsy embedding is stored as
    NULL; otherwise the vector list is serialized to its string form, which
    pgvector accepts for a vector column. No-op on an empty list.
    """
    if not mcq_list:
        return

    # One %s placeholder per column, in the exact order produced by _to_row.
    insert_query = """
INSERT INTO mcq_questions (
curriculum, grade, subject, unit, concept, question_text,
question_type, difficulty_level, blooms_level, is_arabic, correct_answer,
wrong_answer_1, wrong_answer_2, wrong_answer_3, wrong_answer_4,
question_image_url, correct_image_url, wrong_image_url_1,
wrong_image_url_2, wrong_image_url_3, wrong_image_url_4, hint,
embedding
) VALUES (
%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
%s, %s, %s, %s, %s, %s, %s, %s
);
"""

    def _to_row(q: Dict) -> tuple:
        # Serialize the vector for pgvector; falsy vectors become NULL.
        emb = q.get('embedding')
        return (
            q.get('curriculum'), q.get('grade'), q.get('subject'), q.get('unit'), q.get('concept'),
            q.get('question_text'), q.get('question_type'), q.get('difficulty_level'),
            q.get('blooms_level'),
            q.get('is_arabic'), q.get('correct_answer'), q.get('wrong_answer_1'),
            q.get('wrong_answer_2'), q.get('wrong_answer_3'), q.get('wrong_answer_4'),
            q.get('question_image_url'), q.get('correct_image_url'), q.get('wrong_image_url_1'),
            q.get('wrong_image_url_2'), q.get('wrong_image_url_3'), q.get('wrong_image_url_4'),
            q.get('hint'),
            str(emb) if emb else None,
        )

    rows = [_to_row(q) for q in mcq_list]
    with self.pool_handler.get_connection() as conn:
        with conn.cursor() as cur:
            cur.executemany(insert_query, rows)
        conn.commit()
        logger.info(f"Successfully inserted {len(mcq_list)} MCQs with vectors.")
\ No newline at end of file
# setup_mcq_table.py
import psycopg2
import os
from dotenv import load_dotenv
......@@ -7,9 +5,6 @@ from dotenv import load_dotenv
load_dotenv()
def setup_mcq_table(drop_existing_table: bool = False):
"""
Sets up the mcq_questions table with the final schema, now including blooms_level.
"""
try:
conn = psycopg2.connect(
host=os.getenv("POSTGRES_HOST", "localhost"),
......@@ -24,10 +19,13 @@ def setup_mcq_table(drop_existing_table: bool = False):
if drop_existing_table:
print("Dropping existing mcq_questions table...")
cur.execute("DROP TABLE IF EXISTS mcq_questions CASCADE;")
print("Table dropped.")
print("Creating mcq_questions table with blooms_level column...")
# --- UPDATED SCHEMA ---
print("Creating mcq_questions table...")
# 1. Enable the vector extension (Just in case)
cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
# 2. Create Table with 'embedding vector(1536)'
cur.execute("""
CREATE TABLE IF NOT EXISTS mcq_questions (
id SERIAL PRIMARY KEY,
......@@ -53,24 +51,30 @@ def setup_mcq_table(drop_existing_table: bool = False):
wrong_image_url_3 TEXT,
wrong_image_url_4 TEXT,
hint TEXT,
embedding vector(1536),
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
""")
print("Creating indexes on mcq_questions table...")
# 3. Create HNSW Index for fast vector search
print("Creating vector index...")
cur.execute("""
CREATE INDEX IF NOT EXISTS idx_mcq_embedding
ON mcq_questions USING hnsw (embedding vector_cosine_ops);
""")
# Standard indexes
cur.execute("""
CREATE INDEX IF NOT EXISTS idx_mcq_topic
ON mcq_questions(curriculum, grade, is_arabic, subject, unit, concept);
""")
print("MCQ table setup complete.")
print("MCQ table setup complete with Vector support.")
except Exception as e:
print(f"An error occurred during MCQ table setup: {e}")
print(f"Error: {e}")
finally:
if 'conn' in locals() and conn:
conn.close()
print("Database connection closed.")
if 'conn' in locals() and conn: conn.close()
if __name__ == "__main__":
print("Setting up the MCQ table structure...")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment