Commit 67972c15 authored by salma's avatar salma

faster and cheaper MCQ generation

parent ff35d000
......@@ -543,45 +543,7 @@ class PGVectorService:
except Exception as e:
print(f"❌ Database verification failed: {e}")
def insert_mcqs(self, mcq_list: List[Dict]):
    """
    Bulk-insert a batch of MCQ records into the ``mcq_questions`` table.

    Every record is read with ``dict.get``, so a key that is absent from a
    record is passed to the driver as ``None``. The call is a no-op when
    ``mcq_list`` is empty.
    """
    if not mcq_list:
        return

    # Order must mirror the column list of the INSERT statement below.
    field_order = (
        'curriculum', 'grade', 'subject', 'unit', 'concept',
        'question_text', 'question_type', 'difficulty_level',
        'blooms_level',
        'is_arabic', 'correct_answer', 'wrong_answer_1',
        'wrong_answer_2', 'wrong_answer_3', 'wrong_answer_4',
        'question_image_url', 'correct_image_url', 'wrong_image_url_1',
        'wrong_image_url_2', 'wrong_image_url_3', 'wrong_image_url_4',
        'hint',
    )

    with self.pool_handler.get_connection() as conn:
        with conn.cursor() as cur:
            insert_query = """
                INSERT INTO mcq_questions (
                curriculum, grade, subject, unit, concept, question_text,
                question_type, difficulty_level, blooms_level, is_arabic, correct_answer,
                wrong_answer_1, wrong_answer_2, wrong_answer_3, wrong_answer_4,
                question_image_url, correct_image_url, wrong_image_url_1,
                wrong_image_url_2, wrong_image_url_3, wrong_image_url_4, hint
                ) VALUES (
                %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
                %s, %s, %s, %s, %s, %s, %s
                );
                """

            # One positional tuple per question, in column order.
            rows = []
            for question in mcq_list:
                rows.append(tuple(question.get(name) for name in field_order))

            cur.executemany(insert_query, rows)
            conn.commit()
            logger.info(f"Successfully inserted {len(mcq_list)} MCQs into the database.")
def get_mcqs(self, curriculum: str, grade: str, subject: str, unit: str, concept: str, is_arabic: bool, limit: Optional[int] = 10) -> List[Dict]:
"""
......@@ -668,4 +630,87 @@ class PGVectorService:
AND u->>'name' = %s
ORDER BY 1;
""", (curriculum, int(grade), subject, unit))
return [row[0] for row in cur.fetchall() if row[0]]
\ No newline at end of file
return [row[0] for row in cur.fetchall() if row[0]]
def check_similarity_existence(
    self,
    vector: List[float],
    curriculum: str,
    concept: str,
    threshold: float = 0.92
) -> bool:
    """
    Report whether a sufficiently similar question is already stored.

    Looks in ``mcq_questions`` for a row with the same ``curriculum`` and
    ``concept`` whose embedding has a cosine similarity to ``vector``
    strictly greater than ``threshold``.

    Args:
        vector: Embedding to compare against. Any sequence of numbers is
            accepted (including array-like objects); an already formatted
            vector string is passed through unchanged.
        curriculum: Value matched against the ``curriculum`` column.
        concept: Value matched against the ``concept`` column.
        threshold: Minimum cosine similarity (default 0.92) above which a
            stored question counts as a duplicate.

    Returns:
        True if at least one matching row exists. False when ``vector`` is
        empty/None, when no row matches, or when the lookup fails (the
        error is logged and treated as "no duplicate found").
    """
    # Explicit emptiness test: `not vector` is ambiguous (and raises) for
    # multi-element array-like inputs.
    if vector is None or len(vector) == 0:
        return False

    # Cosine distance = 1 - cosine similarity, so
    # similarity > threshold  <=>  distance < 1 - threshold.
    max_distance = 1.0 - threshold

    try:
        if isinstance(vector, str):
            vector_literal = vector
        else:
            # Build the "[x1,x2,...]" text form element by element;
            # str(vector) is only correct for a plain list of Python floats.
            vector_literal = "[" + ",".join(repr(float(x)) for x in vector) + "]"

        with self.pool_handler.get_connection() as conn:
            with conn.cursor() as cur:
                # curriculum/concept equality predicates narrow the rows
                # before the distance comparison is evaluated.
                cur.execute(
                    """
                    SELECT 1
                    FROM mcq_questions
                    WHERE curriculum = %s
                    AND concept = %s
                    AND embedding <=> %s::vector < %s
                    LIMIT 1;
                    """,
                    (curriculum, concept, vector_literal, max_distance)
                )
                return cur.fetchone() is not None
    except Exception as e:
        # Best-effort check: a failed lookup must not block the caller.
        logger.error(f"Error checking duplicate: {e}")
        return False
def insert_mcqs(self, mcq_list: List[Dict]):
    """
    Bulk-insert MCQ records, together with their embeddings, into
    ``mcq_questions``.

    Every record is read with ``dict.get``, so a key that is absent from a
    record is passed to the driver as ``None``. The ``embedding`` value may
    be any sequence of numbers (including array-like objects) or an already
    formatted vector string; a missing or empty embedding is stored as
    ``None``.

    Args:
        mcq_list: Question dicts to insert. Nothing happens when it is empty.
    """
    if not mcq_list:
        return

    def _vector_literal(emb):
        """Return the "[x1,x2,...]" text form of an embedding, or None."""
        # Explicit emptiness test: truthiness is ambiguous (and raises) for
        # multi-element array-like inputs.
        if emb is None or len(emb) == 0:
            return None
        if isinstance(emb, str):
            return emb
        # str(emb) is only correct for a plain list of Python floats, so
        # format each component explicitly.
        return "[" + ",".join(repr(float(x)) for x in emb) + "]"

    # Order must mirror the column list of the INSERT statement below;
    # the embedding is appended separately as the last column.
    field_order = (
        'curriculum', 'grade', 'subject', 'unit', 'concept',
        'question_text', 'question_type', 'difficulty_level',
        'blooms_level',
        'is_arabic', 'correct_answer', 'wrong_answer_1',
        'wrong_answer_2', 'wrong_answer_3', 'wrong_answer_4',
        'question_image_url', 'correct_image_url', 'wrong_image_url_1',
        'wrong_image_url_2', 'wrong_image_url_3', 'wrong_image_url_4',
        'hint',
    )

    with self.pool_handler.get_connection() as conn:
        with conn.cursor() as cur:
            insert_query = """
                INSERT INTO mcq_questions (
                curriculum, grade, subject, unit, concept, question_text,
                question_type, difficulty_level, blooms_level, is_arabic, correct_answer,
                wrong_answer_1, wrong_answer_2, wrong_answer_3, wrong_answer_4,
                question_image_url, correct_image_url, wrong_image_url_1,
                wrong_image_url_2, wrong_image_url_3, wrong_image_url_4, hint,
                embedding
                ) VALUES (
                %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
                %s, %s, %s, %s, %s, %s, %s, %s
                );
                """

            data_to_insert = [
                tuple(q.get(name) for name in field_order)
                + (_vector_literal(q.get('embedding')),)
                for q in mcq_list
            ]

            cur.executemany(insert_query, data_to_insert)
            conn.commit()
            logger.info(f"Successfully inserted {len(mcq_list)} MCQs with vectors.")
\ No newline at end of file
# setup_mcq_table.py
import psycopg2
import os
from dotenv import load_dotenv
......@@ -7,9 +5,6 @@ from dotenv import load_dotenv
load_dotenv()
def setup_mcq_table(drop_existing_table: bool = False):
"""
Sets up the mcq_questions table with the final schema, now including blooms_level.
"""
try:
conn = psycopg2.connect(
host=os.getenv("POSTGRES_HOST", "localhost"),
......@@ -24,10 +19,13 @@ def setup_mcq_table(drop_existing_table: bool = False):
if drop_existing_table:
print("Dropping existing mcq_questions table...")
cur.execute("DROP TABLE IF EXISTS mcq_questions CASCADE;")
print("Table dropped.")
print("Creating mcq_questions table with blooms_level column...")
# --- UPDATED SCHEMA ---
print("Creating mcq_questions table...")
# 1. Enable the vector extension (Just in case)
cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
# 2. Create Table with 'embedding vector(1536)'
cur.execute("""
CREATE TABLE IF NOT EXISTS mcq_questions (
id SERIAL PRIMARY KEY,
......@@ -53,24 +51,30 @@ def setup_mcq_table(drop_existing_table: bool = False):
wrong_image_url_3 TEXT,
wrong_image_url_4 TEXT,
hint TEXT,
embedding vector(1536),
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
""")
print("Creating indexes on mcq_questions table...")
# 3. Create HNSW Index for fast vector search
print("Creating vector index...")
cur.execute("""
CREATE INDEX IF NOT EXISTS idx_mcq_embedding
ON mcq_questions USING hnsw (embedding vector_cosine_ops);
""")
# Standard indexes
cur.execute("""
CREATE INDEX IF NOT EXISTS idx_mcq_topic
ON mcq_questions(curriculum, grade, is_arabic, subject, unit, concept);
""")
print("MCQ table setup complete.")
print("MCQ table setup complete with Vector support.")
except Exception as e:
print(f"An error occurred during MCQ table setup: {e}")
print(f"Error: {e}")
finally:
if 'conn' in locals() and conn:
conn.close()
print("Database connection closed.")
if 'conn' in locals() and conn: conn.close()
if __name__ == "__main__":
print("Setting up the MCQ table structure...")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment