Commit 67972c15 authored by salma's avatar salma

faster and cheaper mcq generation

parent ff35d000
......@@ -49,54 +49,100 @@ class MCQService:
is_arabic: bool, num_questions: int = 5
) -> List[Dict]:
"""
Generates NEW, UNIQUE MCQs with balanced difficulty and Bloom's taxonomy levels.
Each returned question includes:
- difficulty_level: 1–10
- blooms_level: One of ["remember", "understand", "apply", "analysis", "evaluate", "create"]
Generates NEW, UNIQUE MCQs using a 'Generate-Filter-Retry' loop with Vector Semantic Deduplication.
This ensures zero redundancy without sending conversation history to the LLM.
"""
if not self.pgvector:
raise HTTPException(status_code=503, detail="Vector service is not available for context retrieval.")
logger.info(f"Checking for existing questions for: {curriculum}/{grade}/{subject}/{unit}/{concept}")
existing_questions = self.pgvector.get_mcqs(
curriculum, grade, subject, unit, concept, is_arabic, limit=None
)
existing_questions_text = "No existing questions found."
if existing_questions:
q_list = [f"- {q['question_text']}" for q in existing_questions]
existing_questions_text = "\n".join(q_list)
# 1. Retrieve a pool of textbook context chunks (fetched once, rotated in loop)
context_chunks = self._retrieve_context_pool(grade, subject, unit, concept, is_arabic)
accepted_mcqs = []
attempts = 0
max_attempts = 3 # Safety brake
# 2. Loop until we have enough UNIQUE questions
while len(accepted_mcqs) < num_questions and attempts < max_attempts:
attempts += 1
questions_needed = num_questions - len(accepted_mcqs)
# Ask for slightly more than needed to buffer against duplicates being filtered out
# (e.g., if we need 1, ask for 2. If we need 5, ask for 7)
ask_count = questions_needed + 2
# A. Context Rotation: Pick random chunks to ensure variety in every loop
# This prevents the LLM from fixating on the same part of the text.
if len(context_chunks) >= 3:
selected_chunks = random.sample(context_chunks, min(len(context_chunks), 4))
else:
selected_chunks = context_chunks
current_context_text = "\n---\n".join([c['chunk_text'] for c in selected_chunks])
# B. Generate Raw Questions from LLM
# (Uses your detailed prompts regarding Bloom's and Difficulty)
raw_candidates = self._generate_candidate_batch(
current_context_text, ask_count, grade, is_arabic
)
# C. Filter & Embed: Checks DB for semantic duplicates
valid_batch = self._filter_and_process_questions(
raw_candidates, accepted_mcqs, curriculum, grade, subject, unit, concept, is_arabic
)
# --- STEP 2: CONTEXT RETRIEVAL ---
accepted_mcqs.extend(valid_batch)
# 3. Store Final Result
if accepted_mcqs:
# Slice to exact number requested (in case we generated extras)
final_set = accepted_mcqs[:num_questions]
self.pgvector.insert_mcqs(final_set)
return final_set
# If we failed to generate enough
if not accepted_mcqs:
logger.warning(f"Could not generate unique questions for {concept} after {attempts} attempts.")
return []
return accepted_mcqs
def _retrieve_context_pool(self, grade, subject, unit, concept, is_arabic):
"""Fetches a larger pool of relevant textbook content to allow for rotation."""
# NOTE(review): this hunk is a diff rendering with removed (-) and added (+)
# lines interleaved and the markers lost. The old-side lines are flagged
# below and must be dropped when reconstructing the post-commit source.
search_query = f"summary of {concept} in {unit} for {subject}"
# NOTE(review): old-side line (superseded by `query_emb` just below):
query_embedding = self.openai_service.generate_embedding(search_query)
query_emb = self.openai_service.generate_embedding(search_query)
try:
# NOTE(review): old-side lines — grade parse used to abort with HTTP 400:
grade_for_search = self._extract_grade_integer(grade)
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
# NOTE(review): old-side call (limit=10; hard 404 when no chunks found):
context_chunks = self.pgvector.search_filtered_nearest(
query_embedding, grade_for_search, subject, is_arabic, limit=10
# NOTE(review): new-side lines — an unparsable grade now degrades to 0:
grade_int = self._extract_grade_integer(grade)
except ValueError:
grade_int = 0
# Fetch 15 chunks (instead of 10) so we can rotate through them
chunks = self.pgvector.search_filtered_nearest(
query_emb, grade_int, subject, is_arabic, limit=15
)
# NOTE(review): old-side lines — the 404 and the pre-joined context string
# are gone in the new flow (joining now happens per-loop in the caller):
if not context_chunks:
raise HTTPException(status_code=404, detail="No curriculum context found for this topic.")
full_context = "\n---\n".join([chunk['chunk_text'] for chunk in context_chunks])
if not chunks:
# Fallback text if DB is empty
return [{"chunk_text": f"Topic summary: {concept} in {unit}"}]
return chunks
# --- STEP 3: PROMPT CONSTRUCTION ---
def _generate_candidate_batch(self, context_text: str, count: int, grade: str, is_arabic: bool) -> List[Dict]:
"""
Calls LLM to generate questions using the detailed, original prompts
to ensure difficulty balance and Bloom's taxonomy coverage.
"""
if is_arabic:
prompt = f"""
أنت خبير في تطوير المناهج التعليمية، ومهمتك هي إنشاء **أسئلة اختيار من متعدد جديدة بالكامل**.
هذه هي الأسئلة الموجودة بالفعل لمفهوم "{concept}":
--- الأسئلة الموجودة ---
{existing_questions_text}
--- نهاية الأسئلة الموجودة ---
استنادًا فقط إلى المعلومات التالية:
استنادًا فقط إلى المعلومات النصية التالية:
--- السياق ---
{full_context}
{context_text}
--- نهاية السياق ---
قم بإنشاء {num_questions} سؤالًا جديدًا تمامًا من نوع الاختيار من متعدد (MCQ)، **مختلفة كليًا عن الأسئلة الموجودة أعلاه**.
قم بإنشاء {count} سؤالًا جديدًا تمامًا من نوع الاختيار من متعدد (MCQ).
⚠️ **مهم جدًا**:
يجب أن تشمل الأسئلة مستويات متنوعة من الصعوبة وفق التوزيع التالي:
......@@ -127,19 +173,14 @@ class MCQService:
"""
else:
prompt = f"""
You are an expert curriculum developer. Your task is to generate **entirely new multiple-choice questions (MCQs)** that do NOT overlap with any existing ones.
Here are the questions that ALREADY EXIST for the concept "{concept}":
--- EXISTING QUESTIONS ---
{existing_questions_text}
--- END EXISTING QUESTIONS ---
You are an expert curriculum developer. Your task is to generate **entirely new multiple-choice questions (MCQs)**.
Based ONLY on the following context:
--- CONTEXT ---
{full_context}
{context_text}
--- END CONTEXT ---
Generate {num_questions} NEW and COMPLETELY DIFFERENT multiple-choice questions.
Generate {count} NEW multiple-choice questions.
⚠️ **Important Requirements**:
- Distribute difficulty levels approximately as follows:
......@@ -171,7 +212,6 @@ class MCQService:
Do not include any text outside the JSON array.
"""
# --- STEP 4: CALL LLM ---
try:
response = self.openai_service.client.chat.completions.create(
model=Models.chat,
......@@ -182,59 +222,77 @@ class MCQService:
response_content = response.choices[0].message.content
json_response = json.loads(response_content)
# +++ THIS IS THE NEW, MORE ROBUST PARSING LOGIC +++
generated_questions = []
# Robust Parsing Logic
if isinstance(json_response, list):
# Case 1: The root of the JSON is already a list of questions.
generated_questions = json_response
return json_response
elif isinstance(json_response, dict):
# Case 2: The root is a dictionary.
# First, try to find a list within the dictionary's values.
# Try to find a list inside values
found_list = next((v for v in json_response.values() if isinstance(v, list)), None)
if found_list:
generated_questions = found_list
# If no list is found, maybe the dictionary ITSELF is the single question.
elif "question_text" in json_response:
generated_questions = [json_response] # Wrap the single object in a list.
if found_list: return found_list
# Maybe the dict itself is the question
if "question_text" in json_response: return [json_response]
if not generated_questions:
# If we still have nothing, the format is truly unknown.
raise ValueError("LLM response did not contain a recognizable question list or object.")
except (json.JSONDecodeError, ValueError, KeyError, StopIteration) as e:
logger.error(f"Failed to parse MCQ response from LLM: {e}\nRaw Response: {response_content}")
raise HTTPException(status_code=500, detail="Failed to generate or parse MCQs from AI.")
# --- STEP 5: STORE ---
mcqs_to_store = []
for q in generated_questions:
mcqs_to_store.append({
"curriculum": curriculum,
"grade": grade,
"subject": subject,
"unit": unit,
"concept": concept,
"is_arabic": is_arabic,
"difficulty_level": q.get("difficulty_level"),
"blooms_level": q.get("blooms_level"),
"question_text": q.get("question_text"),
"question_type": q.get("question_type", "multiple_choice"),
"correct_answer": q.get("correct_answer"),
"wrong_answer_1": q.get("wrong_answer_1"),
"wrong_answer_2": q.get("wrong_answer_2"),
"wrong_answer_3": q.get("wrong_answer_3"),
"wrong_answer_4": q.get("wrong_answer_4"),
"hint": q.get("hint"),
"question_image_url": q.get("question_image_url", ""),
"correct_image_url": q.get("correct_image_url", ""),
"wrong_image_url_1": q.get("wrong_image_url_1", ""),
"wrong_image_url_2": q.get("wrong_image_url_2", ""),
"wrong_image_url_3": q.get("wrong_image_url_3", ""),
"wrong_image_url_4": q.get("wrong_image_url_4", ""),
})
self.pgvector.insert_mcqs(mcqs_to_store)
return mcqs_to_store
return []
except Exception as e:
logger.error(f"Failed to generate batch from LLM: {e}")
return []
def _filter_and_process_questions(
self, candidates: List[Dict], current_batch: List[Dict],
curriculum, grade, subject, unit, concept, is_arabic
) -> List[Dict]:
"""
1. Generates Embedding for new question.
2. Checks DB if similar vector exists.
3. Returns only unique questions with embedding attached.
"""
valid = []
for q in candidates:
q_text = q.get("question_text", "").strip()
if not q_text: continue
# 1. Local check (prevent duplicates within the current generation loop)
# Check against questions accepted in previous loops + valid ones in this loop
if any(x['question_text'] == q_text for x in current_batch + valid):
continue
try:
# 2. Generate Embedding (This creates the vector)
q_vector = self.openai_service.generate_embedding(q_text)
# 3. Check DB for semantic duplicate using the vector
is_duplicate = self.pgvector.check_similarity_existence(
vector=q_vector,
curriculum=curriculum,
concept=concept,
threshold=0.92 # 92% similarity means it's a duplicate
)
if is_duplicate:
logger.info(f"Vector duplicate filtered: {q_text[:30]}...")
continue
# 4. Add Metadata & Vector to the object for storage
q.update({
"curriculum": curriculum, "grade": grade, "subject": subject,
"unit": unit, "concept": concept, "is_arabic": is_arabic,
"embedding": q_vector, # Store vector!
"question_type": q.get("question_type", "multiple_choice"),
"question_image_url": q.get("question_image_url", ""),
"correct_image_url": q.get("correct_image_url", ""),
"wrong_image_url_1": q.get("wrong_image_url_1", ""),
"wrong_image_url_2": q.get("wrong_image_url_2", ""),
"wrong_image_url_3": q.get("wrong_image_url_3", ""),
"wrong_image_url_4": q.get("wrong_image_url_4", ""),
})
valid.append(q)
except Exception as e:
logger.error(f"Vector check failed for question '{q_text[:20]}': {e}")
continue
return valid
......@@ -297,7 +355,6 @@ class MCQService:
logger.warning(f"All questions for '{concept}' have been asked recently. Re-using full list.")
unasked_mcqs = all_mcqs
# --- THIS IS THE ROBUST TWO-STEP SELECTION LOGIC ---
# 4. STEP 1 (Filter with AI): Get a SUBSET of relevant questions.
relevant_question_texts = []
......
......@@ -543,45 +543,7 @@ class PGVectorService:
except Exception as e:
print(f"❌ Database verification failed: {e}")
def insert_mcqs(self, mcq_list: List[Dict]):
"""
Batch-insert MCQ rows (pre-embedding schema, including blooms_level).

NOTE(review): this is the OLD version being deleted by this commit; it is
superseded by the vector-aware insert_mcqs later in the diff, which adds
the `embedding` column. Missing dict keys are inserted as NULL.
"""
if not mcq_list:
return
with self.pool_handler.get_connection() as conn:
with conn.cursor() as cur:
# Parameterized batch insert; one %s placeholder per column.
insert_query = """
INSERT INTO mcq_questions (
curriculum, grade, subject, unit, concept, question_text,
question_type, difficulty_level, blooms_level, is_arabic, correct_answer,
wrong_answer_1, wrong_answer_2, wrong_answer_3, wrong_answer_4,
question_image_url, correct_image_url, wrong_image_url_1,
wrong_image_url_2, wrong_image_url_3, wrong_image_url_4, hint
) VALUES (
%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
%s, %s, %s, %s, %s, %s, %s
);
"""
# Tuple order must match the column list above exactly.
data_to_insert = [
(
q.get('curriculum'), q.get('grade'), q.get('subject'), q.get('unit'), q.get('concept'),
q.get('question_text'), q.get('question_type'), q.get('difficulty_level'),
q.get('blooms_level'),  # blooms taxonomy level column
q.get('is_arabic'), q.get('correct_answer'), q.get('wrong_answer_1'),
q.get('wrong_answer_2'), q.get('wrong_answer_3'), q.get('wrong_answer_4'),
q.get('question_image_url'), q.get('correct_image_url'), q.get('wrong_image_url_1'),
q.get('wrong_image_url_2'), q.get('wrong_image_url_3'), q.get('wrong_image_url_4'),
q.get('hint')
) for q in mcq_list
]
cur.executemany(insert_query, data_to_insert)
conn.commit()
logger.info(f"Successfully inserted {len(mcq_list)} MCQs into the database.")
def get_mcqs(self, curriculum: str, grade: str, subject: str, unit: str, concept: str, is_arabic: bool, limit: Optional[int] = 10) -> List[Dict]:
"""
......@@ -668,4 +630,87 @@ class PGVectorService:
AND u->>'name' = %s
ORDER BY 1;
""", (curriculum, int(grade), subject, unit))
return [row[0] for row in cur.fetchall() if row[0]]
\ No newline at end of file
return [row[0] for row in cur.fetchall() if row[0]]
def check_similarity_existence(
    self,
    vector: List[float],
    curriculum: str,
    concept: str,
    threshold: float = 0.92
) -> bool:
    """
    Return True when the DB already holds a question whose embedding is
    within `threshold` cosine similarity of `vector` for this
    curriculum/concept pair.

    An empty/falsy vector is never a match. Any DB error is logged and
    reported as "no match" so generation can proceed best-effort.
    """
    if not vector:
        return False

    # pgvector's <=> operator yields cosine DISTANCE (1 - similarity),
    # so "similarity > threshold" translates to "distance < 1 - threshold".
    distance_cutoff = 1.0 - threshold

    try:
        with self.pool_handler.get_connection() as conn:
            with conn.cursor() as cur:
                # Equality filters on curriculum/concept narrow the scan
                # before the vector comparison runs.
                cur.execute(
                    """
                    SELECT 1
                    FROM mcq_questions
                    WHERE curriculum = %s
                    AND concept = %s
                    AND embedding <=> %s < %s
                    LIMIT 1;
                    """,
                    (curriculum, concept, str(vector), distance_cutoff)
                )
                row = cur.fetchone()
        return row is not None
    except Exception as e:
        logger.error(f"Error checking duplicate: {e}")
        return False
def insert_mcqs(self, mcq_list: List[Dict]):
    """
    Batch-insert MCQ rows together with their embedding vectors.

    Missing dict keys are inserted as NULL. A falsy embedding is stored as
    NULL; otherwise the vector list is serialized to its string form, which
    pgvector accepts for a vector column. No-op on an empty list.
    """
    if not mcq_list:
        return

    # One %s placeholder per column, in the exact order produced by _to_row.
    insert_query = """
INSERT INTO mcq_questions (
curriculum, grade, subject, unit, concept, question_text,
question_type, difficulty_level, blooms_level, is_arabic, correct_answer,
wrong_answer_1, wrong_answer_2, wrong_answer_3, wrong_answer_4,
question_image_url, correct_image_url, wrong_image_url_1,
wrong_image_url_2, wrong_image_url_3, wrong_image_url_4, hint,
embedding
) VALUES (
%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
%s, %s, %s, %s, %s, %s, %s, %s
);
"""

    def _to_row(q: Dict) -> tuple:
        # Serialize the vector for pgvector; falsy vectors become NULL.
        emb = q.get('embedding')
        return (
            q.get('curriculum'), q.get('grade'), q.get('subject'), q.get('unit'), q.get('concept'),
            q.get('question_text'), q.get('question_type'), q.get('difficulty_level'),
            q.get('blooms_level'),
            q.get('is_arabic'), q.get('correct_answer'), q.get('wrong_answer_1'),
            q.get('wrong_answer_2'), q.get('wrong_answer_3'), q.get('wrong_answer_4'),
            q.get('question_image_url'), q.get('correct_image_url'), q.get('wrong_image_url_1'),
            q.get('wrong_image_url_2'), q.get('wrong_image_url_3'), q.get('wrong_image_url_4'),
            q.get('hint'),
            str(emb) if emb else None,
        )

    rows = [_to_row(q) for q in mcq_list]
    with self.pool_handler.get_connection() as conn:
        with conn.cursor() as cur:
            cur.executemany(insert_query, rows)
        conn.commit()
        logger.info(f"Successfully inserted {len(mcq_list)} MCQs with vectors.")
\ No newline at end of file
# setup_mcq_table.py
import psycopg2
import os
from dotenv import load_dotenv
......@@ -7,9 +5,6 @@ from dotenv import load_dotenv
load_dotenv()
def setup_mcq_table(drop_existing_table: bool = False):
"""
Sets up the mcq_questions table with the final schema, now including blooms_level.
"""
try:
conn = psycopg2.connect(
host=os.getenv("POSTGRES_HOST", "localhost"),
......@@ -24,10 +19,13 @@ def setup_mcq_table(drop_existing_table: bool = False):
if drop_existing_table:
print("Dropping existing mcq_questions table...")
cur.execute("DROP TABLE IF EXISTS mcq_questions CASCADE;")
print("Table dropped.")
print("Creating mcq_questions table with blooms_level column...")
# --- UPDATED SCHEMA ---
print("Creating mcq_questions table...")
# 1. Enable the vector extension (Just in case)
cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
# 2. Create Table with 'embedding vector(1536)'
cur.execute("""
CREATE TABLE IF NOT EXISTS mcq_questions (
id SERIAL PRIMARY KEY,
......@@ -53,24 +51,30 @@ def setup_mcq_table(drop_existing_table: bool = False):
wrong_image_url_3 TEXT,
wrong_image_url_4 TEXT,
hint TEXT,
embedding vector(1536),
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
""")
print("Creating indexes on mcq_questions table...")
# 3. Create HNSW Index for fast vector search
print("Creating vector index...")
cur.execute("""
CREATE INDEX IF NOT EXISTS idx_mcq_embedding
ON mcq_questions USING hnsw (embedding vector_cosine_ops);
""")
# Standard indexes
cur.execute("""
CREATE INDEX IF NOT EXISTS idx_mcq_topic
ON mcq_questions(curriculum, grade, is_arabic, subject, unit, concept);
""")
print("MCQ table setup complete.")
print("MCQ table setup complete with Vector support.")
except Exception as e:
print(f"An error occurred during MCQ table setup: {e}")
print(f"Error: {e}")
finally:
if 'conn' in locals() and conn:
conn.close()
print("Database connection closed.")
if 'conn' in locals() and conn: conn.close()
if __name__ == "__main__":
print("Setting up the MCQ table structure...")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment