import psycopg2
import pandas as pd
import json
from pgvector.psycopg2 import register_vector
from typing import Dict

class DataIngestionService:
    """Insert curriculum structure and embedding chunks into the database.

    Connections are borrowed from ``pool_handler``: its ``get_connection()``
    is used as a context manager that yields a psycopg2-style connection
    (``cursor()``, ``commit()``, ``rollback()``).
    """

    def __init__(self, pool_handler):
        """Store the connection-pool handler used by every ingestion call."""
        self.pool_handler = pool_handler

    def ingest_curriculum_structure(self, curriculum_json_data: Dict) -> None:
        """Upsert one ``curriculum_structure`` row per mapping entry.

        Args:
            curriculum_json_data: Mapping whose keys are
                ``(grade, is_arabic, subject)`` tuples and whose values are
                JSON-serialisable curriculum payloads.

        Each entry is committed on its own. A failing entry is reported and
        rolled back without discarding entries that were already ingested.
        """
        print("Inserting curriculum structure data...")
        # Use the connection pool for thread safety
        with self.pool_handler.get_connection() as conn:
            with conn.cursor() as cur:
                for (grade, is_arabic, subject), curriculum in curriculum_json_data.items():
                    try:
                        cur.execute(
                            """
                            INSERT INTO curriculum_structure (grade, is_arabic, subject, curriculum_data)
                            VALUES (%s, %s, %s, %s)
                            ON CONFLICT (grade, is_arabic, subject) 
                            DO UPDATE SET curriculum_data = EXCLUDED.curriculum_data;
                            """,
                            (grade, is_arabic, subject, json.dumps(curriculum))
                        )
                        # Commit per entry: rollback() undoes the whole open
                        # transaction, so deferring the commit to the end of
                        # the loop would let one bad entry wipe out every
                        # earlier successful one.
                        conn.commit()
                    except Exception as e:
                        print(f"❌ Error ingesting structure for Grade {grade}: {e}")
                        conn.rollback()  # Only the failing entry is pending here
                    else:
                        print(f"✅ Ingested structure for Grade {grade} ({'Arabic' if is_arabic else 'English'})")
        print("Curriculum structure ingestion complete.")

    @staticmethod
    def _none_if_missing(value):
        """Return ``None`` for pandas missing-value markers, else *value* unchanged."""
        # is_scalar guards against list-like values, for which pd.isna
        # returns an array rather than a single boolean.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return None
        return value

    @staticmethod
    def _row_to_record(row) -> tuple:
        """Convert one DataFrame row into the parameter tuple for the chunk INSERT."""
        raw_embedding = row["Embedding"]
        # Embeddings stored as text are parsed as JSON; anything else is
        # passed through as-is.
        if isinstance(raw_embedding, str):
            embedding = json.loads(raw_embedding)
        else:
            embedding = raw_embedding

        optional = DataIngestionService._none_if_missing
        return (
            int(row["Grade"]),
            row["Subject"],
            # Empty cells arrive as NaN; store them as NULL instead.
            optional(row.get("Unit")),
            optional(row.get("Concept")),
            optional(row.get("Lesson")),
            int(row["From page"]),
            int(row["To page"]),
            int(row["Chunk index"]),
            row["Chunk text"],
            bool(row["Is Arabic"]),
            embedding,
        )

    def ingest_embeddings_from_csv(self, df: pd.DataFrame) -> None:
        """Insert the embedding chunks held in *df* into ``educational_chunks``.

        Args:
            df: DataFrame with the columns ``Grade``, ``Subject``,
                ``From page``, ``To page``, ``Chunk index``, ``Chunk text``,
                ``Is Arabic`` and ``Embedding`` (a JSON string or an already
                parsed value). ``Unit``, ``Concept`` and ``Lesson`` are
                optional.

        Rows that cannot be converted are reported and skipped. If no row
        survives, nothing is written and no connection is requested. Errors
        raised by the batch insert itself propagate to the caller.
        """
        print("Inserting embeddings from CSV data...")
        insert_query = """
            INSERT INTO educational_chunks
            (grade, subject, unit, concept, lesson, from_page, to_page, chunk_index, chunk_text, is_arabic, embedding)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """

        records_to_insert = []
        for _, row in df.iterrows():
            try:
                records_to_insert.append(self._row_to_record(row))
            except Exception as e:
                print(f"Skipping row due to malformed data: {e}")

        if not records_to_insert:
            print("No valid records to insert.")
            return

        # ``import psycopg2`` alone does not load the ``extras`` submodule, so
        # import the helper explicitly instead of relying on attribute access.
        from psycopg2.extras import execute_batch

        with self.pool_handler.get_connection() as conn:
            with conn.cursor() as cur:
                # Use execute_batch for efficient insertion
                execute_batch(cur, insert_query, records_to_insert)
                conn.commit()

        print(f"✅ Ingested {len(records_to_insert)} embedding chunks successfully.")