Integrate PDF upload

498fa39d · SalmaMohammedHamedMustafa · a558073c · 498fa39d · 498fa39d · 498fa39d
Commit 498fa39d authored Oct 22, 2025 by SalmaMohammedHamedMustafa
10 changed files
--- a/self_hosted_env/docker-compose.yml
+++ b/self_hosted_env/docker-compose.yml
@@ -61,8 +61,13 @@ services:
      DB_HOST: "${DB_HOST}"
      TTS_PROVIDER: "${TTS_PROVIDER}"
      CUSTOM_TTS_URL: "${CUSTOM_TTS_URL}"
+      GEMINI_API_KEY: "${GEMINI_API_KEY}"
      REDIS_HOST: "redis"
      REDIS_PORT: "6379"
+    volumes:
+      - ./voice_agent/embeddings:/app/embeddings
+      - ./voice_agent/All_Curriculums_grouped.json:/app/All_Curriculums_grouped.json
    depends_on:
      - minio
      - postgres
@@ -71,4 +76,4 @@ services:
 volumes:
  pgdata:
  miniodata:
  redisdata:
\ No newline at end of file
--- a/self_hosted_env/voice_agent/Dockerfile
+++ b/self_hosted_env/voice_agent/Dockerfile
@@ -3,8 +3,12 @@ FROM python:3.10-slim
 WORKDIR /app
 # Install postgresql-client for pg_isready
-RUN apt-get update && apt-get install -y \
+RUN apt-get update && apt-get install -y --no-install-recommends \
    postgresql-client \
+    tesseract-ocr \
+    tesseract-ocr-ara \
+    libtesseract-dev \
+    poppler-utils \
    && rm -rf /var/lib/apt/lists/*
 # Install Python dependencies

--- a/self_hosted_env/voice_agent/main.py
+++ b/self_hosted_env/voice_agent/main.py
--- a/self_hosted_env/voice_agent/process_pdf_pipline.py
+++ b/self_hosted_env/voice_agent/process_pdf_pipline.py
--- a/self_hosted_env/voice_agent/requirements.txt
+++ b/self_hosted_env/voice_agent/requirements.txt
@@ -12,4 +12,14 @@ pandas
 python-dotenv
 httpx
 langdetect
 redis
\ No newline at end of file
+pdf2image
+pytesseract
+fuzzywuzzy
+python-Levenshtein
+tqdm
+google-generativeai
+pydantic
+opencv-python-headless
+numpy
+Pillow
\ No newline at end of file
--- a/self_hosted_env/voice_agent/services/__init__.py
+++ b/self_hosted_env/voice_agent/services/__init__.py
@@ -9,4 +9,5 @@ from .pgvector_service import PGVectorService
 from .chat_database_service import ChatDatabaseService
 from .connection_pool import ConnectionPool
 from .pedagogy_service import PedagogyService
 from .segmentation_service import LanguageSegmentationService
\ No newline at end of file
+from .data_ingestion_service import DataIngestionService
\ No newline at end of file
--- a/self_hosted_env/voice_agent/services/data_ingestion_service.py
+++ b/self_hosted_env/voice_agent/services/data_ingestion_service.py
+import psycopg2
+import pandas as pd
+import json
+from pgvector.psycopg2 import register_vector
+from typing import Dict
+class DataIngestionService:
+    """A service dedicated to inserting new curriculum data into the database.
+
+    Connections are obtained via ``pool_handler.get_connection()``, which is
+    used as a context manager yielding a DB-API connection.
+    """
+
+    def __init__(self, pool_handler):
+        """Store the connection-pool handler used by every ingestion method."""
+        self.pool_handler = pool_handler
+
+    @staticmethod
+    def _none_if_missing(value):
+        """Return ``None`` for missing cells (``None`` or float NaN), else the value."""
+        # NaN is the only float that is not equal to itself; pandas uses it
+        # for empty cells, and it should be stored as SQL NULL.
+        if value is None or (isinstance(value, float) and value != value):
+            return None
+        return value
+
+    def ingest_curriculum_structure(self, curriculum_json_data: Dict):
+        """
+        Upsert curriculum structure entries into ``curriculum_structure``.
+        This logic is adapted from the curriculum_structure.py script.
+
+        Args:
+            curriculum_json_data: Mapping whose keys unpack into
+                ``(grade, is_arabic, subject)`` and whose values are
+                JSON-serialisable curriculum payloads.
+
+        Each entry is committed on its own, so a failing entry is rolled back
+        and reported without discarding entries that were already ingested.
+        """
+        print("Inserting curriculum structure data...")
+        upsert_query = """
+            INSERT INTO curriculum_structure (grade, is_arabic, subject, curriculum_data)
+            VALUES (%s, %s, %s, %s)
+            ON CONFLICT (grade, is_arabic, subject)
+            DO UPDATE SET curriculum_data = EXCLUDED.curriculum_data;
+        """
+        # Use the connection pool for thread safety
+        with self.pool_handler.get_connection() as conn:
+            with conn.cursor() as cur:
+                for (grade, is_arabic, subject), curriculum in curriculum_json_data.items():
+                    try:
+                        cur.execute(
+                            upsert_query,
+                            (grade, is_arabic, subject, json.dumps(curriculum))
+                        )
+                        # Commit per item: a later rollback() undoes everything
+                        # since the last commit, so a single end-of-loop commit
+                        # would lose every item preceding a failure.
+                        conn.commit()
+                        print(f"✅ Ingested structure for Grade {grade} ({'Arabic' if is_arabic else 'English'})")
+                    except Exception as e:
+                        print(f"❌ Error ingesting structure for Grade {grade}: {e}")
+                        conn.rollback()  # Discard only this item's failed work
+        print("Curriculum structure ingestion complete.")
+
+    def ingest_embeddings_from_csv(self, df: pd.DataFrame):
+        """
+        Insert embedding chunks from a pandas DataFrame into ``educational_chunks``.
+        This logic is adapted from the insert_csv_embeddings.py script.
+
+        Args:
+            df: DataFrame with the columns "Grade", "Subject", "From page",
+                "To page", "Chunk index", "Chunk text", "Is Arabic" and
+                "Embedding"; "Unit", "Concept" and "Lesson" are optional.
+
+        Rows that cannot be converted are skipped with a message. The valid
+        rows are inserted in one transaction, which is rolled back (and the
+        error re-raised) if the batch insert fails.
+        """
+        # ``import psycopg2`` alone does not guarantee that the ``extras``
+        # submodule is loaded, so import the helper explicitly.
+        from psycopg2.extras import execute_batch
+
+        print("Inserting embeddings from CSV data...")
+        insert_query = """
+            INSERT INTO educational_chunks
+            (grade, subject, unit, concept, lesson, from_page, to_page, chunk_index, chunk_text, is_arabic, embedding)
+            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
+        """
+        records_to_insert = []
+        for _, row in df.iterrows():
+            try:
+                # The embedding may arrive as a JSON string or as a ready-made sequence
+                embedding = json.loads(row["Embedding"]) if isinstance(row["Embedding"], str) else row["Embedding"]
+                records_to_insert.append((
+                    int(row["Grade"]), row["Subject"],
+                    self._none_if_missing(row.get("Unit")),
+                    self._none_if_missing(row.get("Concept")),
+                    self._none_if_missing(row.get("Lesson")),
+                    int(row["From page"]), int(row["To page"]),
+                    # NOTE(review): bool() treats any non-empty string (even
+                    # "False") as True - confirm "Is Arabic" is never textual.
+                    int(row["Chunk index"]), row["Chunk text"], bool(row["Is Arabic"]),
+                    embedding
+                ))
+            except Exception as e:
+                print(f"Skipping row due to malformed data: {e}")
+        if not records_to_insert:
+            print("No valid records to insert.")
+            return
+        with self.pool_handler.get_connection() as conn:
+            try:
+                with conn.cursor() as cur:
+                    # Use execute_batch for efficient insertion
+                    execute_batch(cur, insert_query, records_to_insert)
+                conn.commit()
+            except Exception:
+                # Do not leave a failed transaction open on this connection.
+                conn.rollback()
+                raise
+        print(f"✅ Ingested {len(records_to_insert)} embedding chunks successfully.")
\ No newline at end of file
--- a/self_hosted_env/voice_agent/services/pgvector_service.py
+++ b/self_hosted_env/voice_agent/services/pgvector_service.py
@@ -483,4 +483,44 @@ class PGVectorService:
                    FROM curriculum_structure
                    ORDER BY grade, is_arabic, subject;
                """)
                return cur.fetchall()
\ No newline at end of file
+    def verify_recent_insertions(self, limit: int = 5):
+        """
+        Print the newest rows of ``educational_chunks`` as a post-ingestion check.
+
+        Selects up to ``limit`` rows ordered by descending ``id`` (presumably
+        the most recently inserted ones) and prints a short summary of each.
+        Any exception is caught and reported on stdout; nothing is returned.
+        """
+        divider = "="*50
+        print("\n" + divider)
+        print("🔍 Verifying recent embeddings in the database...")
+        print(divider)
+        try:
+            with self.pool_handler.get_connection() as conn:
+                with conn.cursor(cursor_factory=RealDictCursor) as cur:
+                    # Highest ids first, capped at `limit` rows
+                    cur.execute(
+                        """
+                        SELECT id, grade, subject, unit, concept, chunk_text, is_arabic
+                        FROM educational_chunks
+                        ORDER BY id DESC
+                        LIMIT %s;
+                        """,
+                        (limit,)
+                    )
+                    recent_rows = cur.fetchall()
+                    if not recent_rows:
+                        print("❌ No data found in the 'educational_chunks' table.")
+                        return
+                    print(f"✅ Found {len(recent_rows)} recent records. Here they are:\n")
+                    for record in recent_rows:
+                        # Only the first 80 characters of the chunk text are shown
+                        preview = record['chunk_text'][:80]
+                        print(f"  - ID: {record['id']}, Grade: {record['grade']}, Arabic: {record['is_arabic']}")
+                        print(f"    Unit: {record['unit']}")
+                        print(f"    Concept: {record['concept']}")
+                        print(f"    Text: '{preview}...'\n")
+            print(divider)
+        except Exception as e:
+            print(f"❌ Database verification failed: {e}")
\ No newline at end of file
--- a/self_hosted_env/voice_agent/static/curriculum_PDF_uploader.html
+++ b/self_hosted_env/voice_agent/static/curriculum_PDF_uploader.html
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Curriculum PDF Uploader</title>
+    <style>
+        body { 
+            font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; 
+            max-width: 700px; 
+            margin: 40px auto; 
+            padding: 20px; 
+            background-color: #f9f9f9; 
+            color: #333;
+            line-height: 1.6;
+        }
+        .container { 
+            background: white; 
+            padding: 30px; 
+            border-radius: 8px; 
+            box-shadow: 0 4px 15px rgba(0,0,0,0.1); 
+        }
+        h1 { 
+            text-align: center; 
+            color: #2c3e50;
+        }
+        input[type="file"] {
+            display: block;
+            margin-bottom: 20px;
+            border: 2px dashed #ccc;
+            padding: 20px;
+            border-radius: 5px;
+            width: 95%;
+            text-align: center;
+            cursor: pointer;
+        }
+        input[type="file"]::file-selector-button {
+            padding: 10px 15px;
+            border-radius: 5px;
+            border: none;
+            background-color: #3498db;
+            color: white;
+            cursor: pointer;
+            transition: background-color 0.2s;
+        }
+        input[type="file"]::file-selector-button:hover {
+            background-color: #2980b9;
+        }
+        button {
+            display: block;
+            width: 100%;
+            padding: 12px;
+            font-size: 16px;
+            font-weight: bold;
+            background: #27ae60;
+            color: white;
+            border: none;
+            border-radius: 5px;
+            cursor: pointer;
+            transition: background 0.2s;
+        }
+        button:hover {
+            background: #229954;
+        }
+        button:disabled {
+            background: #95a5a6;
+            cursor: not-allowed;
+        }
+        .status {
+            margin-top: 20px;
+            padding: 15px;
+            border-radius: 5px;
+            font-weight: bold;
+            display: none; /* Hidden by default */
+        }
+        .status.success { background-color: #d4edda; color: #155724; border: 1px solid #c3e6cb; }
+        .status.error { background-color: #f8d7da; color: #721c24; border: 1px solid #f5c6cb; }
+        .status.processing { background-color: #e7f3ff; color: #004085; border: 1px solid #b3d9ff; }
+        pre {
+            background-color: #ecf0f1;
+            padding: 15px;
+            border-radius: 5px;
+            white-space: pre-wrap;
+            word-wrap: break-word;
+        }
+    </style>
+</head>
+<body>
+    <div class="container">
+        <h1>Curriculum PDF Uploader</h1>
+        <div class="form-group" style="margin-bottom: 20px;">
+            <label for="gradeInput" style="display: block; margin-bottom: 5px; font-weight: bold;">Grade:</label>
+            <input type="number" id="gradeInput" value="4" style="width: 98%; padding: 10px; border: 1px solid #ccc; border-radius: 5px;">
+        </div>
+        <div class="form-group" style="margin-bottom: 20px;">
+            <label for="subjectInput" style="display: block; margin-bottom: 5px; font-weight: bold;">Subject:</label>
+            <input type="text" id="subjectInput" value="Science" style="width: 98%; padding: 10px; border: 1px solid #ccc; border-radius: 5px;">
+        </div>
+        <input type="file" id="pdfFile" accept=".pdf">
+        <button id="uploadButton">Upload and Process Curriculum</button>
+        <div id="status"></div>
+        <pre id="response" style="display:none;"></pre>
+    </div>
+<script>
+        // Endpoint that receives the multipart upload (file, grade, subject).
+        // NOTE(review): hard-coded to localhost:8000 - confirm this matches the deployed backend.
+        const API_URL = 'http://localhost:8000/process-curriculum';
+        // Cached references to the page elements used by the upload handler.
+        const pdfFileInput = document.getElementById('pdfFile');
+        const uploadButton = document.getElementById('uploadButton');
+        const statusDiv = document.getElementById('status');
+        const responsePre = document.getElementById('response');
+        const gradeInput = document.getElementById('gradeInput');
+        const subjectInput = document.getElementById('subjectInput'); // Subject text field
+        // Click handler: validate the form, POST it to API_URL, and report the outcome.
+        uploadButton.addEventListener('click', async () => {
+            const selectedFile = pdfFileInput.files[0];
+            const grade = gradeInput.value;
+            // Trim so that a whitespace-only subject is rejected as empty
+            const subject = subjectInput.value.trim();
+            // 1. Validate the inputs
+            if (!selectedFile) { showStatus('Please select a PDF file first.', 'error'); return; }
+            if (!grade) { showStatus('Please enter a grade.', 'error'); return; }
+            if (!subject) { showStatus('Please enter a subject.', 'error'); return; }
+            // 2. Build the multipart payload
+            const formData = new FormData();
+            formData.append('file', selectedFile);
+            formData.append('grade', grade);
+            formData.append('subject', subject);
+            // 3. Update UI to show processing state
+            showStatus('Uploading and starting background processing...', 'processing');
+            uploadButton.disabled = true;
+            responsePre.style.display = 'none';
+            try {
+                // 4. Send the file, grade and subject to the API
+                const response = await fetch(API_URL, {
+                    method: 'POST',
+                    body: formData,
+                });
+                // The body may not be JSON (e.g. an HTML error page); a parse
+                // failure must not hide the real HTTP status from the user.
+                let responseData = null;
+                try {
+                    responseData = await response.json();
+                } catch (parseError) {
+                    responseData = null;
+                }
+                // 5. Handle the server's response
+                if (!response.ok) {
+                    const detail = responseData ? responseData.detail : null;
+                    // `detail` may be a structured value; stringify it rather
+                    // than showing "[object Object]".
+                    const detailText = (detail && typeof detail !== 'string') ? JSON.stringify(detail) : detail;
+                    throw new Error(detailText || `Server error: ${response.status} ${response.statusText}`);
+                }
+                showStatus('Success! The server has started processing your file in the background.', 'success');
+                if (responseData !== null) {
+                    responsePre.textContent = JSON.stringify(responseData, null, 2);
+                    responsePre.style.display = 'block';
+                }
+            } catch (error) {
+                showStatus(`An error occurred: ${error.message}`, 'error');
+            } finally {
+                // 6. Re-enable the button
+                uploadButton.disabled = false;
+            }
+        });
+        /**
+         * Display a message in the status box.
+         * @param {string} message - Text to show to the user.
+         * @param {string} type - Modifier class appended after "status"
+         *     (the stylesheet defines "success", "error" and "processing").
+         */
+        function showStatus(message, type) {
+            const cssClasses = `status ${type}`;
+            statusDiv.style.display = 'block';
+            statusDiv.className = cssClasses;
+            statusDiv.textContent = message;
+        }
+    </script>
+</body>
+</html>
\ No newline at end of file
--- a/self_hosted_env/voice_agent/voice_agent.tar
+++ b/self_hosted_env/voice_agent/voice_agent.tar