integrate pdf upload

parent a558073c
......@@ -61,8 +61,13 @@ services:
DB_HOST: "${DB_HOST}"
TTS_PROVIDER: "${TTS_PROVIDER}"
CUSTOM_TTS_URL: "${CUSTOM_TTS_URL}"
GEMINI_API_KEY: "${GEMINI_API_KEY}"
REDIS_HOST: "redis"
REDIS_PORT: "6379"
volumes:
- ./voice_agent/embeddings:/app/embeddings
- ./voice_agent/All_Curriculums_grouped.json:/app/All_Curriculums_grouped.json
depends_on:
- minio
- postgres
......@@ -71,4 +76,4 @@ services:
volumes:
pgdata:
miniodata:
redisdata:
\ No newline at end of file
redisdata:
......@@ -3,8 +3,12 @@ FROM python:3.10-slim
WORKDIR /app
# Install system packages: postgresql-client (for pg_isready), Tesseract OCR with Arabic language data, and Poppler utilities
RUN apt-get update && apt-get install -y \
RUN apt-get update && apt-get install -y --no-install-recommends \
postgresql-client \
tesseract-ocr \
tesseract-ocr-ara \
libtesseract-dev \
poppler-utils \
&& rm -rf /var/lib/apt/lists/*
# Install Python dependencies
......
This diff is collapsed.
This diff is collapsed.
......@@ -12,4 +12,14 @@ pandas
python-dotenv
httpx
langdetect
redis
\ No newline at end of file
redis
pdf2image
pytesseract
fuzzywuzzy
python-Levenshtein
tqdm
google-generativeai
pydantic
opencv-python-headless
numpy
Pillow
\ No newline at end of file
......@@ -9,4 +9,5 @@ from .pgvector_service import PGVectorService
from .chat_database_service import ChatDatabaseService
from .connection_pool import ConnectionPool
from .pedagogy_service import PedagogyService
from .segmentation_service import LanguageSegmentationService
\ No newline at end of file
from .segmentation_service import LanguageSegmentationService
from .data_ingestion_service import DataIngestionService
\ No newline at end of file
import psycopg2
import pandas as pd
import json
from pgvector.psycopg2 import register_vector
from typing import Dict
class DataIngestionService:
    """A service dedicated to inserting new curriculum data into the database.

    All database access goes through ``pool_handler``, which must expose a
    ``get_connection()`` method usable as a context manager that yields a
    psycopg2-style connection (``cursor()``, ``commit()``, ``rollback()``).
    """

    # Textual spellings treated as False when the "Is Arabic" column arrives
    # as a string; plain bool("False") would otherwise evaluate to True.
    _FALSE_STRINGS = frozenset({"", "0", "false", "f", "no", "n"})

    def __init__(self, pool_handler):
        """Store the pool handler used to obtain database connections."""
        self.pool_handler = pool_handler

    def ingest_curriculum_structure(self, curriculum_json_data: Dict):
        """Upsert curriculum structure entries into ``curriculum_structure``.

        Adapted from the standalone curriculum_structure.py script.

        Args:
            curriculum_json_data: Mapping whose keys are
                ``(grade, is_arabic, subject)`` tuples and whose values are
                JSON-serializable curriculum payloads.

        Each entry is committed on its own. A failing entry is rolled back and
        reported without discarding entries that were already ingested
        (previously a single rollback wiped every earlier, already-reported
        insert because the whole loop shared one transaction).
        """
        print("Inserting curriculum structure data...")
        upsert_query = """
            INSERT INTO curriculum_structure (grade, is_arabic, subject, curriculum_data)
            VALUES (%s, %s, %s, %s)
            ON CONFLICT (grade, is_arabic, subject)
            DO UPDATE SET curriculum_data = EXCLUDED.curriculum_data;
        """
        # Borrow a connection from the pool handler for the whole run.
        with self.pool_handler.get_connection() as conn:
            with conn.cursor() as cur:
                for (grade, is_arabic, subject), curriculum in curriculum_json_data.items():
                    try:
                        cur.execute(
                            upsert_query,
                            (grade, is_arabic, subject, json.dumps(curriculum)),
                        )
                        # Commit per item so a later rollback cannot undo it.
                        conn.commit()
                    except Exception as e:
                        print(f"❌ Error ingesting structure for Grade {grade}: {e}")
                        conn.rollback()  # Only the failed item is discarded
                    else:
                        print(f"✅ Ingested structure for Grade {grade} ({'Arabic' if is_arabic else 'English'})")
        print("Curriculum structure ingestion complete.")

    def ingest_embeddings_from_csv(self, df: pd.DataFrame):
        """Insert embedding rows from a DataFrame into ``educational_chunks``.

        Adapted from the standalone insert_csv_embeddings.py script.

        Args:
            df: DataFrame with the columns "Grade", "Subject", "From page",
                "To page", "Chunk index", "Chunk text", "Is Arabic" and
                "Embedding"; "Unit", "Concept" and "Lesson" are optional.

        Rows that cannot be converted are skipped with a message. If no row
        survives, nothing is written and no connection is requested.

        Raises:
            Exception: Whatever the batch insert or commit raises, re-raised
                after the transaction has been rolled back.
        """
        print("Inserting embeddings from CSV data...")
        insert_query = """
            INSERT INTO educational_chunks
            (grade, subject, unit, concept, lesson, from_page, to_page, chunk_index, chunk_text, is_arabic, embedding)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """
        records_to_insert = []
        for _, row in df.iterrows():
            try:
                records_to_insert.append(self._row_to_record(row))
            except Exception as e:
                print(f"Skipping row due to malformed data: {e}")

        if not records_to_insert:
            print("No valid records to insert.")
            return

        # ``import psycopg2`` alone does not guarantee that the ``extras``
        # submodule is loaded, so import the helper explicitly.
        from psycopg2.extras import execute_batch

        with self.pool_handler.get_connection() as conn:
            with conn.cursor() as cur:
                try:
                    # execute_batch groups the statements for efficient insertion.
                    execute_batch(cur, insert_query, records_to_insert)
                    conn.commit()
                except Exception:
                    # Do not leave a failed transaction open on the connection.
                    conn.rollback()
                    raise
        print(f"✅ Ingested {len(records_to_insert)} embedding chunks successfully.")

    @classmethod
    def _row_to_record(cls, row):
        """Convert one DataFrame row into the parameter tuple for the insert."""
        raw_embedding = row["Embedding"]
        # The embedding may arrive as a JSON-encoded string or as a sequence.
        embedding = json.loads(raw_embedding) if isinstance(raw_embedding, str) else raw_embedding
        return (
            int(row["Grade"]), row["Subject"], row.get("Unit"), row.get("Concept"),
            row.get("Lesson"), int(row["From page"]), int(row["To page"]),
            int(row["Chunk index"]), row["Chunk text"], cls._to_bool(row["Is Arabic"]),
            embedding,
        )

    @classmethod
    def _to_bool(cls, value) -> bool:
        """Interpret a flag value, treating textual 'false'/'0'/'no' as False."""
        if isinstance(value, str):
            return value.strip().lower() not in cls._FALSE_STRINGS
        return bool(value)
\ No newline at end of file
......@@ -483,4 +483,44 @@ class PGVectorService:
FROM curriculum_structure
ORDER BY grade, is_arabic, subject;
""")
return cur.fetchall()
\ No newline at end of file
return cur.fetchall()
def verify_recent_insertions(self, limit: int = 5):
    """Print the newest rows of ``educational_chunks`` as an ingestion check.

    Args:
        limit: Maximum number of rows to show, taken in descending ``id``
            order.

    Nothing is returned; the outcome is reported on stdout. Any exception
    raised while querying or printing is caught and reported as a failed
    verification.
    """
    banner = "=" * 50
    print("\n" + banner)
    print("🔍 Verifying recent embeddings in the database...")
    print(banner)

    # Highest ids first, i.e. the most recently inserted chunks.
    recent_chunks_sql = """
        SELECT id, grade, subject, unit, concept, chunk_text, is_arabic
        FROM educational_chunks
        ORDER BY id DESC
        LIMIT %s;
    """
    try:
        with self.pool_handler.get_connection() as conn, \
                conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute(recent_chunks_sql, (limit,))
            recent_rows = cur.fetchall()

            if recent_rows:
                print(f"✅ Found {len(recent_rows)} recent records. Here they are:\n")
                for record in recent_rows:
                    print(f" - ID: {record['id']}, Grade: {record['grade']}, Arabic: {record['is_arabic']}")
                    print(f" Unit: {record['unit']}")
                    print(f" Concept: {record['concept']}")
                    # Only the first 80 characters of the chunk are shown.
                    print(f" Text: '{record['chunk_text'][:80]}...'\n")
                print(banner)
            else:
                print("❌ No data found in the 'educational_chunks' table.")
    except Exception as e:
        print(f"❌ Database verification failed: {e}")
\ No newline at end of file
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Curriculum PDF Uploader</title>
<style>
body {
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
max-width: 700px;
margin: 40px auto;
padding: 20px;
background-color: #f9f9f9;
color: #333;
line-height: 1.6;
}
.container {
background: white;
padding: 30px;
border-radius: 8px;
box-shadow: 0 4px 15px rgba(0,0,0,0.1);
}
h1 {
text-align: center;
color: #2c3e50;
}
input[type="file"] {
display: block;
margin-bottom: 20px;
border: 2px dashed #ccc;
padding: 20px;
border-radius: 5px;
width: 95%;
text-align: center;
cursor: pointer;
}
input[type="file"]::file-selector-button {
padding: 10px 15px;
border-radius: 5px;
border: none;
background-color: #3498db;
color: white;
cursor: pointer;
transition: background-color 0.2s;
}
input[type="file"]::file-selector-button:hover {
background-color: #2980b9;
}
button {
display: block;
width: 100%;
padding: 12px;
font-size: 16px;
font-weight: bold;
background: #27ae60;
color: white;
border: none;
border-radius: 5px;
cursor: pointer;
transition: background 0.2s;
}
button:hover {
background: #229954;
}
button:disabled {
background: #95a5a6;
cursor: not-allowed;
}
.status {
margin-top: 20px;
padding: 15px;
border-radius: 5px;
font-weight: bold;
display: none; /* Hidden by default */
}
.status.success { background-color: #d4edda; color: #155724; border: 1px solid #c3e6cb; }
.status.error { background-color: #f8d7da; color: #721c24; border: 1px solid #f5c6cb; }
.status.processing { background-color: #e7f3ff; color: #004085; border: 1px solid #b3d9ff; }
pre {
background-color: #ecf0f1;
padding: 15px;
border-radius: 5px;
white-space: pre-wrap;
word-wrap: break-word;
}
</style>
</head>
<body>
<div class="container">
<h1>Curriculum PDF Uploader</h1>
<div class="form-group" style="margin-bottom: 20px;">
<label for="gradeInput" style="display: block; margin-bottom: 5px; font-weight: bold;">Grade:</label>
<input type="number" id="gradeInput" value="4" style="width: 98%; padding: 10px; border: 1px solid #ccc; border-radius: 5px;">
</div>
<div class="form-group" style="margin-bottom: 20px;">
<label for="subjectInput" style="display: block; margin-bottom: 5px; font-weight: bold;">Subject:</label>
<input type="text" id="subjectInput" value="Science" style="width: 98%; padding: 10px; border: 1px solid #ccc; border-radius: 5px;">
</div>
<input type="file" id="pdfFile" accept=".pdf">
<button id="uploadButton">Upload and Process Curriculum</button>
<div id="status"></div>
<pre id="response" style="display:none;"></pre>
</div>
<script>
// Endpoint that receives the uploaded PDF together with its grade and subject.
const API_URL = 'http://localhost:8000/process-curriculum';

// Resolve every element the script interacts with in one pass; the order of
// the ids matches the order of the destructured names.
const [
    pdfFileInput,
    uploadButton,
    statusDiv,
    responsePre,
    gradeInput,
    subjectInput,
] = ['pdfFile', 'uploadButton', 'status', 'response', 'gradeInput', 'subjectInput']
    .map((elementId) => document.getElementById(elementId));
// Click handler: validate the form, POST the PDF with its grade and subject to
// API_URL, and report the outcome in the status box and the response <pre>.
uploadButton.addEventListener('click', async () => {
    const selectedFile = pdfFileInput.files[0];
    const grade = gradeInput.value.trim();
    // Trim so a whitespace-only subject is rejected instead of being sent.
    const subject = subjectInput.value.trim();

    // 1. Validate all inputs before touching the network.
    if (!selectedFile) { showStatus('Please select a PDF file first.', 'error'); return; }
    if (!grade) { showStatus('Please enter a grade.', 'error'); return; }
    if (!subject) { showStatus('Please enter a subject.', 'error'); return; }

    // 2. Build the multipart payload.
    const formData = new FormData();
    formData.append('file', selectedFile);
    formData.append('grade', grade);
    formData.append('subject', subject);

    // 3. Update the UI to show the processing state.
    showStatus('Uploading and starting background processing...', 'processing');
    uploadButton.disabled = true;
    responsePre.style.display = 'none';

    try {
        // 4. Send the file, grade and subject to the API.
        const response = await fetch(API_URL, {
            method: 'POST',
            body: formData,
        });

        // Read the body as text first: an error response may not be JSON
        // (e.g. an HTML error page), and parsing it blindly would replace the
        // real HTTP failure with a confusing JSON syntax error.
        const rawBody = await response.text();
        let responseData = null;
        try {
            responseData = rawBody ? JSON.parse(rawBody) : null;
        } catch (parseError) {
            responseData = null;
        }

        // 5. Handle the server's response.
        if (!response.ok) {
            const detail = responseData && responseData.detail;
            let reason;
            if (typeof detail === 'string' && detail) {
                reason = detail;
            } else if (detail) {
                // A structured detail would otherwise render as "[object Object]".
                reason = JSON.stringify(detail);
            } else {
                reason = `Server error: ${response.status} ${response.statusText}`.trim();
            }
            throw new Error(reason);
        }

        showStatus('Success! The server has started processing your file in the background.', 'success');
        responsePre.textContent = responseData !== null
            ? JSON.stringify(responseData, null, 2)
            : rawBody;
        responsePre.style.display = 'block';
    } catch (error) {
        showStatus(`An error occurred: ${error.message}`, 'error');
    } finally {
        // 6. Re-enable the button whether the upload succeeded or failed.
        uploadButton.disabled = false;
    }
});
/**
 * Display a message in the status box.
 *
 * @param {string} message Text to show to the user.
 * @param {string} type    Appended to the "status" class name; the script
 *                         passes 'success', 'error' or 'processing'.
 */
function showStatus(message, type) {
    Object.assign(statusDiv, {
        textContent: message,
        className: `status ${type}`,
    });
    // The .status rule hides the box by default, so reveal it explicitly.
    statusDiv.style.display = 'block';
}
</script>
</body>
</html>
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment