Commit 6d1a52f6 authored by Salma Mohammed Hamed

Merge branch 'add_extracted_data' into 'master'

Add extracted data

See merge request !3
parents 3e935489 3d886bb2
import pandas as pd
import numpy as np
import os
import re
from openai import OpenAI
from typing import List
import csv
import json
from dotenv import load_dotenv
load_dotenv()

class EducationalContentProcessor:
    def __init__(self, api_key: str = None):
        if api_key is None:
            api_key = os.getenv('OPENAI_API_KEY')
        if not api_key:
            raise ValueError("OpenAI API key is required. Set OPENAI_API_KEY environment variable or pass api_key parameter.")
        self.client = OpenAI(api_key=api_key)
        self.embedding_model = "text-embedding-3-small"
    def chunk_text(self, text: str, chunk_size: int = 500) -> List[str]:
        """Split text into chunks of at most chunk_size words, breaking on sentence boundaries."""
        if not text or pd.isna(text):
            return [""]
        text = str(text).strip()
        if not text:
            return [""]
        # Split on Latin and Arabic sentence-ending punctuation.
        sentences = re.split(r'(?<=[.!?؟])\s+', text)
        chunks = []
        current_chunk = []
        current_word_count = 0
        for sentence in sentences:
            sentence_words = len(sentence.split())
            if current_word_count + sentence_words > chunk_size and current_chunk:
                chunks.append(' '.join(current_chunk))
                current_chunk = [sentence]
                current_word_count = sentence_words
            else:
                current_chunk.append(sentence)
                current_word_count += sentence_words
        if current_chunk:
            chunks.append(' '.join(current_chunk))
        return chunks if chunks else [""]
    def get_embedding(self, text: str) -> List[float]:
        try:
            text = str(text).strip()
            if not text:
                text = "empty"
            response = self.client.embeddings.create(
                model=self.embedding_model,
                input=text,
                encoding_format="float"
            )
            return response.data[0].embedding
        except Exception as e:
            print(f"Error generating embedding: {str(e)}")
            return [0.0] * 1536  # zero-vector placeholder (dimension of text-embedding-3-small)
    def process_csv(self, input_file: str, output_file: str, chunk_size: int = 500, grade: str = None, subject: str = None):
        print(f"Reading CSV file: {input_file}")
        try:
            df = pd.read_csv(input_file)
            # Normalize Arabic column headers to their English equivalents.
            column_mapping = {
                "الوحدة": "Unit",
                "المفهوم": "Concept",
                "الدرس": "Lesson",
                "من صفحة": "From page",
                "إلى صفحة": "To page",
                "النص": "Lesson text",
            }
            df.rename(columns=column_mapping, inplace=True)
            required_columns = ['Unit', 'Concept', 'Lesson', 'From page', 'To page', 'Lesson text']
            missing_columns = [col for col in required_columns if col not in df.columns]
            if missing_columns:
                raise ValueError(f"Missing required columns: {missing_columns}")
            print(f"Found {len(df)} rows in input file")
            output_rows = []
            for idx, row in df.iterrows():
                print(f"Processing row {idx + 1}/{len(df)}: {row['Unit']} - {row['Concept']} - {row['Lesson']}")
                lesson_text = row['Lesson text']
                chunks = self.chunk_text(lesson_text, chunk_size)
                print(f" Created {len(chunks)} chunks")
                for chunk_idx, chunk in enumerate(chunks):
                    print(f" Generating embedding for chunk {chunk_idx + 1}/{len(chunks)}")
                    embedding = self.get_embedding(chunk)
                    output_row = {
                        'Grade': grade if grade is not None else row.get('Grade', None),
                        'Subject': subject if subject is not None else row.get('Subject', None),
                        'Unit': row['Unit'],
                        'Concept': row['Concept'],
                        'Lesson': row['Lesson'],
                        'From page': row['From page'],
                        'To page': row['To page'],
                        'Chunk index': chunk_idx,
                        'Chunk text': chunk,
                        'Is Arabic': False,
                        'Embedding': json.dumps(embedding)
                    }
                    output_rows.append(output_row)
            print(f"Saving {len(output_rows)} chunks to {output_file}")
            output_df = pd.DataFrame(output_rows)
            output_df.to_csv(output_file, index=False, quoting=csv.QUOTE_MINIMAL)
            print("Processing complete!")
        except Exception as e:
            print(f"Error processing file: {str(e)}")
            raise

def main():
    processor = EducationalContentProcessor()
    input_file = r"../Data/english/prime6/output_units_lessons_prime6_EN.csv"
    output_file = "Prime6_en_chunked_with_embeddings.csv"
    processor.process_csv(input_file, output_file, chunk_size=500, grade="prime6", subject="Science")

if __name__ == "__main__":
    main()
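
The Embedding column is written with json.dumps, so any consumer of the generated CSV has to decode it before the vectors are usable. Below is a minimal sketch of reading the output back and comparing two chunks, assuming the Prime6_en_chunked_with_embeddings.csv produced by main() above; the load_chunk_embeddings helper and the cosine-similarity check are illustrative, not part of this merge request.

import json
import numpy as np
import pandas as pd

def load_chunk_embeddings(path: str) -> pd.DataFrame:
    # Read the chunked CSV and decode the JSON-encoded Embedding column into lists of floats.
    df = pd.read_csv(path)
    df["Embedding"] = df["Embedding"].apply(json.loads)
    return df

if __name__ == "__main__":
    chunks = load_chunk_embeddings("Prime6_en_chunked_with_embeddings.csv")
    a = np.array(chunks.loc[0, "Embedding"])
    b = np.array(chunks.loc[1, "Embedding"])
    # Cosine similarity between the first two chunk embeddings.
    similarity = float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))
    print(f"Similarity between chunk 0 and chunk 1: {similarity:.4f}")
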
import pandas as pd
import numpy as np
import os
import re
from openai import OpenAI
from typing import List
import csv
import json
from dotenv import load_dotenv

load_dotenv()

class EducationalContentProcessor:
    def __init__(self, api_key: str = None):
        if api_key is None:
            api_key = os.getenv('OPENAI_API_KEY')
        if not api_key:
            raise ValueError("OpenAI API key is required. Set OPENAI_API_KEY in .env or pass api_key parameter.")
        self.client = OpenAI(api_key=api_key)
        self.embedding_model = "text-embedding-3-small"
    def chunk_text(self, text: str, chunk_size: int = 500, is_arabic: bool = False) -> List[str]:
        if not text or pd.isna(text):
            return [""]
        text = str(text).strip()
        if not text:
            return [""]
        if is_arabic:
            sentence_pattern = r'(?<=[.!?؟])\s+'
        else:
            sentence_pattern = r'(?<=[.!?])\s+'
        sentences = re.split(sentence_pattern, text)
        chunks, current_chunk, current_word_count = [], [], 0
        for sentence in sentences:
            sentence_words = len([w for w in sentence.split() if w.strip()])
            if current_word_count + sentence_words > chunk_size and current_chunk:
                chunks.append(' '.join(current_chunk))
                current_chunk = [sentence]
                current_word_count = sentence_words
            else:
                current_chunk.append(sentence)
                current_word_count += sentence_words
        if current_chunk:
            chunks.append(' '.join(current_chunk))
        return chunks if chunks else [""]
    def get_embedding(self, text: str) -> List[float]:
        try:
            text = str(text).strip()
            if not text:
                text = "empty"
            response = self.client.embeddings.create(
                model=self.embedding_model,
                input=text,
                encoding_format="float"
            )
            return response.data[0].embedding
        except Exception as e:
            print(f"Error generating embedding: {str(e)}")
            return [0.0] * 1536  # zero-vector placeholder (dimension of text-embedding-3-small)
    def detect_arabic_text(self, text: str) -> bool:
        """Heuristic: treat the text as Arabic if more than 30% of its non-space characters are Arabic."""
        if not text or pd.isna(text):
            return False
        text = str(text)
        arabic_chars, total_chars = 0, 0
        for char in text:
            if char.strip():
                total_chars += 1
                if ('\u0600' <= char <= '\u06FF') or ('\u0750' <= char <= '\u077F') \
                        or ('\u08A0' <= char <= '\u08FF') or ('\uFB50' <= char <= '\uFDFF') \
                        or ('\uFE70' <= char <= '\uFEFF'):
                    arabic_chars += 1
        return total_chars > 0 and (arabic_chars / total_chars) > 0.3
    def process_csv(self, input_file: str, output_file: str, subject: str, grade: str, chunk_size: int = 500, is_arabic: bool = None):
        print(f"Reading CSV file: {input_file}")
        try:
            df = pd.read_csv(input_file, encoding="utf-8")
            # Accept either English or Arabic column headers.
            column_map = {
                "Unit": ["Unit", "الوحدة"],
                "Concept": ["Concept", "المفهوم"],
                "Lesson": ["Lesson", "الدرس"],
                "From page": ["From page", "من صفحة"],
                "To page": ["To page", "إلى صفحة"],
                "Lesson text": ["Lesson text", "النص"]
            }
            normalized = {}
            for std_name, aliases in column_map.items():
                for alias in aliases:
                    if alias in df.columns:
                        normalized[std_name] = df[alias]
                        break
                if std_name not in normalized:
                    normalized[std_name] = ""
            norm_df = pd.DataFrame(normalized)
            print(f"Found {len(norm_df)} rows in input file")
            output_rows = []
            for idx, row in norm_df.iterrows():
                print(f"Processing row {idx+1}/{len(norm_df)}: {row['Unit']} - {row['Concept']} - {row['Lesson']}")
                lesson_text = row["Lesson text"]
                # Detect the language per row unless the caller forces it.
                if is_arabic is None:
                    text_is_arabic = self.detect_arabic_text(lesson_text)
                else:
                    text_is_arabic = is_arabic
                chunks = self.chunk_text(lesson_text, chunk_size, is_arabic=text_is_arabic)
                print(f" Created {len(chunks)} chunks")
                for chunk_idx, chunk in enumerate(chunks):
                    print(f" Generating embedding for chunk {chunk_idx+1}/{len(chunks)}")
                    embedding = self.get_embedding(chunk)
                    output_rows.append({
                        "Grade": grade,
                        "Subject": subject,
                        "Unit": row["Unit"],
                        "Concept": row["Concept"],
                        "Lesson": row["Lesson"],
                        "From page": row["From page"],
                        "To page": row["To page"],
                        "Chunk index": chunk_idx,
                        "Chunk text": chunk,
                        "Is Arabic": text_is_arabic,
                        "Embedding": json.dumps(embedding)
                    })
            output_df = pd.DataFrame(output_rows)
            output_df.to_csv(output_file, index=False, quoting=csv.QUOTE_MINIMAL, encoding="utf-8")
            print(f"Processing complete! Saved {len(output_rows)} chunks to {output_file}")
        except Exception as e:
            print(f"Error processing file: {str(e)}")
            raise

def main():
    processor = EducationalContentProcessor()
    input_file = r"../Data/arabic/prime4/output_units_lessons_prime4.csv"
    output_file = "prime4_ar_embeddings.csv"
    processor.process_csv(input_file, output_file, chunk_size=500, grade="prime4", subject="Science")

if __name__ == "__main__":
    main()
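
This second script differs from the English one mainly in detect_arabic_text, which marks a row as Arabic when more than 30% of its non-whitespace characters fall in the main Arabic Unicode blocks, and in the sentence splitter, which then also breaks on the Arabic question mark (؟). Below is a quick standalone restatement of that ratio heuristic, runnable without an OpenAI key; the sample strings are illustrative only.

ARABIC_RANGES = [
    ("\u0600", "\u06FF"), ("\u0750", "\u077F"), ("\u08A0", "\u08FF"),
    ("\uFB50", "\uFDFF"), ("\uFE70", "\uFEFF"),
]

def looks_arabic(text: str, threshold: float = 0.3) -> bool:
    # Same ratio test as EducationalContentProcessor.detect_arabic_text above.
    chars = [c for c in text if c.strip()]
    if not chars:
        return False
    arabic = sum(1 for c in chars if any(lo <= c <= hi for lo, hi in ARABIC_RANGES))
    return arabic / len(chars) > threshold

if __name__ == "__main__":
    print(looks_arabic("ما هي حالات المادة؟"))             # True: almost every character is Arabic
    print(looks_arabic("What are the states of matter?"))  # False: no Arabic characters
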
import os
import psycopg2
import pandas as pd
import json
from dotenv import load_dotenv
load_dotenv()
def get_db_connection():
    return psycopg2.connect(
        dbname=os.getenv("POSTGRES_DB", "embeddings_db"),
        user=os.getenv("POSTGRES_USER", "db_admin"),
        password=os.getenv("POSTGRES_PASSWORD"),
        host=os.getenv("POSTGRES_HOST", "localhost"),
        port=os.getenv("POSTGRES_PORT", 5432)
    )

def insert_chunks_from_csv(csv_file: str):
    df = pd.read_csv(csv_file)
    required_cols = [
        "Grade", "Subject", "Unit", "Concept", "Lesson",
        "From page", "To page", "Chunk index", "Chunk text",
        "Is Arabic", "Embedding"
    ]
    for col in required_cols:
        if col not in df.columns:
            raise ValueError(f"Missing required column in CSV: {col}")
    conn = get_db_connection()
    cur = conn.cursor()
    insert_query = """
        INSERT INTO educational_chunks
        (grade, subject, unit, concept, lesson,
         from_page, to_page, chunk_index, chunk_text,
         is_arabic, embedding)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    """
    batch_size = 50
    buffer = []
    for idx, row in df.iterrows():
        try:
            embedding = json.loads(row["Embedding"])  # JSON → list
            buffer.append((
                row["Grade"],
                row["Subject"],
                row.get("Unit"),
                row.get("Concept"),
                row.get("Lesson"),
                int(row["From page"]) if not pd.isna(row["From page"]) else None,
                int(row["To page"]) if not pd.isna(row["To page"]) else None,
                int(row["Chunk index"]),
                row["Chunk text"],
                bool(row["Is Arabic"]),
                embedding
            ))
        except Exception as e:
            print(f"Skipping row {idx} due to error: {e}")
            continue
        # Flush to the database in batches of batch_size rows.
        if len(buffer) >= batch_size:
            cur.executemany(insert_query, buffer)
            conn.commit()
            print(f"Inserted {len(buffer)} rows...")
            buffer = []
    if buffer:
        cur.executemany(insert_query, buffer)
        conn.commit()
        print(f"Inserted final {len(buffer)} rows.")
    cur.close()
    conn.close()
    print("All data inserted successfully.")

if __name__ == "__main__":
    csv_file = "Prime6_en_chunked_with_embeddings.csv"
    insert_chunks_from_csv(csv_file)
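
insert_chunks_from_csv assumes an educational_chunks table already exists; its DDL is not part of this merge request. Below is a minimal sketch of a compatible schema, assuming a plain DOUBLE PRECISION[] column for the embedding, since psycopg2 adapts the Python list decoded by json.loads to a Postgres array; a pgvector vector(1536) column would instead require casting or registering the vector type. Column names and types here are inferred from the INSERT statement above and should be treated as an assumption.

import os
import psycopg2
from dotenv import load_dotenv

load_dotenv()

# Hypothetical schema matching the columns used by insert_chunks_from_csv above.
CREATE_TABLE_SQL = """
CREATE TABLE IF NOT EXISTS educational_chunks (
    id          SERIAL PRIMARY KEY,
    grade       TEXT,
    subject     TEXT,
    unit        TEXT,
    concept     TEXT,
    lesson      TEXT,
    from_page   INTEGER,
    to_page     INTEGER,
    chunk_index INTEGER,
    chunk_text  TEXT,
    is_arabic   BOOLEAN,
    embedding   DOUBLE PRECISION[]
);
"""

if __name__ == "__main__":
    conn = psycopg2.connect(
        dbname=os.getenv("POSTGRES_DB", "embeddings_db"),
        user=os.getenv("POSTGRES_USER", "db_admin"),
        password=os.getenv("POSTGRES_PASSWORD"),
        host=os.getenv("POSTGRES_HOST", "localhost"),
        port=os.getenv("POSTGRES_PORT", 5432),
    )
    with conn, conn.cursor() as cur:
        cur.execute(CREATE_TABLE_SQL)
    conn.close()
    print("educational_chunks table is ready.")
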
import os
import psycopg2
import pandas as pd
from psycopg2.extras import execute_values
from dotenv import load_dotenv
load_dotenv()
def get_db_connection():
    return psycopg2.connect(
        dbname=os.getenv("POSTGRES_DB", "embeddings_db"),
        user=os.getenv("POSTGRES_USER", "db_admin"),
        password=os.getenv("POSTGRES_PASSWORD"),
        host=os.getenv("POSTGRES_HOST", "localhost"),
        port=os.getenv("POSTGRES_PORT", 5432)
    )

def insert_lessons_from_csv(file_path, conn, grade, subject):
    df = pd.read_csv(file_path)
    # Normalize Arabic column headers to English.
    df.rename(columns={
        "الوحدة": "Unit",
        "المفهوم": "Concept",
        "الدرس": "Lesson",
        "من صفحة": "From page",
        "إلى صفحة": "To page",
        "النص": "Lesson text"
    }, inplace=True)
    required_columns = ["Unit", "Concept", "Lesson", "From page", "To page", "Lesson text"]
    missing = [col for col in required_columns if col not in df.columns]
    if missing:
        print(f"⚠️ Missing columns in {file_path}: {', '.join(missing)}")
        return
    rows = []
    for _, row in df.iterrows():
        rows.append((
            grade,
            subject,
            row["Unit"],
            row["Concept"],
            row["Lesson"],
            row["From page"],
            row["To page"],
            row["Lesson text"]
        ))
    query = """
        INSERT INTO lessons (grade, subject, unit, concept, lesson, start_page, end_page, lesson_text)
        VALUES %s
    """
    with conn.cursor() as cur:
        execute_values(cur, query, rows)
    conn.commit()
    print(f"Inserted {len(rows)} rows from {os.path.basename(file_path)}")

def main():
    folder = input("Enter the path to the folder containing CSV files: ").strip()
    if not os.path.exists(folder):
        print("Folder not found.")
        return
    files = [f for f in os.listdir(folder) if f.endswith(".csv")]
    if not files:
        print("No CSV files found.")
        return
    print("Available files:")
    for i, f in enumerate(files, 1):
        print(f"{i}. {f}")
    selected = input("Enter the numbers of the files you want to import (e.g., 1 3 4): ").split()
    selected_files = [files[int(i) - 1] for i in selected]
    grade = input("Enter grade manually (e.g., Grade 5): ").strip()
    subject = input("Enter subject manually (default: Science): ").strip() or "Science"
    conn = get_db_connection()
    try:
        for f in selected_files:
            file_path = os.path.join(folder, f)
            insert_lessons_from_csv(file_path, conn, grade, subject)
        with conn.cursor() as cur:
            cur.execute("SELECT COUNT(*) FROM lessons;")
            total = cur.fetchone()[0]
            print(f"Total rows in lessons table: {total}")
    finally:
        conn.close()
        print("Connection closed.")

if __name__ == "__main__":
    main()