Commit 3d886bb2 authored by arwa mohamed

Add embedding scripts with Arabic/English support

parent 91855fed
import pandas as pd
import numpy as np
import os
import psycopg2
import openai
from psycopg2.extras import execute_values
import re
from openai import OpenAI
from typing import List
import csv
import json
from dotenv import load_dotenv

load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")


def get_db_connection():
    return psycopg2.connect(
        dbname=os.getenv("POSTGRES_DB", "embeddings_db"),
        user=os.getenv("POSTGRES_USER", "db_admin"),
        password=os.getenv("POSTGRES_PASSWORD"),
        host=os.getenv("POSTGRES_HOST", "localhost"),
        port=os.getenv("POSTGRES_PORT", 5432)
    )
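
# Expected .env entries for the calls above (the values shown are placeholders,
# not the project's actual configuration):
#   OPENAI_API_KEY=sk-...
#   POSTGRES_DB=embeddings_db
#   POSTGRES_USER=db_admin
#   POSTGRES_PASSWORD=change-me
#   POSTGRES_HOST=localhost
#   POSTGRES_PORT=5432
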
def chunk_text(text, chunk_size=500, overlap=50):
    """Split text into fixed-size character windows with a small overlap."""
    text = str(text).strip()
    if not text:
        return [""]
    chunks = []
    start = 0
    while start < len(text):
        end = min(len(text), start + chunk_size)
        chunks.append(text[start:end])
        if end == len(text):
            break  # stop once the tail is reached, otherwise the overlap loops forever
        start = end - overlap
        if start < 0:
            start = 0
    return chunks


def get_embedding(text):
    """Generate an embedding for a single string with the OpenAI API."""
    response = openai.embeddings.create(
        model="text-embedding-3-small",
        input=text
    )
    return response.data[0].embedding


class EducationalContentProcessor:
    def __init__(self, api_key: str = None):
        if api_key is None:
            api_key = os.getenv('OPENAI_API_KEY')
        if not api_key:
            raise ValueError("OpenAI API key is required. Set the OPENAI_API_KEY environment variable or pass the api_key parameter.")
        self.client = OpenAI(api_key=api_key)
        self.embedding_model = "text-embedding-3-small"

    def chunk_text(self, text: str, chunk_size: int = 500) -> List[str]:
        """Split text into chunks of at most chunk_size words, on sentence boundaries."""
        if not text or pd.isna(text):
            return [""]
        text = str(text).strip()
        if not text:
            return [""]
        sentences = re.split(r'(?<=[.!؟])\s+', text)
        chunks = []
        current_chunk = []
        current_word_count = 0
        for sentence in sentences:
            sentence_words = len(sentence.split())
            if current_word_count + sentence_words > chunk_size and current_chunk:
                chunks.append(' '.join(current_chunk))
                current_chunk = [sentence]
                current_word_count = sentence_words
            else:
                current_chunk.append(sentence)
                current_word_count += sentence_words
        if current_chunk:
            chunks.append(' '.join(current_chunk))
        return chunks if chunks else [""]

    def get_embedding(self, text: str) -> List[float]:
        try:
            text = str(text).strip()
            if not text:
                text = "empty"
            response = self.client.embeddings.create(
                model=self.embedding_model,
                input=text,
                encoding_format="float"
            )
            return response.data[0].embedding
        except Exception as e:
            print(f"Error generating embedding: {str(e)}")
            return [0.0] * 1536  # zero-vector placeholder on failure
    def process_csv(self, input_file: str, output_file: str, chunk_size: int = 500, grade: str = None, subject: str = None):
        """Read a lessons CSV, chunk each lesson, embed every chunk, and write the result to a new CSV."""
        print(f"Reading CSV file: {input_file}")
        try:
            df = pd.read_csv(input_file)
            # Map Arabic column headers to their English equivalents.
            column_mapping = {
                "الوحدة": "Unit",
                "المفهوم": "Concept",
                "الدرس": "Lesson",
                "من صفحة": "From page",
                "إلى صفحة": "To page",
                "النص": "Lesson text",
            }
            df.rename(columns=column_mapping, inplace=True)
            required_columns = ['Unit', 'Concept', 'Lesson', 'From page', 'To page', 'Lesson text']
            missing_columns = [col for col in required_columns if col not in df.columns]
            if missing_columns:
                raise ValueError(f"Missing required columns: {missing_columns}")
            print(f"Found {len(df)} rows in input file")
            output_rows = []
            for idx, row in df.iterrows():
                print(f"Processing row {idx + 1}/{len(df)}: {row['Unit']} - {row['Concept']} - {row['Lesson']}")
                lesson_text = row['Lesson text']
                chunks = self.chunk_text(lesson_text, chunk_size)
                print(f"  Created {len(chunks)} chunks")
                for chunk_idx, chunk in enumerate(chunks):
                    print(f"  Generating embedding for chunk {chunk_idx + 1}/{len(chunks)}")
                    embedding = self.get_embedding(chunk)
                    output_row = {
                        'Grade': grade if grade is not None else row.get('Grade', None),
                        'Subject': subject if subject is not None else row.get('Subject', None),
                        'Unit': row['Unit'],
                        'Concept': row['Concept'],
                        'Lesson': row['Lesson'],
                        'From page': row['From page'],
                        'To page': row['To page'],
                        'Chunk index': chunk_idx,
                        'Chunk text': chunk,
                        'Is Arabic': False,
                        'Embedding': json.dumps(embedding)
                    }
                    output_rows.append(output_row)
            print(f"Saving {len(output_rows)} chunks to {output_file}")
            output_df = pd.DataFrame(output_rows)
            output_df.to_csv(output_file, index=False, quoting=csv.QUOTE_MINIMAL)
            print("Processing complete!")
        except Exception as e:
            print(f"Error processing file: {str(e)}")
            raise
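

# The main() below assumes that the `lessons` and `lesson_embeddings` tables
# already exist; neither is created in this commit. The helper below is a
# hypothetical one-off setup sketch (not called anywhere in this script): a plain
# float-array column is used so the Python lists passed to execute_values adapt
# without a pgvector type handler. Adjust to the project's real schema
# (for example pgvector's vector(1536)) as needed.
def create_embedding_tables():
    schema = """
    CREATE TABLE IF NOT EXISTS lessons (
        id SERIAL PRIMARY KEY,
        lesson_text TEXT
    );
    CREATE TABLE IF NOT EXISTS lesson_embeddings (
        id SERIAL PRIMARY KEY,
        lesson_id INTEGER REFERENCES lessons(id),
        chunk_index INTEGER,
        chunk_text TEXT,
        embedding DOUBLE PRECISION[]
    );
    """
    conn = get_db_connection()
    with conn.cursor() as cur:
        cur.execute(schema)
    conn.commit()
    conn.close()
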
def main():
    # Part 1: embed lesson text already stored in the database.
    conn = get_db_connection()
    cur = conn.cursor()
    print("Fetching lessons...")
    cur.execute("SELECT id, lesson_text FROM lessons WHERE lesson_text IS NOT NULL;")
    lessons = cur.fetchall()
    total_lessons = len(lessons)
    print(f"Found {total_lessons} lessons to process.")
    all_rows = []
    for idx, (lesson_id, lesson_text) in enumerate(lessons, start=1):
        chunks = chunk_text(lesson_text, chunk_size=500, overlap=50)
        for i, chunk in enumerate(chunks):
            embedding = get_embedding(chunk)
            all_rows.append((lesson_id, i, chunk, embedding))
        progress = (idx / total_lessons) * 100
        print(f"Lesson {idx}/{total_lessons} complete ({progress:.2f}% done, {len(chunks)} chunks)")
        # Stop after the first two lessons (test mode).
        if idx == 2:
            print("Stopping after first 2 lessons (test mode).")
            break
    if all_rows:
        query = """
            INSERT INTO lesson_embeddings (lesson_id, chunk_index, chunk_text, embedding)
            VALUES %s
        """
        execute_values(cur, query, all_rows)
        conn.commit()
    cur.close()
    conn.close()
    print(f"Inserted {len(all_rows)} embeddings into the database.")

    # Part 2: chunk and embed the English CSV export.
    processor = EducationalContentProcessor()
    input_file = r"../Data/english/prime6/output_units_lessons_prime6_EN.csv"
    output_file = "Prime6_en_chunked_with_embeddings.csv"
    processor.process_csv(input_file, output_file, chunk_size=500, grade="prime6", subject="Science")


if __name__ == "__main__":
    main()
import pandas as pd
import numpy as np
import os
import re
from openai import OpenAI
from typing import List
import csv
import json


class EducationalContentProcessor:
    def __init__(self, api_key: str = None):
        if api_key is None:
            api_key = os.getenv('OPENAI_API_KEY')
        if not api_key:
            raise ValueError("OpenAI API key is required. Set OPENAI_API_KEY in .env or pass api_key parameter.")
        self.client = OpenAI(api_key=api_key)
        self.embedding_model = "text-embedding-3-small"

    def chunk_text(self, text: str, chunk_size: int = 500, is_arabic: bool = False) -> List[str]:
        """Split text into chunks of at most chunk_size words, on sentence boundaries."""
        if not text or pd.isna(text):
            return [""]
        text = str(text).strip()
        if not text:
            return [""]
        # Arabic text may end sentences with the Arabic question mark (؟).
        if is_arabic:
            sentence_pattern = r'(?<=[.!?؟])\s+'
        else:
            sentence_pattern = r'(?<=[.!?])\s+'
        sentences = re.split(sentence_pattern, text)
        chunks, current_chunk, current_word_count = [], [], 0
        for sentence in sentences:
            sentence_words = len([w for w in sentence.split() if w.strip()])
            if current_word_count + sentence_words > chunk_size and current_chunk:
                chunks.append(' '.join(current_chunk))
                current_chunk = [sentence]
                current_word_count = sentence_words
            else:
                current_chunk.append(sentence)
                current_word_count += sentence_words
        if current_chunk:
            chunks.append(' '.join(current_chunk))
        return chunks if chunks else [""]

    def get_embedding(self, text: str) -> List[float]:
        try:
            text = str(text).strip()
            if not text:
                text = "empty"
            response = self.client.embeddings.create(
                model=self.embedding_model,
                input=text,
                encoding_format="float"
            )
            return response.data[0].embedding
        except Exception as e:
            print(f"Error generating embedding: {str(e)}")
            return [0.0] * 1536  # zero-vector placeholder on failure

    def detect_arabic_text(self, text: str) -> bool:
        """Return True when more than 30% of non-whitespace characters are Arabic."""
        if not text or pd.isna(text):
            return False
        text = str(text)
        arabic_chars, total_chars = 0, 0
        for char in text:
            if char.strip():
                total_chars += 1
                if ('\u0600' <= char <= '\u06FF') or ('\u0750' <= char <= '\u077F') \
                        or ('\u08A0' <= char <= '\u08FF') or ('\uFB50' <= char <= '\uFDFF') \
                        or ('\uFE70' <= char <= '\uFEFF'):
                    arabic_chars += 1
        return total_chars > 0 and (arabic_chars / total_chars) > 0.3
    def process_csv(self, input_file: str, output_file: str, subject: str, grade: str, chunk_size: int = 500, is_arabic: bool = None):
        """Read a lessons CSV (English or Arabic headers), chunk, embed, and write the output CSV."""
        print(f"Reading CSV file: {input_file}")
        try:
            df = pd.read_csv(input_file, encoding="utf-8")
            # Accept either English or Arabic column headers.
            column_map = {
                "Unit": ["Unit", "الوحدة"],
                "Concept": ["Concept", "المفهوم"],
                "Lesson": ["Lesson", "الدرس"],
                "From page": ["From page", "من صفحة"],
                "To page": ["To page", "إلى صفحة"],
                "Lesson text": ["Lesson text", "النص"]
            }
            normalized = {}
            for std_name, aliases in column_map.items():
                for alias in aliases:
                    if alias in df.columns:
                        normalized[std_name] = df[alias]
                        break
                if std_name not in normalized:
                    normalized[std_name] = ""
            norm_df = pd.DataFrame(normalized)
            print(f"Found {len(norm_df)} rows in input file")
            output_rows = []
            for idx, row in norm_df.iterrows():
                print(f"Processing row {idx + 1}/{len(norm_df)}: {row['Unit']} - {row['Concept']} - {row['Lesson']}")
                lesson_text = row["Lesson text"]
                # Auto-detect the language unless the caller forces it.
                if is_arabic is None:
                    text_is_arabic = self.detect_arabic_text(lesson_text)
                else:
                    text_is_arabic = is_arabic
                chunks = self.chunk_text(lesson_text, chunk_size, is_arabic=text_is_arabic)
                print(f"  Created {len(chunks)} chunks")
                for chunk_idx, chunk in enumerate(chunks):
                    print(f"  Generating embedding for chunk {chunk_idx + 1}/{len(chunks)}")
                    embedding = self.get_embedding(chunk)
                    output_rows.append({
                        "Grade": grade,
                        "Subject": subject,
                        "Unit": row["Unit"],
                        "Concept": row["Concept"],
                        "Lesson": row["Lesson"],
                        "From page": row["From page"],
                        "To page": row["To page"],
                        "Chunk index": chunk_idx,
                        "Chunk text": chunk,
                        "Is Arabic": text_is_arabic,
                        "Embedding": json.dumps(embedding)
                    })
            output_df = pd.DataFrame(output_rows)
            output_df.to_csv(output_file, index=False, quoting=csv.QUOTE_MINIMAL, encoding="utf-8")
            print(f"Processing complete! Saved {len(output_rows)} chunks to {output_file}")
        except Exception as e:
            print(f"Error processing file: {str(e)}")
            raise
def main():
    processor = EducationalContentProcessor()
    input_file = r"../Data/arabic/prime4/output_units_lessons_prime4.csv"
    output_file = "prime4_ar_embeddings.csv"
    processor.process_csv(input_file, output_file, chunk_size=500, grade="prime4", subject="Science")


if __name__ == "__main__":
    main()
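
Each row of the output CSV stores its embedding as a JSON-encoded list, so downstream code has to decode that column before doing any vector math. A minimal sketch of loading the file back and ranking chunks against a query by cosine similarity (the file name comes from main() above; the query string is only an illustration):

import json
import numpy as np
import pandas as pd
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

df = pd.read_csv("prime4_ar_embeddings.csv")
# Decode the JSON-encoded "Embedding" column into an (n_chunks, 1536) matrix.
matrix = np.array([json.loads(e) for e in df["Embedding"]])

query = "ما هي حالات المادة؟"  # illustrative query: "What are the states of matter?"
q = np.array(
    client.embeddings.create(model="text-embedding-3-small", input=query).data[0].embedding
)

# Cosine similarity between the query vector and every chunk vector.
scores = matrix @ q / (np.linalg.norm(matrix, axis=1) * np.linalg.norm(q))
top = df.assign(score=scores).sort_values("score", ascending=False).head(3)
print(top[["Lesson", "Chunk index", "score"]])
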
import os
import psycopg2
import pandas as pd
import json
from dotenv import load_dotenv

load_dotenv()


def get_db_connection():
    return psycopg2.connect(
        dbname=os.getenv("POSTGRES_DB", "embeddings_db"),
        user=os.getenv("POSTGRES_USER", "db_admin"),
        password=os.getenv("POSTGRES_PASSWORD"),
        host=os.getenv("POSTGRES_HOST", "localhost"),
        port=os.getenv("POSTGRES_PORT", 5432)
    )


def insert_chunks_from_csv(csv_file: str):
    df = pd.read_csv(csv_file)
    required_cols = [
        "Grade", "Subject", "Unit", "Concept", "Lesson",
        "From page", "To page", "Chunk index", "Chunk text",
        "Is Arabic", "Embedding"
    ]
    for col in required_cols:
        if col not in df.columns:
            raise ValueError(f"Missing required column in CSV: {col}")
    conn = get_db_connection()
    cur = conn.cursor()
    insert_query = """
        INSERT INTO educational_chunks
            (grade, subject, unit, concept, lesson,
             from_page, to_page, chunk_index, chunk_text,
             is_arabic, embedding)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    """
    batch_size = 50
    buffer = []
    for idx, row in df.iterrows():
        try:
            embedding = json.loads(row["Embedding"])  # JSON string -> list of floats
            buffer.append((
                row["Grade"],
                row["Subject"],
                row.get("Unit"),
                row.get("Concept"),
                row.get("Lesson"),
                int(row["From page"]) if not pd.isna(row["From page"]) else None,
                int(row["To page"]) if not pd.isna(row["To page"]) else None,
                int(row["Chunk index"]),
                row["Chunk text"],
                bool(row["Is Arabic"]),
                embedding
            ))
        except Exception as e:
            print(f"Skipping row {idx} due to error: {e}")
            continue
        if len(buffer) >= batch_size:
            cur.executemany(insert_query, buffer)
            conn.commit()
            print(f"Inserted {len(buffer)} rows...")
            buffer = []
    if buffer:
        cur.executemany(insert_query, buffer)
        conn.commit()
        print(f"Inserted final {len(buffer)} rows.")
    cur.close()
    conn.close()
    print("All data inserted successfully.")
if __name__ == "__main__":
    csv_file = "Prime6_en_chunked_with_embeddings.csv"
    insert_chunks_from_csv(csv_file)