Delete generate_embeddings_ar.py

7914aadf · Salma Mohammed Hamed · 95189ab5 · 95189ab5
Commit 7914aadf authored Sep 15, 2025 by Salma Mohammed Hamed
Hide whitespace changes
Inline Side-by-side

Showing with 0 additions and 156 deletions

generate_embeddings_ar.py self_hosted_env/generate_embeddings_ar.py +0 -156

No files found.
--- a/self_hosted_env/generate_embeddings_ar.py
+++ b/self_hosted_env/generate_embeddings_ar.py
-import pandas as pd
-import numpy as np
-import os
-import re
-from openai import OpenAI
-from typing import List
-import csv
-import json
-class EducationalContentProcessor:
-    """Split lesson text from a CSV into word-bounded chunks and embed each chunk.
-    Embeddings are requested from the OpenAI API with the model named in
-    ``self.embedding_model``; results are written to a new CSV, one row per chunk.
-    """
-    # Length of the zero vector returned when an embedding request fails
-    # (1536 is the default output size of text-embedding-3-small).
-    EMBEDDING_DIMENSIONS = 1536
-    # Standard column name -> accepted header spellings (English first, then Arabic).
-    _COLUMN_ALIASES = {
-        "Unit": ["Unit", "الوحدة"],
-        "Concept": ["Concept", "المفهوم"],
-        "Lesson": ["Lesson", "الدرس"],
-        "From page": ["From page", "من صفحة"],
-        "To page": ["To page", "إلى صفحة"],
-        "Lesson text": ["Lesson text", "النص"]
-    }
-    # Inclusive code-point ranges counted as Arabic script. The last range stops
-    # at U+FEFC so that U+FEFF (the byte-order mark) is not counted as Arabic.
-    _ARABIC_RANGES = (
-        ('\u0600', '\u06FF'),
-        ('\u0750', '\u077F'),
-        ('\u08A0', '\u08FF'),
-        ('\uFB50', '\uFDFF'),
-        ('\uFE70', '\uFEFC'),
-    )
-    def __init__(self, api_key: str = None):
-        """Create the OpenAI client.
-        Args:
-            api_key: Explicit API key. When None, the OPENAI_API_KEY environment
-                variable is used instead.
-        Raises:
-            ValueError: If no key is passed and the environment variable is unset or empty.
-        """
-        if api_key is None:
-            api_key = os.getenv('OPENAI_API_KEY')
-        if not api_key:
-            raise ValueError("OpenAI API key is required. Set OPENAI_API_KEY in .env or pass api_key parameter.")
-        self.client = OpenAI(api_key=api_key)
-        self.embedding_model = "text-embedding-3-small"
-    def chunk_text(self, text: str, chunk_size: int = 500, is_arabic: bool = False) -> List[str]:
-        """Split text into chunks of at most ``chunk_size`` words.
-        Sentences are kept whole and packed greedily. A single sentence that is
-        longer than ``chunk_size`` words is cut into consecutive word windows
-        (joined with single spaces) so that no chunk exceeds the limit.
-        Args:
-            text: Text to split; None, NaN and blank values yield ``[""]``.
-            chunk_size: Maximum number of whitespace-separated words per chunk.
-            is_arabic: When True, the Arabic question mark also ends a sentence.
-        Returns:
-            The list of chunks, never empty.
-        """
-        if not text or pd.isna(text):
-            return [""]
-        text = str(text).strip()
-        if not text:
-            return [""]
-        if is_arabic:
-            sentence_pattern = r'(?<=[.!?؟])\s+'
-        else:
-            sentence_pattern = r'(?<=[.!?])\s+'
-        chunks, current_chunk, current_word_count = [], [], 0
-        for sentence in re.split(sentence_pattern, text):
-            words = sentence.split()
-            if chunk_size > 0 and len(words) > chunk_size:
-                # Over-long sentence (e.g. text without punctuation): flush what
-                # is pending, then emit fixed-size word windows.
-                if current_chunk:
-                    chunks.append(' '.join(current_chunk))
-                pieces = [' '.join(words[start:start + chunk_size])
-                          for start in range(0, len(words), chunk_size)]
-                *full_pieces, tail = pieces
-                chunks.extend(full_pieces)
-                # The short tail stays open so following sentences can join it.
-                current_chunk, current_word_count = [tail], len(tail.split())
-                continue
-            if current_chunk and current_word_count + len(words) > chunk_size:
-                chunks.append(' '.join(current_chunk))
-                current_chunk, current_word_count = [sentence], len(words)
-            else:
-                current_chunk.append(sentence)
-                current_word_count += len(words)
-        if current_chunk:
-            chunks.append(' '.join(current_chunk))
-        return chunks if chunks else [""]
-    def get_embedding(self, text: str) -> List[float]:
-        """Return the embedding of ``text``, or a zero vector if the request fails.
-        Blank input is embedded as the literal word "empty". Any exception is
-        printed and replaced by ``EMBEDDING_DIMENSIONS`` zeros so that one failed
-        chunk does not abort a long run; callers that need to detect failures
-        must check for an all-zero vector.
-        """
-        try:
-            cleaned = str(text).strip() or "empty"
-            response = self.client.embeddings.create(
-                model=self.embedding_model,
-                input=cleaned,
-                encoding_format="float"
-            )
-            return response.data[0].embedding
-        except Exception as e:
-            print(f"Error generating embedding: {str(e)}")
-            return [0.0] * self.EMBEDDING_DIMENSIONS
-    def detect_arabic_text(self, text: str) -> bool:
-        """Return True when more than 30% of the non-whitespace characters are Arabic.
-        None, NaN and blank values return False.
-        """
-        if not text or pd.isna(text):
-            return False
-        visible = [char for char in str(text) if char.strip()]
-        if not visible:
-            return False
-        arabic_chars = sum(
-            1 for char in visible
-            if any(low <= char <= high for low, high in self._ARABIC_RANGES)
-        )
-        return (arabic_chars / len(visible)) > 0.3
-    def _normalize_columns(self, df):
-        """Return a frame holding only the standard columns of ``df``.
-        Headers are compared after stripping surrounding whitespace; the first
-        matching alias wins and a column with no match is filled with "".
-        """
-        headers = {str(column).strip(): column for column in df.columns}
-        normalized = {}
-        for std_name, aliases in self._COLUMN_ALIASES.items():
-            matched = next((headers[alias] for alias in aliases if alias in headers), None)
-            normalized[std_name] = df[matched] if matched is not None else ""
-        # An explicit index keeps construction valid even when nothing matched
-        # and every value is the scalar "".
-        return pd.DataFrame(normalized, index=df.index)
-    def process_csv(self, input_file: str, output_file: str, subject: str, grade: int, chunk_size: int = 500, is_arabic: bool = None):
-        """Chunk and embed every lesson in ``input_file`` and write ``output_file``.
-        Args:
-            input_file: UTF-8 CSV (with or without a byte-order mark) using the
-                English or Arabic headers listed in ``_COLUMN_ALIASES``.
-            output_file: Destination CSV, one row per chunk; the embedding is
-                stored as a JSON array string in the "Embedding" column.
-            subject: Value copied into the "Subject" column of every row.
-            grade: Value copied unchanged into the "Grade" column of every row.
-            chunk_size: Maximum words per chunk, passed to ``chunk_text``.
-            is_arabic: Force the sentence-splitting mode; None detects it per row.
-        Raises:
-            Exception: Any read, processing or write error is printed and re-raised.
-        """
-        print(f"Reading CSV file: {input_file}")
-        try:
-            # utf-8-sig drops a leading byte-order mark that would otherwise be
-            # glued to the first header and stop it from matching its alias.
-            df = pd.read_csv(input_file, encoding="utf-8-sig")
-            norm_df = self._normalize_columns(df)
-            total_rows = len(norm_df)
-            print(f"Found {total_rows} rows in input file")
-            output_rows = []
-            # Count rows by position so the progress line does not depend on index labels.
-            for position, (_, row) in enumerate(norm_df.iterrows(), start=1):
-                print(f"Processing row {position}/{total_rows}: {row['Unit']} - {row['Concept']} - {row['Lesson']}")
-                lesson_text = row["Lesson text"]
-                if is_arabic is None:
-                    text_is_arabic = self.detect_arabic_text(lesson_text)
-                else:
-                    text_is_arabic = is_arabic
-                chunks = self.chunk_text(lesson_text, chunk_size, is_arabic=text_is_arabic)
-                print(f"  Created {len(chunks)} chunks")
-                for chunk_idx, chunk_text in enumerate(chunks):
-                    print(f"  Generating embedding for chunk {chunk_idx+1}/{len(chunks)}")
-                    embedding = self.get_embedding(chunk_text)
-                    output_rows.append({
-                        "Grade": grade,
-                        "Subject": subject,
-                        "Unit": row["Unit"],
-                        "Concept": row["Concept"],
-                        "Lesson": row["Lesson"],
-                        "From page": row["From page"],
-                        "To page": row["To page"],
-                        "Chunk index": chunk_idx,
-                        "Chunk text": chunk_text,
-                        "Is Arabic": text_is_arabic,
-                        "Embedding": json.dumps(embedding)
-                    })
-            output_df = pd.DataFrame(output_rows)
-            output_df.to_csv(output_file, index=False, quoting=csv.QUOTE_MINIMAL, encoding="utf-8")
-            print(f"Processing complete! Saved {len(output_rows)} chunks to {output_file}")
-        except Exception as e:
-            print(f"Error processing file: {str(e)}")
-            raise
-def main():
-    """Run the embedding pipeline on the hard-coded Prime 4 Arabic lesson CSV."""
-    source_csv = r"../Data/arabic/prime4/output_units_lessons_prime4.csv"
-    target_csv = "prime4_ar_embeddings.csv"
-    # No key is passed, so the processor reads OPENAI_API_KEY from the environment.
-    EducationalContentProcessor().process_csv(
-        source_csv,
-        target_csv,
-        subject="Science",
-        grade="prime4",
-        chunk_size=500,
-    )
-# Run the pipeline only when this file is executed as a script, not when imported.
-if __name__ == "__main__":
-    main()