Delete generate_embeddings.py

95189ab5 · Salma Mohammed Hamed · 7446c2b4 · 7446c2b4
Commit 95189ab5 authored Sep 15, 2025 by Salma Mohammed Hamed
Hide whitespace changes
Inline Side-by-side

Showing with 0 additions and 137 deletions

generate_embeddings.py self_hosted_env/generate_embeddings.py +0 -137

No files found.
--- a/self_hosted_env/generate_embeddings.py
+++ b/self_hosted_env/generate_embeddings.py
-import pandas as pd
-import numpy as np
-import os
-import re
-from openai import OpenAI
-from typing import List
-import csv
-import json
-from dotenv import load_dotenv
-load_dotenv()
-class EducationalContentProcessor:
-    """Split lesson text from a curriculum CSV into chunks and embed each chunk.
-
-    The processor reads a CSV with one lesson per row, cuts every lesson text
-    into sentence-aligned chunks of roughly ``chunk_size`` words, requests an
-    embedding for each chunk from the OpenAI API, and writes one output row
-    per chunk.
-    """
-
-    # Length of the all-zero placeholder returned when an embedding request
-    # fails. NOTE(review): assumed to equal the output size of
-    # ``embedding_model`` -- confirm if the model is ever changed.
-    EMBEDDING_DIM = 1536
-
-    # A sentence ends at whitespace preceded by '.', '!', '?' or the Arabic
-    # question mark. The ASCII '?' matters for English lessons: without it a
-    # question never terminates a sentence and gets glued to the next one.
-    _SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?؟])\s+')
-
-    # Column order of the generated CSV; also guarantees a header row when the
-    # input file contains no lessons at all.
-    _OUTPUT_COLUMNS = [
-        'Grade', 'Subject', 'Unit', 'Concept', 'Lesson', 'From page',
-        'To page', 'Chunk index', 'Chunk text', 'Is Arabic', 'Embedding',
-    ]
-
-    def __init__(self, api_key: str = None):
-        """Create the OpenAI client.
-
-        Args:
-            api_key: OpenAI API key. When ``None`` the ``OPENAI_API_KEY``
-                environment variable is used instead.
-
-        Raises:
-            ValueError: If no key is passed and the environment variable is
-                unset or empty.
-        """
-        if api_key is None:
-            api_key = os.getenv('OPENAI_API_KEY')
-        if not api_key:
-            raise ValueError("OpenAI API key is required. Set OPENAI_API_KEY environment variable or pass api_key parameter.")
-        self.client = OpenAI(api_key=api_key)
-        self.embedding_model = "text-embedding-3-small"
-
-    def chunk_text(self, text: str, chunk_size: int = 500) -> List[str]:
-        """Split ``text`` into sentence-aligned chunks of about ``chunk_size`` words.
-
-        Sentences are packed greedily: a chunk is closed as soon as adding the
-        next sentence would push it past ``chunk_size`` words. A single
-        sentence longer than ``chunk_size`` is never cut and becomes a chunk
-        of its own.
-
-        Args:
-            text: Lesson text; ``None``, NaN and blank values are accepted.
-            chunk_size: Soft upper bound on the number of words per chunk.
-
-        Returns:
-            The chunks in reading order, or ``[""]`` when there is no text.
-        """
-        if not text or pd.isna(text):
-            return [""]
-        text = str(text).strip()
-        if not text:
-            return [""]
-        chunks = []
-        current_chunk = []
-        current_word_count = 0
-        for sentence in self._SENTENCE_BOUNDARY.split(text):
-            sentence_words = len(sentence.split())
-            # Only close a non-empty chunk, so an oversized first sentence
-            # does not produce an empty chunk ahead of it.
-            if current_chunk and current_word_count + sentence_words > chunk_size:
-                chunks.append(' '.join(current_chunk))
-                current_chunk = [sentence]
-                current_word_count = sentence_words
-            else:
-                current_chunk.append(sentence)
-                current_word_count += sentence_words
-        if current_chunk:
-            chunks.append(' '.join(current_chunk))
-        return chunks if chunks else [""]
-
-    def get_embedding(self, text: str) -> List[float]:
-        """Return the embedding of ``text`` from the configured OpenAI model.
-
-        Blank input is replaced by the literal string ``"empty"`` before the
-        request is made. This call is best-effort: any failure is printed and
-        an all-zero vector of ``EMBEDDING_DIM`` entries is returned instead of
-        raising, so one bad chunk does not abort a whole run.
-
-        Args:
-            text: Text to embed.
-
-        Returns:
-            The embedding as a list of floats, or the zero placeholder.
-        """
-        try:
-            text = str(text).strip()
-            if not text:
-                text = "empty"
-            response = self.client.embeddings.create(
-                model=self.embedding_model,
-                input=text,
-                encoding_format="float"
-            )
-            return response.data[0].embedding
-        except Exception as e:
-            print(f"Error generating embedding: {str(e)}")
-            return [0.0] * self.EMBEDDING_DIM  # vector placeholder
-
-    def process_csv(self, input_file: str, output_file: str, chunk_size: int = 500, grade: int = None, subject: str = None, is_arabic: bool = False):
-        """Chunk and embed every lesson of ``input_file`` and write ``output_file``.
-
-        The input must provide the columns ``Unit``, ``Concept``, ``Lesson``,
-        ``From page``, ``To page`` and ``Lesson text``; the Arabic headers
-        listed in ``column_mapping`` are accepted as equivalents. The output
-        holds one row per chunk with the embedding stored as a JSON array.
-
-        Args:
-            input_file: Path of the lessons CSV.
-            output_file: Path of the CSV to create.
-            chunk_size: Soft word limit per chunk, see ``chunk_text``.
-            grade: Value for the ``Grade`` column; when ``None`` the row's own
-                ``Grade`` value is used if the input has one.
-            subject: Value for the ``Subject`` column; when ``None`` the row's
-                own ``Subject`` value is used if the input has one.
-            is_arabic: Value written to the ``Is Arabic`` column of every row.
-                Defaults to ``False``, the value that used to be hard-coded.
-
-        Raises:
-            ValueError: If a required column is missing.
-            Exception: Any other read/write error is printed and re-raised.
-        """
-        print(f"Reading CSV file: {input_file}")
-        try:
-            df = pd.read_csv(input_file)
-            column_mapping = {
-                "الوحدة": "Unit",
-                "المفهوم": "Concept",
-                "الدرس": "Lesson",
-                "من صفحة": "From page",
-                "إلى صفحة": "To page",
-                "النص": "Lesson text",
-            }
-            df.rename(columns=column_mapping, inplace=True)
-            required_columns = ['Unit', 'Concept', 'Lesson', 'From page', 'To page', 'Lesson text']
-            missing_columns = [col for col in required_columns if col not in df.columns]
-            if missing_columns:
-                raise ValueError(f"Missing required columns: {missing_columns}")
-            total_rows = len(df)
-            print(f"Found {total_rows} rows in input file")
-            output_rows = []
-            for idx, row in df.iterrows():
-                print(f"Processing row {idx + 1}/{total_rows}: {row['Unit']} - {row['Concept']} - {row['Lesson']}")
-                chunks = self.chunk_text(row['Lesson text'], chunk_size)
-                print(f"  Created {len(chunks)} chunks")
-                # Explicit arguments win over per-row values; both are the
-                # same for every chunk of a lesson, so resolve them once.
-                row_grade = grade if grade is not None else row.get('Grade', None)
-                row_subject = subject if subject is not None else row.get('Subject', None)
-                for chunk_idx, chunk_text in enumerate(chunks):
-                    print(f"  Generating embedding for chunk {chunk_idx + 1}/{len(chunks)}")
-                    embedding = self.get_embedding(chunk_text)
-                    output_rows.append({
-                        'Grade': row_grade,
-                        'Subject': row_subject,
-                        'Unit': row['Unit'],
-                        'Concept': row['Concept'],
-                        'Lesson': row['Lesson'],
-                        'From page': row['From page'],
-                        'To page': row['To page'],
-                        'Chunk index': chunk_idx,
-                        'Chunk text': chunk_text,
-                        'Is Arabic': is_arabic,
-                        'Embedding': json.dumps(embedding),
-                    })
-            print(f"Saving {len(output_rows)} chunks to {output_file}")
-            output_df = pd.DataFrame(output_rows, columns=self._OUTPUT_COLUMNS)
-            output_df.to_csv(output_file, index=False, quoting=csv.QUOTE_MINIMAL)
-            print("Processing complete!")
-        except Exception as e:
-            print(f"Error processing file: {str(e)}")
-            raise
-def main():
-    """Embed the Prime 6 English science lessons and save the chunked CSV.
-
-    Builds a processor from the ``OPENAI_API_KEY`` environment variable and
-    runs it over the fixed input path with 500-word chunks.
-    """
-    source_csv = r"../Data/english/prime6/output_units_lessons_prime6_EN.csv"
-    target_csv = "Prime6_en_chunked_with_embeddings.csv"
-    EducationalContentProcessor().process_csv(
-        source_csv,
-        target_csv,
-        chunk_size=500,
-        grade="prime6",
-        subject="Science",
-    )
-
-
-# Run the pipeline only when the file is executed as a script.
-if __name__ == "__main__":
-    main()