Commit 95189ab5 authored by Salma Mohammed Hamed

Delete generate_embeddings.py

parent 7446c2b4
import csv
import json
import os
import re
from typing import List, Optional, Union

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
load_dotenv()
class EducationalContentProcessor:
    """Chunk lesson text from a CSV and attach an OpenAI embedding to each chunk.

    The processor reads a CSV of lessons, splits every lesson text into
    word-bounded chunks on sentence boundaries, requests one embedding per
    chunk and writes the flattened result to a new CSV.
    """

    # Length of the all-zero vector returned when an embedding request fails.
    # NOTE(review): 1536 is assumed to match the output size of
    # "text-embedding-3-small" -- confirm if the model is ever changed.
    _PLACEHOLDER_DIM = 1536

    # A sentence ends at '.', '!', '?' or the Arabic question mark, followed
    # by whitespace. The look-behind keeps the punctuation with its sentence.
    _SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?؟])\s+')

    # Characters from the Arabic Unicode blocks (base, supplement and
    # presentation forms) and plain Latin letters, used for language detection.
    _ARABIC_CHARS = re.compile(r'[\u0600-\u06FF\u0750-\u077F\uFB50-\uFDFF\uFE70-\uFEFF]')
    _LATIN_LETTERS = re.compile(r'[A-Za-z]')

    def __init__(self, api_key: Optional[str] = None):
        """Create the OpenAI client used for embedding requests.

        Args:
            api_key: OpenAI API key. When omitted, the ``OPENAI_API_KEY``
                environment variable is used instead.

        Raises:
            ValueError: If no key is passed and the environment variable is
                unset or empty.
        """
        if api_key is None:
            api_key = os.getenv('OPENAI_API_KEY')
        if not api_key:
            raise ValueError("OpenAI API key is required. Set OPENAI_API_KEY environment variable or pass api_key parameter.")
        self.client = OpenAI(api_key=api_key)
        self.embedding_model = "text-embedding-3-small"

    @staticmethod
    def _is_arabic(text: str) -> bool:
        """Return True when Arabic-script characters outnumber Latin letters."""
        text = str(text)
        arabic_count = len(EducationalContentProcessor._ARABIC_CHARS.findall(text))
        latin_count = len(EducationalContentProcessor._LATIN_LETTERS.findall(text))
        return arabic_count > latin_count

    def chunk_text(self, text: str, chunk_size: int = 500) -> List[str]:
        """Split *text* into chunks of at most about *chunk_size* words.

        Sentences are never cut: whole sentences are accumulated until adding
        the next one would push the running word count above *chunk_size*.
        A single sentence longer than *chunk_size* is therefore kept intact
        and yields an oversized chunk.

        Args:
            text: Lesson text; ``None``, NaN and blank strings are accepted.
            chunk_size: Target maximum number of whitespace-separated words
                per chunk.

        Returns:
            The list of chunks, or ``[""]`` when there is no text.
        """
        # Missing cells arrive from pandas as NaN, which is truthy, so it
        # needs the explicit isna() check in addition to the falsy test.
        if not text or pd.isna(text):
            return [""]
        text = str(text).strip()
        if not text:
            return [""]

        chunks = []
        current_chunk = []
        current_word_count = 0
        for sentence in self._SENTENCE_BOUNDARY.split(text):
            sentence_words = len(sentence.split())
            # Flush only a non-empty chunk, so an oversized first sentence
            # does not emit an empty chunk ahead of itself.
            if current_chunk and current_word_count + sentence_words > chunk_size:
                chunks.append(' '.join(current_chunk))
                current_chunk = []
                current_word_count = 0
            current_chunk.append(sentence)
            current_word_count += sentence_words
        if current_chunk:
            chunks.append(' '.join(current_chunk))
        return chunks if chunks else [""]

    def get_embedding(self, text: str) -> List[float]:
        """Return the embedding of *text* from the configured OpenAI model.

        This is deliberately best-effort: any failure is printed and replaced
        by an all-zero vector so that one bad chunk does not abort a whole
        run. Callers that need to detect failures must check for that vector.

        Args:
            text: Text to embed. Blank text is replaced by the word
                ``"empty"`` before the request (presumably because the API
                does not accept empty input -- TODO confirm).

        Returns:
            The embedding as a list of floats, or a zero vector of length
            ``_PLACEHOLDER_DIM`` on error.
        """
        try:
            text = str(text).strip()
            if not text:
                text = "empty"
            response = self.client.embeddings.create(
                model=self.embedding_model,
                input=text,
                encoding_format="float",
            )
            return response.data[0].embedding
        except Exception as e:
            print(f"Error generating embedding: {str(e)}")
            return [0.0] * self._PLACEHOLDER_DIM  # vector placeholder

    def process_csv(self, input_file: str, output_file: str, chunk_size: int = 500,
                    grade: Optional[Union[int, str]] = None, subject: Optional[str] = None):
        """Chunk and embed every lesson in *input_file* and write *output_file*.

        The input must provide the columns ``Unit``, ``Concept``, ``Lesson``,
        ``From page``, ``To page`` and ``Lesson text``; their Arabic header
        equivalents are renamed automatically. The output has one row per
        chunk, with the embedding stored as a JSON-encoded list.

        Args:
            input_file: Path of the CSV to read.
            output_file: Path of the CSV to write.
            chunk_size: Word budget per chunk, forwarded to ``chunk_text``.
            grade: Value for the ``Grade`` column. When ``None``, the row's
                own ``Grade`` value is used if the input has one.
            subject: Value for the ``Subject`` column, with the same fallback
                to the row's ``Subject`` value.

        Raises:
            ValueError: If a required column is missing.
            Exception: Any other error is printed and re-raised unchanged.
        """
        print(f"Reading CSV file: {input_file}")
        try:
            df = pd.read_csv(input_file)
            # Arabic headers -> the English names used in the rest of the method.
            column_mapping = {
                "الوحدة": "Unit",
                "المفهوم": "Concept",
                "الدرس": "Lesson",
                "من صفحة": "From page",
                "إلى صفحة": "To page",
                "النص": "Lesson text",
            }
            df.rename(columns=column_mapping, inplace=True)
            required_columns = ['Unit', 'Concept', 'Lesson', 'From page', 'To page', 'Lesson text']
            missing_columns = [col for col in required_columns if col not in df.columns]
            if missing_columns:
                raise ValueError(f"Missing required columns: {missing_columns}")
            total_rows = len(df)
            print(f"Found {total_rows} rows in input file")
            output_rows = []
            for row_number, (_, row) in enumerate(df.iterrows(), start=1):
                print(f"Processing row {row_number}/{total_rows}: {row['Unit']} - {row['Concept']} - {row['Lesson']}")
                chunks = self.chunk_text(row['Lesson text'], chunk_size)
                print(f"  Created {len(chunks)} chunks")
                for chunk_idx, chunk_text in enumerate(chunks):
                    print(f"    Generating embedding for chunk {chunk_idx + 1}/{len(chunks)}")
                    embedding = self.get_embedding(chunk_text)
                    output_rows.append({
                        'Grade': grade if grade is not None else row.get('Grade', None),
                        'Subject': subject if subject is not None else row.get('Subject', None),
                        'Unit': row['Unit'],
                        'Concept': row['Concept'],
                        'Lesson': row['Lesson'],
                        'From page': row['From page'],
                        'To page': row['To page'],
                        'Chunk index': chunk_idx,
                        'Chunk text': chunk_text,
                        # Derived from the chunk itself so Arabic input is
                        # flagged correctly instead of always being False.
                        'Is Arabic': self._is_arabic(chunk_text),
                        'Embedding': json.dumps(embedding),
                    })
            print(f"Saving {len(output_rows)} chunks to {output_file}")
            output_df = pd.DataFrame(output_rows)
            output_df.to_csv(output_file, index=False, quoting=csv.QUOTE_MINIMAL)
            print("Processing complete!")
        except Exception as e:
            print(f"Error processing file: {str(e)}")
            raise
def main():
    """Embed the Prime 6 English science lessons and save them as a CSV."""
    # Built first so a missing API key fails before any file is touched.
    content_processor = EducationalContentProcessor()
    source_csv = r"../Data/english/prime6/output_units_lessons_prime6_EN.csv"
    destination_csv = "Prime6_en_chunked_with_embeddings.csv"
    content_processor.process_csv(
        source_csv,
        destination_csv,
        chunk_size=500,
        grade="prime6",
        subject="Science",
    )
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment