Commit 7914aadf authored by Salma Mohammed Hamed

Delete generate_embeddings_ar.py

parent 95189ab5
import csv
import json
import os
import re
from typing import List, Optional, Union

import numpy as np
import pandas as pd
from openai import OpenAI
class EducationalContentProcessor:
    """Split lesson text from a CSV into chunks and embed each chunk via OpenAI.

    The processor reads a lessons CSV (English or Arabic column headers),
    breaks every lesson text into word-bounded chunks, requests an embedding
    for each chunk, and writes one output row per chunk.
    """

    # Length of the zero vector returned when an embedding request fails.
    # 1536 matches the default output size of "text-embedding-3-small".
    EMBEDDING_DIMENSIONS = 1536

    # Canonical column name -> accepted header spellings in the input CSV.
    _COLUMN_ALIASES = {
        "Unit": ["Unit", "الوحدة"],
        "Concept": ["Concept", "المفهوم"],
        "Lesson": ["Lesson", "الدرس"],
        "From page": ["From page", "من صفحة"],
        "To page": ["To page", "إلى صفحة"],
        "Lesson text": ["Lesson text", "النص"]
    }

    # Column order of the output CSV; also used to emit a header when there
    # are no rows to write.
    _OUTPUT_COLUMNS = [
        "Grade", "Subject", "Unit", "Concept", "Lesson", "From page",
        "To page", "Chunk index", "Chunk text", "Is Arabic", "Embedding",
    ]

    def __init__(self, api_key: Optional[str] = None):
        """Create the OpenAI client.

        Args:
            api_key: OpenAI API key. When ``None`` the ``OPENAI_API_KEY``
                environment variable is used instead.

        Raises:
            ValueError: If no key is passed and the environment variable is
                unset or empty.
        """
        if api_key is None:
            # Only the process environment is consulted here; nothing in this
            # class loads a .env file itself.
            api_key = os.getenv('OPENAI_API_KEY')
        if not api_key:
            raise ValueError("OpenAI API key is required. Set OPENAI_API_KEY in .env or pass api_key parameter.")
        self.client = OpenAI(api_key=api_key)
        self.embedding_model = "text-embedding-3-small"

    def chunk_text(self, text: str, chunk_size: int = 500, is_arabic: bool = False) -> List[str]:
        """Split ``text`` into chunks of at most ``chunk_size`` words.

        Sentences are kept whole and packed greedily into chunks. A single
        sentence longer than ``chunk_size`` words is cut on word boundaries so
        that no chunk exceeds the limit (text without sentence punctuation
        would otherwise become one unbounded chunk).

        Args:
            text: Lesson text; ``None``, NaN and blank values are accepted.
            chunk_size: Maximum number of whitespace-separated words per
                chunk. A non-positive value disables the long-sentence split
                and yields one chunk per sentence.
            is_arabic: When true, the Arabic question mark also ends a
                sentence.

        Returns:
            The list of chunks; ``[""]`` when there is no text.
        """
        if not text or pd.isna(text):
            return [""]
        text = str(text).strip()
        if not text:
            return [""]

        if is_arabic:
            sentence_pattern = r'(?<=[.!?؟])\s+'
        else:
            sentence_pattern = r'(?<=[.!?])\s+'

        chunks: List[str] = []
        current_chunk: List[str] = []
        current_word_count = 0

        for sentence in re.split(sentence_pattern, text):
            words = sentence.split()
            word_count = len(words)

            if chunk_size > 0 and word_count > chunk_size:
                # Oversized sentence: close the open chunk, then cut the
                # sentence into word windows of chunk_size.
                if current_chunk:
                    chunks.append(' '.join(current_chunk))
                pieces = [
                    ' '.join(words[start:start + chunk_size])
                    for start in range(0, word_count, chunk_size)
                ]
                chunks.extend(pieces[:-1])
                # Leave the last window open so following sentences can
                # still be packed into it.
                current_chunk = [pieces[-1]]
                current_word_count = len(pieces[-1].split())
                continue

            if current_chunk and current_word_count + word_count > chunk_size:
                chunks.append(' '.join(current_chunk))
                current_chunk = [sentence]
                current_word_count = word_count
            else:
                current_chunk.append(sentence)
                current_word_count += word_count

        if current_chunk:
            chunks.append(' '.join(current_chunk))
        return chunks if chunks else [""]

    def get_embedding(self, text: str) -> List[float]:
        """Return the embedding of ``text`` from ``self.embedding_model``.

        Blank text is replaced by the literal word ``"empty"`` before the
        request. This is a best-effort call: any failure is printed and a
        zero vector of ``EMBEDDING_DIMENSIONS`` floats is returned instead of
        raising, so callers that need to detect failures must check for an
        all-zero result.

        Args:
            text: Text to embed.

        Returns:
            The embedding as a list of floats, or the zero vector on error.
        """
        try:
            text = str(text).strip()
            if not text:
                text = "empty"
            response = self.client.embeddings.create(
                model=self.embedding_model,
                input=text,
                encoding_format="float"
            )
            return response.data[0].embedding
        except Exception as e:
            print(f"Error generating embedding: {str(e)}")
            return [0.0] * self.EMBEDDING_DIMENSIONS

    def detect_arabic_text(self, text: str) -> bool:
        """Report whether more than 30% of the non-whitespace characters are Arabic.

        Args:
            text: Text to inspect; ``None``, NaN and empty values give False.

        Returns:
            True when the share of characters in the checked Arabic code
            point ranges is strictly greater than 0.3.
        """
        if not text or pd.isna(text):
            return False
        text = str(text)
        # Inclusive code point ranges treated as Arabic script.
        arabic_ranges = (
            ('\u0600', '\u06FF'),
            ('\u0750', '\u077F'),
            ('\u08A0', '\u08FF'),
            ('\uFB50', '\uFDFF'),
            ('\uFE70', '\uFEFF'),
        )
        arabic_chars, total_chars = 0, 0
        for char in text:
            if not char.strip():
                continue  # whitespace does not count towards the ratio
            total_chars += 1
            if any(low <= char <= high for low, high in arabic_ranges):
                arabic_chars += 1
        return total_chars > 0 and (arabic_chars / total_chars) > 0.3

    def process_csv(self, input_file: str, output_file: str, subject: str, grade: Union[int, str], chunk_size: int = 500, is_arabic: Optional[bool] = None):
        """Chunk and embed every lesson in ``input_file`` and write ``output_file``.

        Input columns are matched by their English or Arabic header; a column
        that is absent is filled with empty strings. One output row is
        written per chunk, with the embedding stored as a JSON array string.

        Args:
            input_file: Path of the UTF-8 lessons CSV.
            output_file: Path of the CSV to write.
            subject: Value copied into the "Subject" column of every row.
            grade: Value copied unchanged into the "Grade" column.
            chunk_size: Maximum words per chunk, passed to ``chunk_text``.
            is_arabic: Force the language used for sentence splitting; when
                ``None`` it is detected per row with ``detect_arabic_text``.

        Raises:
            Exception: Any error while reading, processing or writing is
                printed and then re-raised unchanged.
        """
        print(f"Reading CSV file: {input_file}")
        try:
            df = pd.read_csv(input_file, encoding="utf-8")

            normalized = {}
            for std_name, aliases in self._COLUMN_ALIASES.items():
                present = next((alias for alias in aliases if alias in df.columns), None)
                normalized[std_name] = df[present] if present is not None else ""
            if isinstance(normalized["Lesson text"], str):
                print("Warning: no lesson text column found; every row will produce an empty chunk")
            # An explicit index lets the scalar "" placeholders broadcast even
            # when none of the expected columns exist (all-scalar input would
            # otherwise make the DataFrame constructor raise).
            norm_df = pd.DataFrame(normalized, index=df.index)
            print(f"Found {len(norm_df)} rows in input file")

            total_rows = len(norm_df)
            output_rows = []
            # Count rows by position so progress stays correct for any index.
            for position, (_, row) in enumerate(norm_df.iterrows(), start=1):
                print(f"Processing row {position}/{total_rows}: {row['Unit']} - {row['Concept']} - {row['Lesson']}")
                lesson_text = row["Lesson text"]
                if is_arabic is None:
                    text_is_arabic = self.detect_arabic_text(lesson_text)
                else:
                    text_is_arabic = is_arabic
                chunks = self.chunk_text(lesson_text, chunk_size, is_arabic=text_is_arabic)
                print(f"  Created {len(chunks)} chunks")
                for chunk_idx, chunk_text in enumerate(chunks):
                    print(f"    Generating embedding for chunk {chunk_idx+1}/{len(chunks)}")
                    embedding = self.get_embedding(chunk_text)
                    output_rows.append({
                        "Grade": grade,
                        "Subject": subject,
                        "Unit": row["Unit"],
                        "Concept": row["Concept"],
                        "Lesson": row["Lesson"],
                        "From page": row["From page"],
                        "To page": row["To page"],
                        "Chunk index": chunk_idx,
                        "Chunk text": chunk_text,
                        "Is Arabic": text_is_arabic,
                        "Embedding": json.dumps(embedding)
                    })

            # Fixing the columns keeps the header in the file even when the
            # input had no rows, so the output can always be read back.
            output_df = pd.DataFrame(output_rows, columns=self._OUTPUT_COLUMNS)
            output_df.to_csv(output_file, index=False, quoting=csv.QUOTE_MINIMAL, encoding="utf-8")
            print(f"Processing complete! Saved {len(output_rows)} chunks to {output_file}")
        except Exception as e:
            print(f"Error processing file: {str(e)}")
            raise
def main():
    """Run the processor on the prime4 Arabic lessons CSV with fixed settings."""
    source_csv = r"../Data/arabic/prime4/output_units_lessons_prime4.csv"
    target_csv = "prime4_ar_embeddings.csv"
    # The API key is taken from the environment because none is passed here.
    EducationalContentProcessor().process_csv(
        input_file=source_csv,
        output_file=target_csv,
        subject="Science",
        grade="prime4",
        chunk_size=500,
    )


if __name__ == "__main__":
    main()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment