Commit 3d886bb2 authored by arwa mohamed

Add embedding scripts with Arabic/English support

parent 91855fed
# ===== Previous version (before this commit): embeds lesson text read from PostgreSQL =====
import pandas as pd
import numpy as np
import os
import psycopg2
import openai
from psycopg2.extras import execute_values
import csv
import json
from dotenv import load_dotenv

load_dotenv()

openai.api_key = os.getenv("OPENAI_API_KEY")


def get_db_connection():
    return psycopg2.connect(
        dbname=os.getenv("POSTGRES_DB", "embeddings_db"),
        user=os.getenv("POSTGRES_USER", "db_admin"),
        password=os.getenv("POSTGRES_PASSWORD"),
        host=os.getenv("POSTGRES_HOST", "localhost"),
        port=os.getenv("POSTGRES_PORT", 5432)
    )


def chunk_text(text, chunk_size=500, overlap=50):
    # Fixed-size character windows with overlap.
    chunks = []
    start = 0
    while start < len(text):
        end = min(len(text), start + chunk_size)
        chunks.append(text[start:end])
        if end == len(text):
            break  # stop at the end of the text; otherwise the final window is re-appended forever
        start = end - overlap
        if start < 0:
            start = 0
    return chunks


def get_embedding(text):
    response = openai.embeddings.create(
        model="text-embedding-3-small",
        input=text
    )
    return response.data[0].embedding


def main():
    conn = get_db_connection()
    cur = conn.cursor()

    print("Fetching lessons...")
    cur.execute("SELECT id, lesson_text FROM lessons WHERE lesson_text IS NOT NULL;")
    lessons = cur.fetchall()
    total_lessons = len(lessons)
    print(f"Found {total_lessons} lessons to process.")
    all_rows = []
    for idx, (lesson_id, lesson_text) in enumerate(lessons, start=1):
        chunks = chunk_text(lesson_text, chunk_size=500, overlap=50)
        for i, chunk in enumerate(chunks):
            embedding = get_embedding(chunk)
            all_rows.append((lesson_id, i, chunk, embedding))
        progress = (idx / total_lessons) * 100
        print(f"Lesson {idx}/{total_lessons} complete ({progress:.2f}% done, {len(chunks)} chunks)")
        # Stop after the first two lessons (test mode)
        if idx == 2:
            print("Stopping after first 2 lessons (test mode).")
            break
    if all_rows:
        query = """
            INSERT INTO lesson_embeddings (lesson_id, chunk_index, chunk_text, embedding)
            VALUES %s
        """
        execute_values(cur, query, all_rows)
        conn.commit()
    cur.close()
    conn.close()
    print(f"Inserted {len(all_rows)} embeddings into the database.")


if __name__ == "__main__":
    main()


# ===== New version (added in this commit): chunks a lessons CSV and writes embeddings back to CSV =====
import pandas as pd
import numpy as np
import os
import re
from openai import OpenAI
from typing import List
import csv
import json
from dotenv import load_dotenv

load_dotenv()


class EducationalContentProcessor:
    def __init__(self, api_key: str = None):
        if api_key is None:
            api_key = os.getenv('OPENAI_API_KEY')
        if not api_key:
            raise ValueError("OpenAI API key is required. Set OPENAI_API_KEY environment variable or pass api_key parameter.")
        self.client = OpenAI(api_key=api_key)
        self.embedding_model = "text-embedding-3-small"

    def chunk_text(self, text: str, chunk_size: int = 500) -> List[str]:
        if not text or pd.isna(text):
            return [""]
        text = str(text).strip()
        if not text:
            return [""]
        # Split on sentence boundaries (Latin and Arabic punctuation), then pack sentences
        # into chunks of at most chunk_size words.
        sentences = re.split(r'(?<=[.!؟])\s+', text)
        chunks = []
        current_chunk = []
        current_word_count = 0
        for sentence in sentences:
            sentence_words = len(sentence.split())
            if current_word_count + sentence_words > chunk_size and current_chunk:
                chunks.append(' '.join(current_chunk))
                current_chunk = [sentence]
                current_word_count = sentence_words
            else:
                current_chunk.append(sentence)
                current_word_count += sentence_words
        if current_chunk:
            chunks.append(' '.join(current_chunk))
        return chunks if chunks else [""]

    def get_embedding(self, text: str) -> List[float]:
        try:
            text = str(text).strip()
            if not text:
                text = "empty"
            response = self.client.embeddings.create(
                model=self.embedding_model,
                input=text,
                encoding_format="float"
            )
            return response.data[0].embedding
        except Exception as e:
            print(f"Error generating embedding: {str(e)}")
            return [0.0] * 1536  # placeholder vector (text-embedding-3-small returns 1536 dimensions)

    def process_csv(self, input_file: str, output_file: str, chunk_size: int = 500, grade: str = None, subject: str = None):
        print(f"Reading CSV file: {input_file}")
        try:
            df = pd.read_csv(input_file)
            # Normalize Arabic column headers to their English equivalents.
            column_mapping = {
                "الوحدة": "Unit",
                "المفهوم": "Concept",
                "الدرس": "Lesson",
                "من صفحة": "From page",
                "إلى صفحة": "To page",
                "النص": "Lesson text",
            }
            df.rename(columns=column_mapping, inplace=True)
            required_columns = ['Unit', 'Concept', 'Lesson', 'From page', 'To page', 'Lesson text']
            missing_columns = [col for col in required_columns if col not in df.columns]
            if missing_columns:
                raise ValueError(f"Missing required columns: {missing_columns}")
            print(f"Found {len(df)} rows in input file")
            output_rows = []
            for idx, row in df.iterrows():
                print(f"Processing row {idx + 1}/{len(df)}: {row['Unit']} - {row['Concept']} - {row['Lesson']}")
                lesson_text = row['Lesson text']
                chunks = self.chunk_text(lesson_text, chunk_size)
                print(f"  Created {len(chunks)} chunks")
                for chunk_idx, chunk_text in enumerate(chunks):
                    print(f"    Generating embedding for chunk {chunk_idx + 1}/{len(chunks)}")
                    embedding = self.get_embedding(chunk_text)
                    output_row = {
                        'Grade': grade if grade is not None else row.get('Grade', None),
                        'Subject': subject if subject is not None else row.get('Subject', None),
                        'Unit': row['Unit'],
                        'Concept': row['Concept'],
                        'Lesson': row['Lesson'],
                        'From page': row['From page'],
                        'To page': row['To page'],
                        'Chunk index': chunk_idx,
                        'Chunk text': chunk_text,
                        'Is Arabic': False,
                        'Embedding': json.dumps(embedding)
                    }
                    output_rows.append(output_row)
            print(f"Saving {len(output_rows)} chunks to {output_file}")
            output_df = pd.DataFrame(output_rows)
            output_df.to_csv(output_file, index=False, quoting=csv.QUOTE_MINIMAL)
            print("Processing complete!")
        except Exception as e:
            print(f"Error processing file: {str(e)}")
            raise


def main():
    processor = EducationalContentProcessor()
    input_file = r"../Data/english/prime6/output_units_lessons_prime6_EN.csv"
    output_file = "Prime6_en_chunked_with_embeddings.csv"
    processor.process_csv(input_file, output_file, chunk_size=500, grade="prime6", subject="Science")


if __name__ == "__main__":
    main()
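
Both processing scripts store each chunk's embedding as a JSON-encoded list in the Embedding column of the output CSV. As an illustration of how that output could be queried, here is a minimal cosine-similarity retrieval sketch; it is not part of this commit, the search_chunks helper and the example query are made up, and it assumes the output CSV above exists and OPENAI_API_KEY is set.

# Hypothetical retrieval sketch (not part of this commit): search the chunked CSV
# produced above by cosine similarity between a query embedding and each chunk embedding.
import json
import numpy as np
import pandas as pd
from openai import OpenAI


def search_chunks(csv_file: str, query: str, top_k: int = 3) -> pd.DataFrame:
    client = OpenAI()
    df = pd.read_csv(csv_file)
    # The Embedding column holds JSON arrays written by json.dumps in process_csv.
    matrix = np.array([json.loads(e) for e in df["Embedding"]], dtype=np.float32)
    q = client.embeddings.create(model="text-embedding-3-small", input=query).data[0].embedding
    q = np.array(q, dtype=np.float32)
    # Cosine similarity of the query vector against every chunk vector.
    sims = matrix @ q / (np.linalg.norm(matrix, axis=1) * np.linalg.norm(q) + 1e-10)
    best = np.argsort(-sims)[:top_k]
    return df.iloc[best][["Unit", "Concept", "Lesson", "Chunk text"]].assign(score=sims[best])


if __name__ == "__main__":
    print(search_chunks("Prime6_en_chunked_with_embeddings.csv", "What is photosynthesis?"))
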
# ===== Also added in this commit: CSV processor with automatic Arabic detection =====
import pandas as pd
import numpy as np
import os
import re
from openai import OpenAI
from typing import List
import csv
import json
from dotenv import load_dotenv

load_dotenv()


class EducationalContentProcessor:
    def __init__(self, api_key: str = None):
        if api_key is None:
            api_key = os.getenv('OPENAI_API_KEY')
        if not api_key:
            raise ValueError("OpenAI API key is required. Set OPENAI_API_KEY in .env or pass api_key parameter.")
        self.client = OpenAI(api_key=api_key)
        self.embedding_model = "text-embedding-3-small"

    def chunk_text(self, text: str, chunk_size: int = 500, is_arabic: bool = False) -> List[str]:
        if not text or pd.isna(text):
            return [""]
        text = str(text).strip()
        if not text:
            return [""]
        # Arabic text also splits on the Arabic question mark.
        if is_arabic:
            sentence_pattern = r'(?<=[.!?؟])\s+'
        else:
            sentence_pattern = r'(?<=[.!?])\s+'
        sentences = re.split(sentence_pattern, text)
        chunks, current_chunk, current_word_count = [], [], 0
        for sentence in sentences:
            sentence_words = len([w for w in sentence.split() if w.strip()])
            if current_word_count + sentence_words > chunk_size and current_chunk:
                chunks.append(' '.join(current_chunk))
                current_chunk = [sentence]
                current_word_count = sentence_words
            else:
                current_chunk.append(sentence)
                current_word_count += sentence_words
        if current_chunk:
            chunks.append(' '.join(current_chunk))
        return chunks if chunks else [""]

    def get_embedding(self, text: str) -> List[float]:
        try:
            text = str(text).strip()
            if not text:
                text = "empty"
            response = self.client.embeddings.create(
                model=self.embedding_model,
                input=text,
                encoding_format="float"
            )
            return response.data[0].embedding
        except Exception as e:
            print(f"Error generating embedding: {str(e)}")
            return [0.0] * 1536

    def detect_arabic_text(self, text: str) -> bool:
        if not text or pd.isna(text):
            return False
        text = str(text)
        arabic_chars, total_chars = 0, 0
        for char in text:
            if char.strip():
                total_chars += 1
                if ('\u0600' <= char <= '\u06FF') or ('\u0750' <= char <= '\u077F') \
                        or ('\u08A0' <= char <= '\u08FF') or ('\uFB50' <= char <= '\uFDFF') \
                        or ('\uFE70' <= char <= '\uFEFF'):
                    arabic_chars += 1
        return total_chars > 0 and (arabic_chars / total_chars) > 0.3

    def process_csv(self, input_file: str, output_file: str, subject: str, grade: str, chunk_size: int = 500, is_arabic: bool = None):
        print(f"Reading CSV file: {input_file}")
        try:
            df = pd.read_csv(input_file, encoding="utf-8")
            # Accept either English or Arabic column headers.
            column_map = {
                "Unit": ["Unit", "الوحدة"],
                "Concept": ["Concept", "المفهوم"],
                "Lesson": ["Lesson", "الدرس"],
                "From page": ["From page", "من صفحة"],
                "To page": ["To page", "إلى صفحة"],
                "Lesson text": ["Lesson text", "النص"]
            }
            normalized = {}
            for std_name, aliases in column_map.items():
                for alias in aliases:
                    if alias in df.columns:
                        normalized[std_name] = df[alias]
                        break
                if std_name not in normalized:
                    normalized[std_name] = ""
            norm_df = pd.DataFrame(normalized)
            print(f"Found {len(norm_df)} rows in input file")
            output_rows = []
            for idx, row in norm_df.iterrows():
                print(f"Processing row {idx + 1}/{len(norm_df)}: {row['Unit']} - {row['Concept']} - {row['Lesson']}")
                lesson_text = row["Lesson text"]
                if is_arabic is None:
                    text_is_arabic = self.detect_arabic_text(lesson_text)
                else:
                    text_is_arabic = is_arabic
                chunks = self.chunk_text(lesson_text, chunk_size, is_arabic=text_is_arabic)
                print(f"  Created {len(chunks)} chunks")
                for chunk_idx, chunk_text in enumerate(chunks):
                    print(f"    Generating embedding for chunk {chunk_idx + 1}/{len(chunks)}")
                    embedding = self.get_embedding(chunk_text)
                    output_rows.append({
                        "Grade": grade,
                        "Subject": subject,
                        "Unit": row["Unit"],
                        "Concept": row["Concept"],
                        "Lesson": row["Lesson"],
                        "From page": row["From page"],
                        "To page": row["To page"],
                        "Chunk index": chunk_idx,
                        "Chunk text": chunk_text,
                        "Is Arabic": text_is_arabic,
                        "Embedding": json.dumps(embedding)
                    })
            output_df = pd.DataFrame(output_rows)
            output_df.to_csv(output_file, index=False, quoting=csv.QUOTE_MINIMAL, encoding="utf-8")
            print(f"Processing complete! Saved {len(output_rows)} chunks to {output_file}")
        except Exception as e:
            print(f"Error processing file: {str(e)}")
            raise


def main():
    processor = EducationalContentProcessor()
    input_file = r"../Data/arabic/prime4/output_units_lessons_prime4.csv"
    output_file = "prime4_ar_embeddings.csv"
    processor.process_csv(input_file, output_file, chunk_size=500, grade="prime4", subject="Science")


if __name__ == "__main__":
    main()
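
The detect_arabic_text heuristic in the script above counts characters in the main Arabic Unicode blocks and treats a string as Arabic when they exceed 30% of its non-whitespace characters. A small standalone sanity check of that heuristic (not part of the commit; looks_arabic is a hypothetical helper that copies the same logic) might look like:

# Standalone re-statement of the Arabic-detection heuristic used by
# EducationalContentProcessor.detect_arabic_text, for a quick sanity check.
ARABIC_RANGES = [("\u0600", "\u06FF"), ("\u0750", "\u077F"),
                 ("\u08A0", "\u08FF"), ("\uFB50", "\uFDFF"),
                 ("\uFE70", "\uFEFF")]


def looks_arabic(text: str, threshold: float = 0.3) -> bool:
    chars = [c for c in str(text) if c.strip()]  # ignore whitespace
    if not chars:
        return False
    arabic = sum(1 for c in chars
                 if any(lo <= c <= hi for lo, hi in ARABIC_RANGES))
    return arabic / len(chars) > threshold


print(looks_arabic("هذا نص عربي قصير"))               # True: all characters are Arabic
print(looks_arabic("This is an English sentence."))    # False: no Arabic characters
print(looks_arabic("Mixed نص with English words"))     # False: Arabic characters are well under 30% here
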
# ===== Also added in this commit: load a chunked-embeddings CSV into PostgreSQL =====
import os
import psycopg2
import pandas as pd
import json
from dotenv import load_dotenv

load_dotenv()


def get_db_connection():
    return psycopg2.connect(
        dbname=os.getenv("POSTGRES_DB", "embeddings_db"),
        user=os.getenv("POSTGRES_USER", "db_admin"),
        password=os.getenv("POSTGRES_PASSWORD"),
        host=os.getenv("POSTGRES_HOST", "localhost"),
        port=os.getenv("POSTGRES_PORT", 5432)
    )


def insert_chunks_from_csv(csv_file: str):
    df = pd.read_csv(csv_file)
    required_cols = [
        "Grade", "Subject", "Unit", "Concept", "Lesson",
        "From page", "To page", "Chunk index", "Chunk text",
        "Is Arabic", "Embedding"
    ]
    for col in required_cols:
        if col not in df.columns:
            raise ValueError(f"Missing required column in CSV: {col}")
    conn = get_db_connection()
    cur = conn.cursor()
    insert_query = """
        INSERT INTO educational_chunks
            (grade, subject, unit, concept, lesson,
             from_page, to_page, chunk_index, chunk_text,
             is_arabic, embedding)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    """
    batch_size = 50
    buffer = []
    for idx, row in df.iterrows():
        try:
            embedding = json.loads(row["Embedding"])  # JSON → list
            buffer.append((
                row["Grade"],
                row["Subject"],
                row.get("Unit"),
                row.get("Concept"),
                row.get("Lesson"),
                int(row["From page"]) if not pd.isna(row["From page"]) else None,
                int(row["To page"]) if not pd.isna(row["To page"]) else None,
                int(row["Chunk index"]),
                row["Chunk text"],
                bool(row["Is Arabic"]),
                embedding
            ))
        except Exception as e:
            print(f"Skipping row {idx} due to error: {e}")
            continue
        if len(buffer) >= batch_size:
            cur.executemany(insert_query, buffer)
            conn.commit()
            print(f"Inserted {len(buffer)} rows...")
            buffer = []
    if buffer:
        cur.executemany(insert_query, buffer)
        conn.commit()
        print(f"Inserted final {len(buffer)} rows.")
    cur.close()
    conn.close()
    print("All data inserted successfully.")


if __name__ == "__main__":
    csv_file = "Prime6_en_chunked_with_embeddings.csv"
    insert_chunks_from_csv(csv_file)
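
The loader assumes an educational_chunks table already exists; its DDL is not shown in this diff. The following sketch is one plausible schema matching the INSERT statement and the way psycopg2 adapts the Python list bound to the embedding column (as a PostgreSQL array); the column types, and double precision[] in particular, are assumptions, and a pgvector vector(1536) column would instead need an explicit cast or adapter.

# Hypothetical DDL for the educational_chunks table targeted by insert_chunks_from_csv.
# Not part of this commit; column types are guesses from how the loader binds values.
import os
import psycopg2
from dotenv import load_dotenv

load_dotenv()

CREATE_TABLE_SQL = """
CREATE TABLE IF NOT EXISTS educational_chunks (
    id          BIGSERIAL PRIMARY KEY,
    grade       TEXT,
    subject     TEXT,
    unit        TEXT,
    concept     TEXT,
    lesson      TEXT,
    from_page   INTEGER,
    to_page     INTEGER,
    chunk_index INTEGER,
    chunk_text  TEXT,
    is_arabic   BOOLEAN,
    embedding   DOUBLE PRECISION[]  -- psycopg2 sends the bound Python list as a float array
);
"""


def create_table():
    conn = psycopg2.connect(
        dbname=os.getenv("POSTGRES_DB", "embeddings_db"),
        user=os.getenv("POSTGRES_USER", "db_admin"),
        password=os.getenv("POSTGRES_PASSWORD"),
        host=os.getenv("POSTGRES_HOST", "localhost"),
        port=os.getenv("POSTGRES_PORT", 5432),
    )
    with conn, conn.cursor() as cur:
        cur.execute(CREATE_TABLE_SQL)
    conn.close()


if __name__ == "__main__":
    create_table()
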