Commit 3d886bb2 authored by arwa mohamed

Add embedding scripts with Arabic/English support

parent 91855fed
import pandas as pd
import numpy as np
import os
import psycopg2
import openai
from psycopg2.extras import execute_values
import re
from openai import OpenAI
from typing import List
import csv
import json
from dotenv import load_dotenv

load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")


def get_db_connection():
    return psycopg2.connect(
        dbname=os.getenv("POSTGRES_DB", "embeddings_db"),
        user=os.getenv("POSTGRES_USER", "db_admin"),
        password=os.getenv("POSTGRES_PASSWORD"),
        host=os.getenv("POSTGRES_HOST", "localhost"),
        port=os.getenv("POSTGRES_PORT", 5432)
    )
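
# Expected .env entries for the calls above (the values shown are placeholders,
# not the project's actual configuration):
#   OPENAI_API_KEY=sk-...
#   POSTGRES_DB=embeddings_db
#   POSTGRES_USER=db_admin
#   POSTGRES_PASSWORD=change-me
#   POSTGRES_HOST=localhost
#   POSTGRES_PORT=5432
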
def chunk_text(text, chunk_size=500, overlap=50):
    """Split text into fixed-size character windows with a small overlap."""
    text = str(text).strip()
    if not text:
        return [""]
    chunks = []
    start = 0
    while start < len(text):
        end = min(len(text), start + chunk_size)
        chunks.append(text[start:end])
        if end == len(text):
            break  # stop once the tail is reached, otherwise the overlap loops forever
        start = end - overlap
        if start < 0:
            start = 0
    return chunks


def get_embedding(text):
    """Generate an embedding for a single string with the OpenAI API."""
    response = openai.embeddings.create(
        model="text-embedding-3-small",
        input=text
    )
    return response.data[0].embedding


class EducationalContentProcessor:
    def __init__(self, api_key: str = None):
        if api_key is None:
            api_key = os.getenv('OPENAI_API_KEY')
        if not api_key:
            raise ValueError("OpenAI API key is required. Set the OPENAI_API_KEY environment variable or pass the api_key parameter.")
        self.client = OpenAI(api_key=api_key)
        self.embedding_model = "text-embedding-3-small"

    def chunk_text(self, text: str, chunk_size: int = 500) -> List[str]:
        """Split text into chunks of at most chunk_size words, on sentence boundaries."""
        if not text or pd.isna(text):
            return [""]
        text = str(text).strip()
        if not text:
            return [""]
        sentences = re.split(r'(?<=[.!؟])\s+', text)
        chunks = []
        current_chunk = []
        current_word_count = 0
        for sentence in sentences:
            sentence_words = len(sentence.split())
            if current_word_count + sentence_words > chunk_size and current_chunk:
                chunks.append(' '.join(current_chunk))
                current_chunk = [sentence]
                current_word_count = sentence_words
            else:
                current_chunk.append(sentence)
                current_word_count += sentence_words
        if current_chunk:
            chunks.append(' '.join(current_chunk))
        return chunks if chunks else [""]

    def get_embedding(self, text: str) -> List[float]:
        try:
            text = str(text).strip()
            if not text:
                text = "empty"
            response = self.client.embeddings.create(
                model=self.embedding_model,
                input=text,
                encoding_format="float"
            )
            return response.data[0].embedding
        except Exception as e:
            print(f"Error generating embedding: {str(e)}")
            return [0.0] * 1536  # zero-vector placeholder on failure
    def process_csv(self, input_file: str, output_file: str, chunk_size: int = 500, grade: str = None, subject: str = None):
        """Read a lessons CSV, chunk each lesson, embed every chunk, and write the result to a new CSV."""
        print(f"Reading CSV file: {input_file}")
        try:
            df = pd.read_csv(input_file)
            # Map Arabic column headers to their English equivalents.
            column_mapping = {
                "الوحدة": "Unit",
                "المفهوم": "Concept",
                "الدرس": "Lesson",
                "من صفحة": "From page",
                "إلى صفحة": "To page",
                "النص": "Lesson text",
            }
            df.rename(columns=column_mapping, inplace=True)
            required_columns = ['Unit', 'Concept', 'Lesson', 'From page', 'To page', 'Lesson text']
            missing_columns = [col for col in required_columns if col not in df.columns]
            if missing_columns:
                raise ValueError(f"Missing required columns: {missing_columns}")
            print(f"Found {len(df)} rows in input file")
            output_rows = []
            for idx, row in df.iterrows():
                print(f"Processing row {idx + 1}/{len(df)}: {row['Unit']} - {row['Concept']} - {row['Lesson']}")
                lesson_text = row['Lesson text']
                chunks = self.chunk_text(lesson_text, chunk_size)
                print(f"  Created {len(chunks)} chunks")
                for chunk_idx, chunk in enumerate(chunks):
                    print(f"  Generating embedding for chunk {chunk_idx + 1}/{len(chunks)}")
                    embedding = self.get_embedding(chunk)
                    output_row = {
                        'Grade': grade if grade is not None else row.get('Grade', None),
                        'Subject': subject if subject is not None else row.get('Subject', None),
                        'Unit': row['Unit'],
                        'Concept': row['Concept'],
                        'Lesson': row['Lesson'],
                        'From page': row['From page'],
                        'To page': row['To page'],
                        'Chunk index': chunk_idx,
                        'Chunk text': chunk,
                        'Is Arabic': False,
                        'Embedding': json.dumps(embedding)
                    }
                    output_rows.append(output_row)
            print(f"Saving {len(output_rows)} chunks to {output_file}")
            output_df = pd.DataFrame(output_rows)
            output_df.to_csv(output_file, index=False, quoting=csv.QUOTE_MINIMAL)
            print("Processing complete!")
        except Exception as e:
            print(f"Error processing file: {str(e)}")
            raise
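

# The main() below assumes that the `lessons` and `lesson_embeddings` tables
# already exist; neither is created in this commit. The helper below is a
# hypothetical one-off setup sketch (not called anywhere in this script): a plain
# float-array column is used so the Python lists passed to execute_values adapt
# without a pgvector type handler. Adjust to the project's real schema
# (for example pgvector's vector(1536)) as needed.
def create_embedding_tables():
    schema = """
    CREATE TABLE IF NOT EXISTS lessons (
        id SERIAL PRIMARY KEY,
        lesson_text TEXT
    );
    CREATE TABLE IF NOT EXISTS lesson_embeddings (
        id SERIAL PRIMARY KEY,
        lesson_id INTEGER REFERENCES lessons(id),
        chunk_index INTEGER,
        chunk_text TEXT,
        embedding DOUBLE PRECISION[]
    );
    """
    conn = get_db_connection()
    with conn.cursor() as cur:
        cur.execute(schema)
    conn.commit()
    conn.close()
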
def main():
    # Part 1: embed lesson text already stored in the database.
    conn = get_db_connection()
    cur = conn.cursor()
    print("Fetching lessons...")
    cur.execute("SELECT id, lesson_text FROM lessons WHERE lesson_text IS NOT NULL;")
    lessons = cur.fetchall()
    total_lessons = len(lessons)
    print(f"Found {total_lessons} lessons to process.")
    all_rows = []
    for idx, (lesson_id, lesson_text) in enumerate(lessons, start=1):
        chunks = chunk_text(lesson_text, chunk_size=500, overlap=50)
        for i, chunk in enumerate(chunks):
            embedding = get_embedding(chunk)
            all_rows.append((lesson_id, i, chunk, embedding))
        progress = (idx / total_lessons) * 100
        print(f"Lesson {idx}/{total_lessons} complete ({progress:.2f}% done, {len(chunks)} chunks)")
        # Stop after the first two lessons (test mode).
        if idx == 2:
            print("Stopping after first 2 lessons (test mode).")
            break
    if all_rows:
        query = """
            INSERT INTO lesson_embeddings (lesson_id, chunk_index, chunk_text, embedding)
            VALUES %s
        """
        execute_values(cur, query, all_rows)
        conn.commit()
    cur.close()
    conn.close()
    print(f"Inserted {len(all_rows)} embeddings into the database.")

    # Part 2: chunk and embed the English CSV export.
    processor = EducationalContentProcessor()
    input_file = r"../Data/english/prime6/output_units_lessons_prime6_EN.csv"
    output_file = "Prime6_en_chunked_with_embeddings.csv"
    processor.process_csv(input_file, output_file, chunk_size=500, grade="prime6", subject="Science")


if __name__ == "__main__":
    main()
import pandas as pd
import numpy as np
import os
import re
from openai import OpenAI
from typing import List
import csv
import json


class EducationalContentProcessor:
    def __init__(self, api_key: str = None):
        if api_key is None:
            api_key = os.getenv('OPENAI_API_KEY')
        if not api_key:
            raise ValueError("OpenAI API key is required. Set OPENAI_API_KEY in .env or pass api_key parameter.")
        self.client = OpenAI(api_key=api_key)
        self.embedding_model = "text-embedding-3-small"

    def chunk_text(self, text: str, chunk_size: int = 500, is_arabic: bool = False) -> List[str]:
        """Split text into chunks of at most chunk_size words, on sentence boundaries."""
        if not text or pd.isna(text):
            return [""]
        text = str(text).strip()
        if not text:
            return [""]
        # Arabic text may end sentences with the Arabic question mark (؟).
        if is_arabic:
            sentence_pattern = r'(?<=[.!?؟])\s+'
        else:
            sentence_pattern = r'(?<=[.!?])\s+'
        sentences = re.split(sentence_pattern, text)
        chunks, current_chunk, current_word_count = [], [], 0
        for sentence in sentences:
            sentence_words = len([w for w in sentence.split() if w.strip()])
            if current_word_count + sentence_words > chunk_size and current_chunk:
                chunks.append(' '.join(current_chunk))
                current_chunk = [sentence]
                current_word_count = sentence_words
            else:
                current_chunk.append(sentence)
                current_word_count += sentence_words
        if current_chunk:
            chunks.append(' '.join(current_chunk))
        return chunks if chunks else [""]

    def get_embedding(self, text: str) -> List[float]:
        try:
            text = str(text).strip()
            if not text:
                text = "empty"
            response = self.client.embeddings.create(
                model=self.embedding_model,
                input=text,
                encoding_format="float"
            )
            return response.data[0].embedding
        except Exception as e:
            print(f"Error generating embedding: {str(e)}")
            return [0.0] * 1536  # zero-vector placeholder on failure

    def detect_arabic_text(self, text: str) -> bool:
        """Return True when more than 30% of non-whitespace characters are Arabic."""
        if not text or pd.isna(text):
            return False
        text = str(text)
        arabic_chars, total_chars = 0, 0
        for char in text:
            if char.strip():
                total_chars += 1
                if ('\u0600' <= char <= '\u06FF') or ('\u0750' <= char <= '\u077F') \
                        or ('\u08A0' <= char <= '\u08FF') or ('\uFB50' <= char <= '\uFDFF') \
                        or ('\uFE70' <= char <= '\uFEFF'):
                    arabic_chars += 1
        return total_chars > 0 and (arabic_chars / total_chars) > 0.3
    def process_csv(self, input_file: str, output_file: str, subject: str, grade: str, chunk_size: int = 500, is_arabic: bool = None):
        """Read a lessons CSV (English or Arabic headers), chunk, embed, and write the output CSV."""
        print(f"Reading CSV file: {input_file}")
        try:
            df = pd.read_csv(input_file, encoding="utf-8")
            # Accept either English or Arabic column headers.
            column_map = {
                "Unit": ["Unit", "الوحدة"],
                "Concept": ["Concept", "المفهوم"],
                "Lesson": ["Lesson", "الدرس"],
                "From page": ["From page", "من صفحة"],
                "To page": ["To page", "إلى صفحة"],
                "Lesson text": ["Lesson text", "النص"]
            }
            normalized = {}
            for std_name, aliases in column_map.items():
                for alias in aliases:
                    if alias in df.columns:
                        normalized[std_name] = df[alias]
                        break
                if std_name not in normalized:
                    normalized[std_name] = ""
            norm_df = pd.DataFrame(normalized)
            print(f"Found {len(norm_df)} rows in input file")
            output_rows = []
            for idx, row in norm_df.iterrows():
                print(f"Processing row {idx + 1}/{len(norm_df)}: {row['Unit']} - {row['Concept']} - {row['Lesson']}")
                lesson_text = row["Lesson text"]
                # Auto-detect the language unless the caller forces it.
                if is_arabic is None:
                    text_is_arabic = self.detect_arabic_text(lesson_text)
                else:
                    text_is_arabic = is_arabic
                chunks = self.chunk_text(lesson_text, chunk_size, is_arabic=text_is_arabic)
                print(f"  Created {len(chunks)} chunks")
                for chunk_idx, chunk in enumerate(chunks):
                    print(f"  Generating embedding for chunk {chunk_idx + 1}/{len(chunks)}")
                    embedding = self.get_embedding(chunk)
                    output_rows.append({
                        "Grade": grade,
                        "Subject": subject,
                        "Unit": row["Unit"],
                        "Concept": row["Concept"],
                        "Lesson": row["Lesson"],
                        "From page": row["From page"],
                        "To page": row["To page"],
                        "Chunk index": chunk_idx,
                        "Chunk text": chunk,
                        "Is Arabic": text_is_arabic,
                        "Embedding": json.dumps(embedding)
                    })
            output_df = pd.DataFrame(output_rows)
            output_df.to_csv(output_file, index=False, quoting=csv.QUOTE_MINIMAL, encoding="utf-8")
            print(f"Processing complete! Saved {len(output_rows)} chunks to {output_file}")
        except Exception as e:
            print(f"Error processing file: {str(e)}")
            raise
def main():
    processor = EducationalContentProcessor()
    input_file = r"../Data/arabic/prime4/output_units_lessons_prime4.csv"
    output_file = "prime4_ar_embeddings.csv"
    processor.process_csv(input_file, output_file, chunk_size=500, grade="prime4", subject="Science")


if __name__ == "__main__":
    main()
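
Each row of the output CSV stores its embedding as a JSON-encoded list, so downstream code has to decode that column before doing any vector math. A minimal sketch of loading the file back and ranking chunks against a query by cosine similarity (the file name comes from main() above; the query string is only an illustration):

import json
import numpy as np
import pandas as pd
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

df = pd.read_csv("prime4_ar_embeddings.csv")
# Decode the JSON-encoded "Embedding" column into an (n_chunks, 1536) matrix.
matrix = np.array([json.loads(e) for e in df["Embedding"]])

query = "ما هي حالات المادة؟"  # illustrative query: "What are the states of matter?"
q = np.array(
    client.embeddings.create(model="text-embedding-3-small", input=query).data[0].embedding
)

# Cosine similarity between the query vector and every chunk vector.
scores = matrix @ q / (np.linalg.norm(matrix, axis=1) * np.linalg.norm(q))
top = df.assign(score=scores).sort_values("score", ascending=False).head(3)
print(top[["Lesson", "Chunk index", "score"]])
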
import os
import psycopg2
import pandas as pd
import json
from dotenv import load_dotenv

load_dotenv()


def get_db_connection():
    return psycopg2.connect(
        dbname=os.getenv("POSTGRES_DB", "embeddings_db"),
        user=os.getenv("POSTGRES_USER", "db_admin"),
        password=os.getenv("POSTGRES_PASSWORD"),
        host=os.getenv("POSTGRES_HOST", "localhost"),
        port=os.getenv("POSTGRES_PORT", 5432)
    )


def insert_chunks_from_csv(csv_file: str):
    df = pd.read_csv(csv_file)
    required_cols = [
        "Grade", "Subject", "Unit", "Concept", "Lesson",
        "From page", "To page", "Chunk index", "Chunk text",
        "Is Arabic", "Embedding"
    ]
    for col in required_cols:
        if col not in df.columns:
            raise ValueError(f"Missing required column in CSV: {col}")
    conn = get_db_connection()
    cur = conn.cursor()
    insert_query = """
        INSERT INTO educational_chunks
            (grade, subject, unit, concept, lesson,
             from_page, to_page, chunk_index, chunk_text,
             is_arabic, embedding)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    """
    batch_size = 50
    buffer = []
    for idx, row in df.iterrows():
        try:
            embedding = json.loads(row["Embedding"])  # JSON string -> list of floats
            buffer.append((
                row["Grade"],
                row["Subject"],
                row.get("Unit"),
                row.get("Concept"),
                row.get("Lesson"),
                int(row["From page"]) if not pd.isna(row["From page"]) else None,
                int(row["To page"]) if not pd.isna(row["To page"]) else None,
                int(row["Chunk index"]),
                row["Chunk text"],
                bool(row["Is Arabic"]),
                embedding
            ))
        except Exception as e:
            print(f"Skipping row {idx} due to error: {e}")
            continue
        if len(buffer) >= batch_size:
            cur.executemany(insert_query, buffer)
            conn.commit()
            print(f"Inserted {len(buffer)} rows...")
            buffer = []
    if buffer:
        cur.executemany(insert_query, buffer)
        conn.commit()
        print(f"Inserted final {len(buffer)} rows.")
    cur.close()
    conn.close()
    print("All data inserted successfully.")
if __name__ == "__main__":
    csv_file = "Prime6_en_chunked_with_embeddings.csv"
    insert_chunks_from_csv(csv_file)