Commit 6d1a52f6 authored by Salma Mohammed Hamed

Merge branch 'add_extracted_data' into 'master'

Add extracted data

See merge request !3
parents 3e935489 3d886bb2
import pandas as pd
import numpy as np
import os
import re
from openai import OpenAI
from typing import List
import csv
import json
from dotenv import load_dotenv
load_dotenv()

class EducationalContentProcessor:
    def __init__(self, api_key: str = None):
        if api_key is None:
            api_key = os.getenv('OPENAI_API_KEY')
        if not api_key:
            raise ValueError("OpenAI API key is required. Set OPENAI_API_KEY environment variable or pass api_key parameter.")
        self.client = OpenAI(api_key=api_key)
        self.embedding_model = "text-embedding-3-small"
    def chunk_text(self, text: str, chunk_size: int = 500) -> List[str]:
        """Split text into chunks of at most chunk_size words, breaking on sentence boundaries."""
        if not text or pd.isna(text):
            return [""]
        text = str(text).strip()
        if not text:
            return [""]
        # Split on Latin and Arabic sentence-ending punctuation.
        sentences = re.split(r'(?<=[.!?؟])\s+', text)
        chunks = []
        current_chunk = []
        current_word_count = 0
        for sentence in sentences:
            sentence_words = len(sentence.split())
            if current_word_count + sentence_words > chunk_size and current_chunk:
                chunks.append(' '.join(current_chunk))
                current_chunk = [sentence]
                current_word_count = sentence_words
            else:
                current_chunk.append(sentence)
                current_word_count += sentence_words
        if current_chunk:
            chunks.append(' '.join(current_chunk))
        return chunks if chunks else [""]
    def get_embedding(self, text: str) -> List[float]:
        try:
            text = str(text).strip()
            if not text:
                text = "empty"
            response = self.client.embeddings.create(
                model=self.embedding_model,
                input=text,
                encoding_format="float"
            )
            return response.data[0].embedding
        except Exception as e:
            print(f"Error generating embedding: {str(e)}")
            return [0.0] * 1536  # zero-vector placeholder (dimension of text-embedding-3-small)
    def process_csv(self, input_file: str, output_file: str, chunk_size: int = 500, grade: str = None, subject: str = None):
        print(f"Reading CSV file: {input_file}")
        try:
            df = pd.read_csv(input_file)
            # Normalize Arabic column headers to their English equivalents.
            column_mapping = {
                "الوحدة": "Unit",
                "المفهوم": "Concept",
                "الدرس": "Lesson",
                "من صفحة": "From page",
                "إلى صفحة": "To page",
                "النص": "Lesson text",
            }
            df.rename(columns=column_mapping, inplace=True)
            required_columns = ['Unit', 'Concept', 'Lesson', 'From page', 'To page', 'Lesson text']
            missing_columns = [col for col in required_columns if col not in df.columns]
            if missing_columns:
                raise ValueError(f"Missing required columns: {missing_columns}")
            print(f"Found {len(df)} rows in input file")
            output_rows = []
            for idx, row in df.iterrows():
                print(f"Processing row {idx + 1}/{len(df)}: {row['Unit']} - {row['Concept']} - {row['Lesson']}")
                lesson_text = row['Lesson text']
                chunks = self.chunk_text(lesson_text, chunk_size)
                print(f" Created {len(chunks)} chunks")
                for chunk_idx, chunk in enumerate(chunks):
                    print(f" Generating embedding for chunk {chunk_idx + 1}/{len(chunks)}")
                    embedding = self.get_embedding(chunk)
                    output_row = {
                        'Grade': grade if grade is not None else row.get('Grade', None),
                        'Subject': subject if subject is not None else row.get('Subject', None),
                        'Unit': row['Unit'],
                        'Concept': row['Concept'],
                        'Lesson': row['Lesson'],
                        'From page': row['From page'],
                        'To page': row['To page'],
                        'Chunk index': chunk_idx,
                        'Chunk text': chunk,
                        'Is Arabic': False,
                        'Embedding': json.dumps(embedding)
                    }
                    output_rows.append(output_row)
            print(f"Saving {len(output_rows)} chunks to {output_file}")
            output_df = pd.DataFrame(output_rows)
            output_df.to_csv(output_file, index=False, quoting=csv.QUOTE_MINIMAL)
            print("Processing complete!")
        except Exception as e:
            print(f"Error processing file: {str(e)}")
            raise

def main():
    processor = EducationalContentProcessor()
    input_file = r"../Data/english/prime6/output_units_lessons_prime6_EN.csv"
    output_file = "Prime6_en_chunked_with_embeddings.csv"
    processor.process_csv(input_file, output_file, chunk_size=500, grade="prime6", subject="Science")

if __name__ == "__main__":
    main()
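
The Embedding column is written with json.dumps, so any consumer of the generated CSV has to decode it before the vectors are usable. Below is a minimal sketch of reading the output back and comparing two chunks, assuming the Prime6_en_chunked_with_embeddings.csv produced by main() above; the load_chunk_embeddings helper and the cosine-similarity check are illustrative, not part of this merge request.

import json
import numpy as np
import pandas as pd

def load_chunk_embeddings(path: str) -> pd.DataFrame:
    # Read the chunked CSV and decode the JSON-encoded Embedding column into lists of floats.
    df = pd.read_csv(path)
    df["Embedding"] = df["Embedding"].apply(json.loads)
    return df

if __name__ == "__main__":
    chunks = load_chunk_embeddings("Prime6_en_chunked_with_embeddings.csv")
    a = np.array(chunks.loc[0, "Embedding"])
    b = np.array(chunks.loc[1, "Embedding"])
    # Cosine similarity between the first two chunk embeddings.
    similarity = float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))
    print(f"Similarity between chunk 0 and chunk 1: {similarity:.4f}")
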
import pandas as pd
import numpy as np
import os
import re
from openai import OpenAI
from typing import List
import csv
import json
from dotenv import load_dotenv

load_dotenv()

class EducationalContentProcessor:
    def __init__(self, api_key: str = None):
        if api_key is None:
            api_key = os.getenv('OPENAI_API_KEY')
        if not api_key:
            raise ValueError("OpenAI API key is required. Set OPENAI_API_KEY in .env or pass api_key parameter.")
        self.client = OpenAI(api_key=api_key)
        self.embedding_model = "text-embedding-3-small"
    def chunk_text(self, text: str, chunk_size: int = 500, is_arabic: bool = False) -> List[str]:
        if not text or pd.isna(text):
            return [""]
        text = str(text).strip()
        if not text:
            return [""]
        if is_arabic:
            sentence_pattern = r'(?<=[.!?؟])\s+'
        else:
            sentence_pattern = r'(?<=[.!?])\s+'
        sentences = re.split(sentence_pattern, text)
        chunks, current_chunk, current_word_count = [], [], 0
        for sentence in sentences:
            sentence_words = len([w for w in sentence.split() if w.strip()])
            if current_word_count + sentence_words > chunk_size and current_chunk:
                chunks.append(' '.join(current_chunk))
                current_chunk = [sentence]
                current_word_count = sentence_words
            else:
                current_chunk.append(sentence)
                current_word_count += sentence_words
        if current_chunk:
            chunks.append(' '.join(current_chunk))
        return chunks if chunks else [""]
    def get_embedding(self, text: str) -> List[float]:
        try:
            text = str(text).strip()
            if not text:
                text = "empty"
            response = self.client.embeddings.create(
                model=self.embedding_model,
                input=text,
                encoding_format="float"
            )
            return response.data[0].embedding
        except Exception as e:
            print(f"Error generating embedding: {str(e)}")
            return [0.0] * 1536  # zero-vector placeholder (dimension of text-embedding-3-small)
    def detect_arabic_text(self, text: str) -> bool:
        """Heuristic: treat the text as Arabic if more than 30% of its non-space characters are Arabic."""
        if not text or pd.isna(text):
            return False
        text = str(text)
        arabic_chars, total_chars = 0, 0
        for char in text:
            if char.strip():
                total_chars += 1
                if ('\u0600' <= char <= '\u06FF') or ('\u0750' <= char <= '\u077F') \
                        or ('\u08A0' <= char <= '\u08FF') or ('\uFB50' <= char <= '\uFDFF') \
                        or ('\uFE70' <= char <= '\uFEFF'):
                    arabic_chars += 1
        return total_chars > 0 and (arabic_chars / total_chars) > 0.3
    def process_csv(self, input_file: str, output_file: str, subject: str, grade: str, chunk_size: int = 500, is_arabic: bool = None):
        print(f"Reading CSV file: {input_file}")
        try:
            df = pd.read_csv(input_file, encoding="utf-8")
            # Accept either English or Arabic column headers.
            column_map = {
                "Unit": ["Unit", "الوحدة"],
                "Concept": ["Concept", "المفهوم"],
                "Lesson": ["Lesson", "الدرس"],
                "From page": ["From page", "من صفحة"],
                "To page": ["To page", "إلى صفحة"],
                "Lesson text": ["Lesson text", "النص"]
            }
            normalized = {}
            for std_name, aliases in column_map.items():
                for alias in aliases:
                    if alias in df.columns:
                        normalized[std_name] = df[alias]
                        break
                if std_name not in normalized:
                    normalized[std_name] = ""
            norm_df = pd.DataFrame(normalized)
            print(f"Found {len(norm_df)} rows in input file")
            output_rows = []
            for idx, row in norm_df.iterrows():
                print(f"Processing row {idx+1}/{len(norm_df)}: {row['Unit']} - {row['Concept']} - {row['Lesson']}")
                lesson_text = row["Lesson text"]
                # Detect the language per row unless the caller forces it.
                if is_arabic is None:
                    text_is_arabic = self.detect_arabic_text(lesson_text)
                else:
                    text_is_arabic = is_arabic
                chunks = self.chunk_text(lesson_text, chunk_size, is_arabic=text_is_arabic)
                print(f" Created {len(chunks)} chunks")
                for chunk_idx, chunk in enumerate(chunks):
                    print(f" Generating embedding for chunk {chunk_idx+1}/{len(chunks)}")
                    embedding = self.get_embedding(chunk)
                    output_rows.append({
                        "Grade": grade,
                        "Subject": subject,
                        "Unit": row["Unit"],
                        "Concept": row["Concept"],
                        "Lesson": row["Lesson"],
                        "From page": row["From page"],
                        "To page": row["To page"],
                        "Chunk index": chunk_idx,
                        "Chunk text": chunk,
                        "Is Arabic": text_is_arabic,
                        "Embedding": json.dumps(embedding)
                    })
            output_df = pd.DataFrame(output_rows)
            output_df.to_csv(output_file, index=False, quoting=csv.QUOTE_MINIMAL, encoding="utf-8")
            print(f"Processing complete! Saved {len(output_rows)} chunks to {output_file}")
        except Exception as e:
            print(f"Error processing file: {str(e)}")
            raise

def main():
    processor = EducationalContentProcessor()
    input_file = r"../Data/arabic/prime4/output_units_lessons_prime4.csv"
    output_file = "prime4_ar_embeddings.csv"
    processor.process_csv(input_file, output_file, chunk_size=500, grade="prime4", subject="Science")

if __name__ == "__main__":
    main()
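
This second script differs from the English one mainly in detect_arabic_text, which marks a row as Arabic when more than 30% of its non-whitespace characters fall in the main Arabic Unicode blocks, and in the sentence splitter, which then also breaks on the Arabic question mark (؟). Below is a quick standalone restatement of that ratio heuristic, runnable without an OpenAI key; the sample strings are illustrative only.

ARABIC_RANGES = [
    ("\u0600", "\u06FF"), ("\u0750", "\u077F"), ("\u08A0", "\u08FF"),
    ("\uFB50", "\uFDFF"), ("\uFE70", "\uFEFF"),
]

def looks_arabic(text: str, threshold: float = 0.3) -> bool:
    # Same ratio test as EducationalContentProcessor.detect_arabic_text above.
    chars = [c for c in text if c.strip()]
    if not chars:
        return False
    arabic = sum(1 for c in chars if any(lo <= c <= hi for lo, hi in ARABIC_RANGES))
    return arabic / len(chars) > threshold

if __name__ == "__main__":
    print(looks_arabic("ما هي حالات المادة؟"))             # True: almost every character is Arabic
    print(looks_arabic("What are the states of matter?"))  # False: no Arabic characters
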
import os
import psycopg2
import pandas as pd
import json
from dotenv import load_dotenv
load_dotenv()
def get_db_connection():
    return psycopg2.connect(
        dbname=os.getenv("POSTGRES_DB", "embeddings_db"),
        user=os.getenv("POSTGRES_USER", "db_admin"),
        password=os.getenv("POSTGRES_PASSWORD"),
        host=os.getenv("POSTGRES_HOST", "localhost"),
        port=os.getenv("POSTGRES_PORT", 5432)
    )

def insert_chunks_from_csv(csv_file: str):
    df = pd.read_csv(csv_file)
    required_cols = [
        "Grade", "Subject", "Unit", "Concept", "Lesson",
        "From page", "To page", "Chunk index", "Chunk text",
        "Is Arabic", "Embedding"
    ]
    for col in required_cols:
        if col not in df.columns:
            raise ValueError(f"Missing required column in CSV: {col}")
    conn = get_db_connection()
    cur = conn.cursor()
    insert_query = """
        INSERT INTO educational_chunks
        (grade, subject, unit, concept, lesson,
         from_page, to_page, chunk_index, chunk_text,
         is_arabic, embedding)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    """
    batch_size = 50
    buffer = []
    for idx, row in df.iterrows():
        try:
            embedding = json.loads(row["Embedding"])  # JSON → list
            buffer.append((
                row["Grade"],
                row["Subject"],
                row.get("Unit"),
                row.get("Concept"),
                row.get("Lesson"),
                int(row["From page"]) if not pd.isna(row["From page"]) else None,
                int(row["To page"]) if not pd.isna(row["To page"]) else None,
                int(row["Chunk index"]),
                row["Chunk text"],
                bool(row["Is Arabic"]),
                embedding
            ))
        except Exception as e:
            print(f"Skipping row {idx} due to error: {e}")
            continue
        # Flush to the database in batches of batch_size rows.
        if len(buffer) >= batch_size:
            cur.executemany(insert_query, buffer)
            conn.commit()
            print(f"Inserted {len(buffer)} rows...")
            buffer = []
    if buffer:
        cur.executemany(insert_query, buffer)
        conn.commit()
        print(f"Inserted final {len(buffer)} rows.")
    cur.close()
    conn.close()
    print("All data inserted successfully.")

if __name__ == "__main__":
    csv_file = "Prime6_en_chunked_with_embeddings.csv"
    insert_chunks_from_csv(csv_file)
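
insert_chunks_from_csv assumes an educational_chunks table already exists; its DDL is not part of this merge request. Below is a minimal sketch of a compatible schema, assuming a plain DOUBLE PRECISION[] column for the embedding, since psycopg2 adapts the Python list decoded by json.loads to a Postgres array; a pgvector vector(1536) column would instead require casting or registering the vector type. Column names and types here are inferred from the INSERT statement above and should be treated as an assumption.

import os
import psycopg2
from dotenv import load_dotenv

load_dotenv()

# Hypothetical schema matching the columns used by insert_chunks_from_csv above.
CREATE_TABLE_SQL = """
CREATE TABLE IF NOT EXISTS educational_chunks (
    id          SERIAL PRIMARY KEY,
    grade       TEXT,
    subject     TEXT,
    unit        TEXT,
    concept     TEXT,
    lesson      TEXT,
    from_page   INTEGER,
    to_page     INTEGER,
    chunk_index INTEGER,
    chunk_text  TEXT,
    is_arabic   BOOLEAN,
    embedding   DOUBLE PRECISION[]
);
"""

if __name__ == "__main__":
    conn = psycopg2.connect(
        dbname=os.getenv("POSTGRES_DB", "embeddings_db"),
        user=os.getenv("POSTGRES_USER", "db_admin"),
        password=os.getenv("POSTGRES_PASSWORD"),
        host=os.getenv("POSTGRES_HOST", "localhost"),
        port=os.getenv("POSTGRES_PORT", 5432),
    )
    with conn, conn.cursor() as cur:
        cur.execute(CREATE_TABLE_SQL)
    conn.close()
    print("educational_chunks table is ready.")
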
import os
import psycopg2
import pandas as pd
from psycopg2.extras import execute_values
from dotenv import load_dotenv
load_dotenv()
def get_db_connection():
    return psycopg2.connect(
        dbname=os.getenv("POSTGRES_DB", "embeddings_db"),
        user=os.getenv("POSTGRES_USER", "db_admin"),
        password=os.getenv("POSTGRES_PASSWORD"),
        host=os.getenv("POSTGRES_HOST", "localhost"),
        port=os.getenv("POSTGRES_PORT", 5432)
    )

def insert_lessons_from_csv(file_path, conn, grade, subject):
    df = pd.read_csv(file_path)
    # Normalize Arabic column headers to English.
    df.rename(columns={
        "الوحدة": "Unit",
        "المفهوم": "Concept",
        "الدرس": "Lesson",
        "من صفحة": "From page",
        "إلى صفحة": "To page",
        "النص": "Lesson text"
    }, inplace=True)
    required_columns = ["Unit", "Concept", "Lesson", "From page", "To page", "Lesson text"]
    missing = [col for col in required_columns if col not in df.columns]
    if missing:
        print(f"⚠️ Missing columns in {file_path}: {', '.join(missing)}")
        return
    rows = []
    for _, row in df.iterrows():
        rows.append((
            grade,
            subject,
            row["Unit"],
            row["Concept"],
            row["Lesson"],
            row["From page"],
            row["To page"],
            row["Lesson text"]
        ))
    query = """
        INSERT INTO lessons (grade, subject, unit, concept, lesson, start_page, end_page, lesson_text)
        VALUES %s
    """
    with conn.cursor() as cur:
        execute_values(cur, query, rows)
    conn.commit()
    print(f"Inserted {len(rows)} rows from {os.path.basename(file_path)}")

def main():
    folder = input("Enter the path to the folder containing CSV files: ").strip()
    if not os.path.exists(folder):
        print("Folder not found.")
        return
    files = [f for f in os.listdir(folder) if f.endswith(".csv")]
    if not files:
        print("No CSV files found.")
        return
    print("Available files:")
    for i, f in enumerate(files, 1):
        print(f"{i}. {f}")
    selected = input("Enter the numbers of the files you want to import (e.g., 1 3 4): ").split()
    selected_files = [files[int(i) - 1] for i in selected]
    grade = input("Enter grade manually (e.g., Grade 5): ").strip()
    subject = input("Enter subject manually (default: Science): ").strip() or "Science"
    conn = get_db_connection()
    try:
        for f in selected_files:
            file_path = os.path.join(folder, f)
            insert_lessons_from_csv(file_path, conn, grade, subject)
        with conn.cursor() as cur:
            cur.execute("SELECT COUNT(*) FROM lessons;")
            total = cur.fetchone()[0]
            print(f"Total rows in lessons table: {total}")
    finally:
        conn.close()
        print("Connection closed.")

if __name__ == "__main__":
    main()