Commit 91855fed authored by arwa mohamed

Add scripts for inserting lessons and generating embeddings

parent 5148341d
import os
import psycopg2
import openai
from psycopg2.extras import execute_values
from dotenv import load_dotenv
# Load environment variables (presumably from a local .env file) before the
# os.getenv lookups below are evaluated.
load_dotenv()
# Configure the OpenAI client with the API key read from the environment;
# os.getenv yields None when OPENAI_API_KEY is unset.
openai.api_key = os.getenv("OPENAI_API_KEY")
def get_db_connection():
    """Open a new PostgreSQL connection configured from environment variables.

    POSTGRES_DB, POSTGRES_USER, POSTGRES_HOST and POSTGRES_PORT fall back to
    the defaults below when unset; POSTGRES_PASSWORD has no fallback.

    Returns:
        The connection object produced by ``psycopg2.connect``.
    """
    settings = {
        "dbname": os.getenv("POSTGRES_DB", "embeddings_db"),
        "user": os.getenv("POSTGRES_USER", "db_admin"),
        "password": os.getenv("POSTGRES_PASSWORD"),
        "host": os.getenv("POSTGRES_HOST", "localhost"),
        "port": os.getenv("POSTGRES_PORT", 5432),
    }
    return psycopg2.connect(**settings)
def chunk_text(text, chunk_size=500, overlap=50):
    """Split text into consecutive chunks that share ``overlap`` characters.

    Each chunk starts ``chunk_size - overlap`` characters after the previous
    one. Splitting stops as soon as a chunk reaches the end of the text, so
    the final chunk may be shorter than ``chunk_size``.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters repeated between neighbouring chunks;
            must be smaller than ``chunk_size``.

    Returns:
        A list of chunk strings; an empty list for empty text.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size`` (the window could never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    step = chunk_size - overlap
    text_length = len(text)
    chunks = []
    for start in range(0, text_length, step):
        end = start + chunk_size
        chunks.append(text[start:end])
        # Once a chunk touches the end of the text we are done; stepping
        # further would only re-emit the overlapping tail.
        if end >= text_length:
            break
    return chunks
def get_embedding(text):
    """Return the embedding for ``text`` from the text-embedding-3-small model.

    Sends a single embeddings request through the ``openai`` module and
    returns the ``embedding`` attribute of the first item in the response.
    """
    request = {"model": "text-embedding-3-small", "input": text}
    result = openai.embeddings.create(**request)
    first_item = result.data[0]
    return first_item.embedding
def main(max_lessons=2):
    """Embed lesson text chunk by chunk and store the vectors.

    Reads every lesson with non-NULL text from ``lessons``, splits each one
    with ``chunk_text``, requests an embedding per chunk via
    ``get_embedding`` and bulk-inserts the results into
    ``lesson_embeddings`` in a single commit.

    Args:
        max_lessons: Stop after this many lessons (test mode). The default of
            2 keeps the script's existing trial-run behaviour; pass ``None``
            to process every lesson.
    """
    conn = get_db_connection()
    try:
        with conn.cursor() as cur:
            print("Fetching lessons...")
            cur.execute("SELECT id, lesson_text FROM lessons WHERE lesson_text IS NOT NULL;")
            lessons = cur.fetchall()
            total_lessons = len(lessons)
            print(f"Found {total_lessons} lessons to process.")

            all_rows = []
            for idx, (lesson_id, lesson_text) in enumerate(lessons, start=1):
                chunks = chunk_text(lesson_text, chunk_size=500, overlap=50)
                all_rows.extend(
                    (lesson_id, i, chunk, get_embedding(chunk))
                    for i, chunk in enumerate(chunks)
                )
                progress = (idx / total_lessons) * 100
                print(f"Lesson {idx}/{total_lessons} complete ({progress:.2f}% done, {len(chunks)} chunks)")
                # Stop after the first few lessons when running as a trial.
                if max_lessons is not None and idx >= max_lessons:
                    print(f"Stopping after first {max_lessons} lessons (test mode).")
                    break

            if all_rows:
                query = """
                    INSERT INTO lesson_embeddings (lesson_id, chunk_index, chunk_text, embedding)
                    VALUES %s
                """
                execute_values(cur, query, all_rows)
                conn.commit()
    finally:
        # Always release the connection, even if an API or database call
        # fails partway through.
        conn.close()
    print(f"Inserted {len(all_rows)} embeddings into the database.")


if __name__ == "__main__":
    main()
import os
import psycopg2
import pandas as pd
from psycopg2.extras import execute_values
from dotenv import load_dotenv
# Load environment variables (presumably from a local .env file) so the
# os.getenv lookups in get_db_connection can see them.
load_dotenv()
def get_db_connection():
    """Open a new PostgreSQL connection configured from environment variables.

    POSTGRES_DB, POSTGRES_USER, POSTGRES_HOST and POSTGRES_PORT fall back to
    the defaults below when unset; POSTGRES_PASSWORD has no fallback.

    Returns:
        The connection object produced by ``psycopg2.connect``.
    """
    settings = {
        "dbname": os.getenv("POSTGRES_DB", "embeddings_db"),
        "user": os.getenv("POSTGRES_USER", "db_admin"),
        "password": os.getenv("POSTGRES_PASSWORD"),
        "host": os.getenv("POSTGRES_HOST", "localhost"),
        "port": os.getenv("POSTGRES_PORT", 5432),
    }
    return psycopg2.connect(**settings)
def insert_lessons_from_csv(file_path, conn, grade, subject):
    """Load one lessons CSV and bulk-insert its rows into ``lessons``.

    Arabic column headers are renamed to their English equivalents first.
    If any required column is still missing, a warning is printed and
    nothing is inserted. Empty cells are stored as SQL NULL.

    Args:
        file_path: Path of the CSV file to import.
        conn: Open database connection; the insert is committed on it.
        grade: Grade label written to every inserted row.
        subject: Subject label written to every inserted row.
    """
    df = pd.read_csv(file_path)
    df = df.rename(columns={
        "الوحدة": "Unit",
        "المفهوم": "Concept",
        "الدرس": "Lesson",
        "من صفحة": "From page",
        "إلى صفحة": "To page",
        "النص": "Lesson text"
    })
    required_columns = ["Unit", "Concept", "Lesson", "From page", "To page", "Lesson text"]
    missing = [col for col in required_columns if col not in df.columns]
    if missing:
        print(f"⚠️ Missing columns in {file_path}: {', '.join(missing)}")
        return

    # pandas represents empty cells as float NaN, which would otherwise be
    # written to the database as a literal NaN instead of NULL. Casting to
    # object lets the cells hold None.
    cells = df[required_columns].astype(object)
    cells = cells.where(cells.notna(), None)

    # required_columns is already in the column order of the INSERT below.
    rows = [
        (grade, subject, *record)
        for record in cells.itertuples(index=False, name=None)
    ]
    query = """
        INSERT INTO lessons (grade, subject, unit, concept, lesson, start_page, end_page, lesson_text)
        VALUES %s
    """
    with conn.cursor() as cur:
        execute_values(cur, query, rows)
    conn.commit()
    print(f" Inserted {len(rows)} rows from {os.path.basename(file_path)}")
def _parse_selection(tokens, files):
    """Map 1-based menu numbers to file names, skipping invalid entries."""
    chosen = []
    for token in tokens:
        try:
            number = int(token)
        except ValueError:
            print(f"Ignoring invalid selection: {token!r}")
            continue
        # Reject 0 and negatives explicitly: files[number - 1] would
        # otherwise silently pick entries from the end of the list.
        if not 1 <= number <= len(files):
            print(f"Ignoring out-of-range selection: {number}")
            continue
        chosen.append(files[number - 1])
    return chosen


def main():
    """Interactively import selected lesson CSV files into the database.

    Prompts for a folder, lists its ``.csv`` files, asks which ones to
    import and which grade/subject to tag them with, inserts each file via
    ``insert_lessons_from_csv`` and finally prints the total row count of
    the ``lessons`` table.
    """
    folder = input("Enter the path to the folder containing CSV files: ").strip()
    # isdir rather than exists: a path to a regular file would pass an
    # existence check and then make os.listdir raise.
    if not os.path.isdir(folder):
        print("Folder not found.")
        return
    files = [f for f in os.listdir(folder) if f.endswith(".csv")]
    if not files:
        print(" No CSV files found.")
        return
    print("Available files:")
    for i, f in enumerate(files, 1):
        print(f"{i}. {f}")
    selected = input("Enter the numbers of the files you want to import (e.g., 1 3 4): ").split()
    selected_files = _parse_selection(selected, files)
    if not selected_files:
        print("No valid files selected.")
        return
    grade = input("Enter grade manually (e.g., Grade 5): ").strip()
    subject = input("Enter subject manually (default: Science): ").strip() or "Science"
    conn = get_db_connection()
    try:
        for f in selected_files:
            file_path = os.path.join(folder, f)
            insert_lessons_from_csv(file_path, conn, grade, subject)
        with conn.cursor() as cur:
            cur.execute("SELECT COUNT(*) FROM lessons;")
            total = cur.fetchone()[0]
        print(f" Total rows in lessons table: {total}")
    finally:
        conn.close()
        print(" Connection closed.")


if __name__ == "__main__":
    main()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment