Salma Mohammed Hamed / AI Tutor · Commits

Commit 91855fed authored Sep 15, 2025 by arwa mohamed
Add scripts for inserting lessons and generating embeddings
parent 5148341d
Showing 2 changed files with 171 additions and 0 deletions.
self_hosted_env/generate_embeddings.py  +75 -0
self_hosted_env/insert_lessons.py       +96 -0
self_hosted_env/generate_embeddings.py (new file, 0 → 100644)
import os
import psycopg2
import openai
from psycopg2.extras import execute_values
from dotenv import load_dotenv

load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")


def get_db_connection():
    return psycopg2.connect(
        dbname=os.getenv("POSTGRES_DB", "embeddings_db"),
        user=os.getenv("POSTGRES_USER", "db_admin"),
        password=os.getenv("POSTGRES_PASSWORD"),
        host=os.getenv("POSTGRES_HOST", "localhost"),
        port=os.getenv("POSTGRES_PORT", 5432)
    )

def chunk_text(text, chunk_size=500, overlap=50):
    """Split text into overlapping chunks of roughly chunk_size characters."""
    chunks = []
    start = 0
    while start < len(text):
        end = min(len(text), start + chunk_size)
        chunks.append(text[start:end])
        if end == len(text):
            # Reached the end of the text. Without this break the loop never
            # terminates, because start = end - overlap stays below len(text)
            # and the final chunk is appended forever.
            break
        start = end - overlap
        if start < 0:
            start = 0
    return chunks

def get_embedding(text):
    response = openai.embeddings.create(
        model="text-embedding-3-small",
        input=text
    )
    return response.data[0].embedding

def main():
    conn = get_db_connection()
    cur = conn.cursor()

    print("Fetching lessons...")
    cur.execute("SELECT id, lesson_text FROM lessons WHERE lesson_text IS NOT NULL;")
    lessons = cur.fetchall()
    total_lessons = len(lessons)
    print(f"Found {total_lessons} lessons to process.")

    all_rows = []
    for idx, (lesson_id, lesson_text) in enumerate(lessons, start=1):
        chunks = chunk_text(lesson_text, chunk_size=500, overlap=50)
        for i, chunk in enumerate(chunks):
            embedding = get_embedding(chunk)
            all_rows.append((lesson_id, i, chunk, embedding))
        progress = (idx / total_lessons) * 100
        print(f"Lesson {idx}/{total_lessons} complete ({progress:.2f}% done, {len(chunks)} chunks)")

        # Stop after the first two lessons (test mode).
        if idx == 2:
            print("Stopping after first 2 lessons (test mode).")
            break

    if all_rows:
        query = """
            INSERT INTO lesson_embeddings (lesson_id, chunk_index, chunk_text, embedding)
            VALUES %s
        """
        execute_values(cur, query, all_rows)
        conn.commit()

    cur.close()
    conn.close()
    print(f"Inserted {len(all_rows)} embeddings into the database.")


if __name__ == "__main__":
    main()
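As a quick sanity check of the chunking parameters, here is a hypothetical snippet (not part of the commit) showing how chunk_text splits a 1,200-character string with the default chunk_size=500 and overlap=50:

# Hypothetical usage example for chunk_text (not in the commit).
from generate_embeddings import chunk_text

sample = "A" * 1200
chunks = chunk_text(sample, chunk_size=500, overlap=50)
print([len(c) for c in chunks])  # -> [500, 500, 300]; each chunk starts 450 characters after the previous one
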
self_hosted_env/insert_lessons.py (new file, 0 → 100644)
import os
import psycopg2
import pandas as pd
from psycopg2.extras import execute_values
from dotenv import load_dotenv

load_dotenv()


def get_db_connection():
    return psycopg2.connect(
        dbname=os.getenv("POSTGRES_DB", "embeddings_db"),
        user=os.getenv("POSTGRES_USER", "db_admin"),
        password=os.getenv("POSTGRES_PASSWORD"),
        host=os.getenv("POSTGRES_HOST", "localhost"),
        port=os.getenv("POSTGRES_PORT", 5432)
    )

def insert_lessons_from_csv(file_path, conn, grade, subject):
    df = pd.read_csv(file_path)

    # Map the Arabic CSV headers to the English column names used below.
    df.rename(columns={
        "الوحدة": "Unit",
        "المفهوم": "Concept",
        "الدرس": "Lesson",
        "من صفحة": "From page",
        "إلى صفحة": "To page",
        "النص": "Lesson text"
    }, inplace=True)

    required_columns = ["Unit", "Concept", "Lesson", "From page", "To page", "Lesson text"]
    missing = [col for col in required_columns if col not in df.columns]
    if missing:
        print(f"⚠️ Missing columns in {file_path}: {', '.join(missing)}")
        return

    rows = []
    for _, row in df.iterrows():
        rows.append((
            grade,
            subject,
            row["Unit"],
            row["Concept"],
            row["Lesson"],
            row["From page"],
            row["To page"],
            row["Lesson text"]
        ))

    query = """
        INSERT INTO lessons (grade, subject, unit, concept, lesson, start_page, end_page, lesson_text)
        VALUES %s
    """
    with conn.cursor() as cur:
        execute_values(cur, query, rows)
        conn.commit()
    print(f"Inserted {len(rows)} rows from {os.path.basename(file_path)}")

def main():
    folder = input("Enter the path to the folder containing CSV files: ").strip()
    if not os.path.exists(folder):
        print("Folder not found.")
        return

    files = [f for f in os.listdir(folder) if f.endswith(".csv")]
    if not files:
        print("No CSV files found.")
        return

    print("Available files:")
    for i, f in enumerate(files, 1):
        print(f"{i}. {f}")

    selected = input("Enter the numbers of the files you want to import (e.g., 1 3 4): ").split()
    selected_files = [files[int(i) - 1] for i in selected]

    grade = input("Enter grade manually (e.g., Grade 5): ").strip()
    subject = input("Enter subject manually (default: Science): ").strip() or "Science"

    conn = get_db_connection()
    try:
        for f in selected_files:
            file_path = os.path.join(folder, f)
            insert_lessons_from_csv(file_path, conn, grade, subject)

        with conn.cursor() as cur:
            cur.execute("SELECT COUNT(*) FROM lessons;")
            total = cur.fetchone()[0]
            print(f"Total rows in lessons table: {total}")
    finally:
        conn.close()
        print("Connection closed.")


if __name__ == "__main__":
    main()
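Note: neither script creates the tables it writes to. The following is a minimal, hypothetical schema sketch inferred from the column names used above (lessons in insert_lessons.py, lesson_embeddings in generate_embeddings.py), assuming PostgreSQL with the pgvector extension and 1536-dimensional vectors from text-embedding-3-small. The actual schema is not part of this commit, and the column types and the setup_schema.py file name are assumptions.

# setup_schema.py: hypothetical helper, not part of this commit.
# Creates the tables that the two scripts above appear to expect.
import os

import psycopg2
from dotenv import load_dotenv

load_dotenv()

DDL = """
CREATE EXTENSION IF NOT EXISTS vector;

CREATE TABLE IF NOT EXISTS lessons (
    id          SERIAL PRIMARY KEY,
    grade       TEXT,
    subject     TEXT,
    unit        TEXT,
    concept     TEXT,
    lesson      TEXT,
    start_page  INTEGER,
    end_page    INTEGER,
    lesson_text TEXT
);

-- text-embedding-3-small returns 1536-dimensional vectors.
CREATE TABLE IF NOT EXISTS lesson_embeddings (
    id          SERIAL PRIMARY KEY,
    lesson_id   INTEGER REFERENCES lessons(id),
    chunk_index INTEGER,
    chunk_text  TEXT,
    embedding   vector(1536)
);
"""

if __name__ == "__main__":
    conn = psycopg2.connect(
        dbname=os.getenv("POSTGRES_DB", "embeddings_db"),
        user=os.getenv("POSTGRES_USER", "db_admin"),
        password=os.getenv("POSTGRES_PASSWORD"),
        host=os.getenv("POSTGRES_HOST", "localhost"),
        port=os.getenv("POSTGRES_PORT", 5432),
    )
    with conn, conn.cursor() as cur:
        cur.execute(DDL)
    conn.close()

The intended order of use appears to be insert_lessons.py first (to populate lessons from the CSV files), then generate_embeddings.py. Depending on the pgvector version, inserting a plain Python list into the vector column may also require registering the adapter (register_vector from the pgvector package) or an explicit cast; that detail is not covered by this commit.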