Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
A
AI Tutor
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Salma Mohammed Hamed
AI Tutor
Commits
498fa39d
Commit
498fa39d
authored
Oct 22, 2025
by
SalmaMohammedHamedMustafa
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Integrate PDF upload
parent
a558073c
Changes
10
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
1376 additions
and
18 deletions
+1376
-18
docker-compose.yml
self_hosted_env/docker-compose.yml
+6
-1
Dockerfile
self_hosted_env/voice_agent/Dockerfile
+5
-1
main.py
self_hosted_env/voice_agent/main.py
+177
-13
process_pdf_pipline.py
self_hosted_env/voice_agent/process_pdf_pipline.py
+885
-0
requirements.txt
self_hosted_env/voice_agent/requirements.txt
+11
-1
__init__.py
self_hosted_env/voice_agent/services/__init__.py
+2
-1
data_ingestion_service.py
...hosted_env/voice_agent/services/data_ingestion_service.py
+77
-0
pgvector_service.py
self_hosted_env/voice_agent/services/pgvector_service.py
+41
-1
curriculum_PDF_uploader.html
...osted_env/voice_agent/static/curriculum_PDF_uploader.html
+172
-0
voice_agent.tar
self_hosted_env/voice_agent/voice_agent.tar
+0
-0
No files found.
self_hosted_env/docker-compose.yml
View file @
498fa39d
...
@@ -61,8 +61,13 @@ services:
...
@@ -61,8 +61,13 @@ services:
DB_HOST
:
"
${DB_HOST}"
DB_HOST
:
"
${DB_HOST}"
TTS_PROVIDER
:
"
${TTS_PROVIDER}"
TTS_PROVIDER
:
"
${TTS_PROVIDER}"
CUSTOM_TTS_URL
:
"
${CUSTOM_TTS_URL}"
CUSTOM_TTS_URL
:
"
${CUSTOM_TTS_URL}"
GEMINI_API_KEY
:
"
${GEMINI_API_KEY}"
REDIS_HOST
:
"
redis"
REDIS_HOST
:
"
redis"
REDIS_PORT
:
"
6379"
REDIS_PORT
:
"
6379"
volumes
:
-
./voice_agent/embeddings:/app/embeddings
-
./voice_agent/All_Curriculums_grouped.json:/app/All_Curriculums_grouped.json
depends_on
:
depends_on
:
-
minio
-
minio
-
postgres
-
postgres
...
@@ -71,4 +76,4 @@ services:
...
@@ -71,4 +76,4 @@ services:
volumes
:
volumes
:
pgdata
:
pgdata
:
miniodata
:
miniodata
:
redisdata
:
redisdata
:
\ No newline at end of file
self_hosted_env/voice_agent/Dockerfile
View file @
498fa39d
...
@@ -3,8 +3,12 @@ FROM python:3.10-slim
...
@@ -3,8 +3,12 @@ FROM python:3.10-slim
WORKDIR
/app
WORKDIR
/app
# Install postgresql-client for pg_isready
# Install postgresql-client for pg_isready
RUN
apt-get update
&&
apt-get
install
-y
\
RUN
apt-get update
&&
apt-get
install
-y
--no-install-recommends
\
postgresql-client
\
postgresql-client
\
tesseract-ocr
\
tesseract-ocr-ara
\
libtesseract-dev
\
poppler-utils
\
&&
rm
-rf
/var/lib/apt/lists/
*
&&
rm
-rf
/var/lib/apt/lists/
*
# Install Python dependencies
# Install Python dependencies
...
...
self_hosted_env/voice_agent/main.py
View file @
498fa39d
This diff is collapsed.
Click to expand it.
self_hosted_env/voice_agent/process_pdf_pipline.py
0 → 100644
View file @
498fa39d
This diff is collapsed.
Click to expand it.
self_hosted_env/voice_agent/requirements.txt
View file @
498fa39d
...
@@ -12,4 +12,14 @@ pandas
...
@@ -12,4 +12,14 @@ pandas
python-dotenv
python-dotenv
httpx
httpx
langdetect
langdetect
redis
redis
\ No newline at end of file
pdf2image
pytesseract
fuzzywuzzy
python-Levenshtein
tqdm
google-generativeai
pydantic
opencv-python-headless
numpy
Pillow
\ No newline at end of file
self_hosted_env/voice_agent/services/__init__.py
View file @
498fa39d
...
@@ -9,4 +9,5 @@ from .pgvector_service import PGVectorService
...
@@ -9,4 +9,5 @@ from .pgvector_service import PGVectorService
from
.chat_database_service
import
ChatDatabaseService
from
.chat_database_service
import
ChatDatabaseService
from
.connection_pool
import
ConnectionPool
from
.connection_pool
import
ConnectionPool
from
.pedagogy_service
import
PedagogyService
from
.pedagogy_service
import
PedagogyService
from
.segmentation_service
import
LanguageSegmentationService
from
.segmentation_service
import
LanguageSegmentationService
\ No newline at end of file
from
.data_ingestion_service
import
DataIngestionService
\ No newline at end of file
self_hosted_env/voice_agent/services/data_ingestion_service.py
0 → 100644
View file @
498fa39d
import
psycopg2
import
pandas
as
pd
import
json
from
pgvector.psycopg2
import
register_vector
from
typing
import
Dict
class DataIngestionService:
    """A service dedicated to inserting new curriculum data into the database.

    Every database operation borrows a connection from ``pool_handler`` via
    its ``get_connection()`` context manager.
    """

    def __init__(self, pool_handler):
        """Store the pool handler used to obtain database connections."""
        self.pool_handler = pool_handler

    def ingest_curriculum_structure(self, curriculum_json_data: Dict):
        """Upsert curriculum structures into the ``curriculum_structure`` table.

        This logic is adapted from the standalone curriculum_structure.py
        script.

        Args:
            curriculum_json_data: Mapping whose keys unpack to
                ``(grade, is_arabic, subject)`` and whose values are
                JSON-serializable curriculum structures.

        Each item is committed on its own. A failing item is rolled back and
        reported, and the items that were already stored stay stored; a single
        transaction for the whole loop would lose them on the first rollback.
        """
        print("Inserting curriculum structure data...")

        upsert_query = """
            INSERT INTO curriculum_structure (grade, is_arabic, subject, curriculum_data)
            VALUES (%s, %s, %s, %s)
            ON CONFLICT (grade, is_arabic, subject)
            DO UPDATE SET curriculum_data = EXCLUDED.curriculum_data;
        """

        with self.pool_handler.get_connection() as conn:
            with conn.cursor() as cur:
                for (grade, is_arabic, subject), curriculum in curriculum_json_data.items():
                    try:
                        cur.execute(
                            upsert_query,
                            (grade, is_arabic, subject, json.dumps(curriculum)),
                        )
                        # Commit per item so a later failure cannot undo it.
                        conn.commit()
                    except Exception as e:
                        print(f"❌ Error ingesting structure for Grade {grade}: {e}")
                        # Only the failed item is pending at this point.
                        conn.rollback()
                    else:
                        print(
                            f"✅ Ingested structure for Grade {grade} "
                            f"({'Arabic' if is_arabic else 'English'})"
                        )

        print("Curriculum structure ingestion complete.")

    def ingest_embeddings_from_csv(self, df: pd.DataFrame):
        """Insert embedding chunks from a DataFrame into ``educational_chunks``.

        This logic is adapted from the standalone insert_csv_embeddings.py
        script.

        Args:
            df: DataFrame with the columns ``Grade``, ``Subject``,
                ``From page``, ``To page``, ``Chunk index``, ``Chunk text``,
                ``Is Arabic`` and ``Embedding``; ``Unit``, ``Concept`` and
                ``Lesson`` are read with ``row.get`` and may be absent.

        Rows that fail conversion are reported and skipped. If no row
        survives, nothing is written to the database.
        """
        print("Inserting embeddings from CSV data...")

        insert_query = """
            INSERT INTO educational_chunks
            (grade, subject, unit, concept, lesson, from_page, to_page, chunk_index, chunk_text, is_arabic, embedding)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """

        records_to_insert = []
        for _, row in df.iterrows():
            try:
                # The embedding may arrive as a JSON string or as a ready list.
                raw_embedding = row["Embedding"]
                embedding = (
                    json.loads(raw_embedding)
                    if isinstance(raw_embedding, str)
                    else raw_embedding
                )
                # NOTE(review): bool() of a non-empty string such as "False"
                # is True - confirm "Is Arabic" is never stored as text.
                records_to_insert.append((
                    int(row["Grade"]),
                    row["Subject"],
                    row.get("Unit"),
                    row.get("Concept"),
                    row.get("Lesson"),
                    int(row["From page"]),
                    int(row["To page"]),
                    int(row["Chunk index"]),
                    row["Chunk text"],
                    bool(row["Is Arabic"]),
                    embedding,
                ))
            except Exception as e:
                print(f"Skipping row due to malformed data: {e}")

        if not records_to_insert:
            print("No valid records to insert.")
            return

        # psycopg2.extras is a submodule; a bare `import psycopg2` is not
        # guaranteed to load it, so import the helper explicitly.
        from psycopg2.extras import execute_batch

        with self.pool_handler.get_connection() as conn:
            with conn.cursor() as cur:
                # Use execute_batch for efficient insertion
                execute_batch(cur, insert_query, records_to_insert)
            conn.commit()

        print(f"✅ Ingested {len(records_to_insert)} embedding chunks successfully.")
\ No newline at end of file
self_hosted_env/voice_agent/services/pgvector_service.py
View file @
498fa39d
...
@@ -483,4 +483,44 @@ class PGVectorService:
...
@@ -483,4 +483,44 @@ class PGVectorService:
FROM curriculum_structure
FROM curriculum_structure
ORDER BY grade, is_arabic, subject;
ORDER BY grade, is_arabic, subject;
"""
)
"""
)
return
cur
.
fetchall
()
return
cur
.
fetchall
()
\ No newline at end of file
def verify_recent_insertions(self, limit: int = 5):
    """Print the newest rows of ``educational_chunks`` as a sanity check.

    Args:
        limit: Maximum number of rows to show, taken in descending ``id``
            order.

    Any exception raised while querying or printing is caught and reported
    on stdout instead of propagating.
    """
    banner = "=" * 50
    recent_chunks_query = """
        SELECT id, grade, subject, unit, concept, chunk_text, is_arabic
        FROM educational_chunks
        ORDER BY id DESC
        LIMIT %s;
    """

    print("\n" + banner)
    print("🔍 Verifying recent embeddings in the database...")
    print(banner)

    try:
        with self.pool_handler.get_connection() as connection:
            with connection.cursor(cursor_factory=RealDictCursor) as cursor:
                # Highest ids first, capped at `limit` rows.
                cursor.execute(recent_chunks_query, (limit,))
                results = cursor.fetchall()

                if not results:
                    print("❌ No data found in the 'educational_chunks' table.")
                    return

                print(f"✅ Found {len(results)} recent records. Here they are:\n")
                for row in results:
                    print(f" - ID: {row['id']}, Grade: {row['grade']}, Arabic: {row['is_arabic']}")
                    print(f" Unit: {row['unit']}")
                    print(f" Concept: {row['concept']}")
                    # Only the first 80 characters of the chunk are shown.
                    print(f" Text: '{row['chunk_text'][:80]}...'\n")
                print(banner)
    except Exception as e:
        print(f"❌ Database verification failed: {e}")
\ No newline at end of file
self_hosted_env/voice_agent/static/curriculum_PDF_uploader.html
0 → 100644
View file @
498fa39d
<!DOCTYPE html>
<html
lang=
"en"
>
<head>
<meta
charset=
"UTF-8"
>
<meta
name=
"viewport"
content=
"width=device-width, initial-scale=1.0"
>
<title>
Curriculum PDF Uploader
</title>
<style>
body
{
font-family
:
-apple-system
,
BlinkMacSystemFont
,
"Segoe UI"
,
Roboto
,
sans-serif
;
max-width
:
700px
;
margin
:
40px
auto
;
padding
:
20px
;
background-color
:
#f9f9f9
;
color
:
#333
;
line-height
:
1.6
;
}
.container
{
background
:
white
;
padding
:
30px
;
border-radius
:
8px
;
box-shadow
:
0
4px
15px
rgba
(
0
,
0
,
0
,
0.1
);
}
h1
{
text-align
:
center
;
color
:
#2c3e50
;
}
input
[
type
=
"file"
]
{
display
:
block
;
margin-bottom
:
20px
;
border
:
2px
dashed
#ccc
;
padding
:
20px
;
border-radius
:
5px
;
width
:
95%
;
text-align
:
center
;
cursor
:
pointer
;
}
input
[
type
=
"file"
]
::file-selector-button
{
padding
:
10px
15px
;
border-radius
:
5px
;
border
:
none
;
background-color
:
#3498db
;
color
:
white
;
cursor
:
pointer
;
transition
:
background-color
0.2s
;
}
input
[
type
=
"file"
]
::file-selector-button:hover
{
background-color
:
#2980b9
;
}
button
{
display
:
block
;
width
:
100%
;
padding
:
12px
;
font-size
:
16px
;
font-weight
:
bold
;
background
:
#27ae60
;
color
:
white
;
border
:
none
;
border-radius
:
5px
;
cursor
:
pointer
;
transition
:
background
0.2s
;
}
button
:hover
{
background
:
#229954
;
}
button
:disabled
{
background
:
#95a5a6
;
cursor
:
not-allowed
;
}
.status
{
margin-top
:
20px
;
padding
:
15px
;
border-radius
:
5px
;
font-weight
:
bold
;
display
:
none
;
/* Hidden by default */
}
.status.success
{
background-color
:
#d4edda
;
color
:
#155724
;
border
:
1px
solid
#c3e6cb
;
}
.status.error
{
background-color
:
#f8d7da
;
color
:
#721c24
;
border
:
1px
solid
#f5c6cb
;
}
.status.processing
{
background-color
:
#e7f3ff
;
color
:
#004085
;
border
:
1px
solid
#b3d9ff
;
}
pre
{
background-color
:
#ecf0f1
;
padding
:
15px
;
border-radius
:
5px
;
white-space
:
pre-wrap
;
word-wrap
:
break-word
;
}
</style>
</head>
<body>
<div
class=
"container"
>
<h1>
Curriculum PDF Uploader
</h1>
<div
class=
"form-group"
style=
"margin-bottom: 20px;"
>
<label
for=
"gradeInput"
style=
"display: block; margin-bottom: 5px; font-weight: bold;"
>
Grade:
</label>
<input
type=
"number"
id=
"gradeInput"
value=
"4"
style=
"width: 98%; padding: 10px; border: 1px solid #ccc; border-radius: 5px;"
>
</div>
<div
class=
"form-group"
style=
"margin-bottom: 20px;"
>
<label
for=
"subjectInput"
style=
"display: block; margin-bottom: 5px; font-weight: bold;"
>
Subject:
</label>
<input
type=
"text"
id=
"subjectInput"
value=
"Science"
style=
"width: 98%; padding: 10px; border: 1px solid #ccc; border-radius: 5px;"
>
</div>
<input
type=
"file"
id=
"pdfFile"
accept=
".pdf"
>
<button
id=
"uploadButton"
>
Upload and Process Curriculum
</button>
<div
id=
"status"
></div>
<pre
id=
"response"
style=
"display:none;"
></pre>
</div>
<script>
// Endpoint that receives the multipart upload (file, grade, subject).
const API_URL = 'http://localhost:8000/process-curriculum';

// Cache the DOM elements the handler reads from and writes to.
const pdfFileInput = document.getElementById('pdfFile');
const uploadButton = document.getElementById('uploadButton');
const statusDiv = document.getElementById('status');
const responsePre = document.getElementById('response');
const gradeInput = document.getElementById('gradeInput');
const subjectInput = document.getElementById('subjectInput');

uploadButton.addEventListener('click', async () => {
    const selectedFile = pdfFileInput.files[0];
    const grade = gradeInput.value;
    // Trim so a whitespace-only subject is rejected like an empty one.
    const subject = subjectInput.value.trim();

    // 1. Validate every input before sending anything.
    if (!selectedFile) {
        showStatus('Please select a PDF file first.', 'error');
        return;
    }
    if (!grade) {
        showStatus('Please enter a grade.', 'error');
        return;
    }
    if (!subject) {
        showStatus('Please enter a subject.', 'error');
        return;
    }

    // 2. Build the multipart body.
    const formData = new FormData();
    formData.append('file', selectedFile);
    formData.append('grade', grade);
    formData.append('subject', subject);

    // 3. Update UI to show processing state
    showStatus('Uploading and starting background processing...', 'processing');
    uploadButton.disabled = true;
    responsePre.style.display = 'none';

    try {
        // 4. Send the file, grade and subject to the API
        const response = await fetch(API_URL, {
            method: 'POST',
            body: formData,
        });

        // The body is not guaranteed to be JSON (for example an HTML error
        // page from a proxy), so a parse failure must not hide the status.
        let responseData = null;
        try {
            responseData = await response.json();
        } catch (parseError) {
            responseData = null;
        }

        // 5. Handle the server's response
        if (!response.ok) {
            const detail = responseData && responseData.detail;
            let message;
            if (typeof detail === 'string') {
                message = detail;
            } else if (detail) {
                // A structured detail would otherwise print "[object Object]".
                message = JSON.stringify(detail);
            } else {
                message = `Server error: ${response.status} ${response.statusText}`;
            }
            throw new Error(message);
        }

        showStatus('Success! The server has started processing your file in the background.', 'success');
        if (responseData !== null) {
            responsePre.textContent = JSON.stringify(responseData, null, 2);
            responsePre.style.display = 'block';
        }
    } catch (error) {
        showStatus(`An error occurred: ${error.message}`, 'error');
    } finally {
        // 6. Re-enable the button
        uploadButton.disabled = false;
    }
});
/**
 * Display a status message in the status box.
 *
 * @param {string} message - Text to show.
 * @param {string} type - Modifier class added after "status"
 *     (the stylesheet defines "success", "error" and "processing").
 */
function showStatus(message, type) {
    const cssClasses = ['status', type];
    // The box is hidden by default in the stylesheet, so reveal it.
    statusDiv.style.display = 'block';
    statusDiv.className = cssClasses.join(' ');
    statusDiv.textContent = message;
}
</script>
</body>
</html>
\ No newline at end of file
self_hosted_env/voice_agent/voice_agent.tar
deleted
100644 → 0
View file @
a558073c
File deleted
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment