Salma Mohammed Hamed / AI Tutor - Commits

Commit 6c10aac1
Authored Oct 26, 2025 by SalmaMohammedHamedMustafa
Commit message: mcq AI builder
Parent: 8d42c50c

Showing 7 changed files with 675 additions and 91 deletions (+675, -91)
self_hosted_env/voice_agent/main.py                        +71,  -1
self_hosted_env/voice_agent/process_pdf_pipline.py         +114, -89
self_hosted_env/voice_agent/services/agent_service.py      +130, -0
self_hosted_env/voice_agent/services/pgvector_service.py   +57,  -1
self_hosted_env/voice_agent/setup_mcq_table.py             +66,  -0
self_hosted_env/voice_agent/start.sh                       +2,   -0
test_cases/msq_test.py                                     +235, -0
self_hosted_env/voice_agent/main.py (view file @ 6c10aac1)

Changes: `logger` is added to the fastapi import, `import logging` is added, a `logger` is created inside create_app(), and two new MCQ endpoints are registered after the curriculum-processing endpoint.

import os
import shutil
from fastapi import FastAPI, UploadFile, File, Form, HTTPException, Request, BackgroundTasks, logger
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse, Response
from fastapi.staticfiles import StaticFiles
...

@@ -12,6 +12,7 @@ from pathlib import Path
import tempfile
import json
import pandas as pd
import logging
from curriculum_structure import convert_json_to_db_format
from process_pdf_pipline import run_full_pipeline
...

@@ -96,6 +97,7 @@ async def lifespan(app: FastAPI):
def create_app() -> FastAPI:
    # Connect the lifespan manager to your FastAPI app instance
    app = FastAPI(title="Unified Chat API with Local Agent", lifespan=lifespan)
    logger = logging.getLogger("uvicorn.error")
    # Fixed CORS configuration for CapRover
    app.add_middleware(
...

@@ -336,6 +338,74 @@ def create_app() -> FastAPI:
        return {"status": "processing_started", "message": "The curriculum is being processed in the background."}

    @app.post("/mcq/generate")
    async def generate_mcqs_handler(
        request: Request,
        grade: int = Form(...),
        subject: str = Form(...),
        unit: str = Form(...),
        concept: str = Form(...),
        count: int = Form(5),
        is_arabic: bool = Form(False)
    ):
        """
        Generates and stores a new set of MCQs for a specific topic.
        """
        container = request.app.state.container
        try:
            generated_questions = container.agent_service.generate_and_store_mcqs(
                grade=grade,
                subject=subject,
                unit=unit,
                concept=concept,
                num_questions=count,
                is_arabic=is_arabic
            )
            return {
                "status": "success",
                "message": f"Successfully generated and stored {len(generated_questions)} MCQs.",
                "questions": generated_questions
            }
        except HTTPException as e:
            raise e  # Re-raise FastAPI specific exceptions
        except Exception as e:
            logger.error(f"Error in generate_mcqs_handler: {e}")
            raise HTTPException(status_code=500, detail=str(e))

    @app.get("/mcq")
    async def get_mcqs_handler(
        request: Request,
        grade: int,
        subject: str,
        unit: str,
        concept: str,
        is_arabic: bool,
        # Make limit optional. If not provided, it will be None.
        limit: Optional[int] = None
    ):
        """
        Retrieves existing MCQs for a specific topic and language from the database.
        If no limit is provided, retrieves all questions.
        """
        container = request.app.state.container
        try:
            questions = container.agent_service.pgvector.get_mcqs(
                grade=grade,
                subject=subject,
                unit=unit,
                concept=concept,
                is_arabic=is_arabic,
                limit=limit  # Pass the limit (which could be None)
            )
            return {
                "status": "success",
                "count": len(questions),
                "questions": questions
            }
        except Exception as e:
            logger.error(f"Error in get_mcqs_handler: {e}")
            raise HTTPException(status_code=500, detail=str(e))

    @app.options("/get-audio-response")
    async def audio_response_options():
        """Handle preflight CORS requests for audio response endpoint"""
        ...
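For reference, a successful response from POST /mcq/generate follows the shape the handler above returns; the question fields mirror what agent_service stores. The values below are placeholders, not real data:

# Placeholder values; the real text comes from the LLM and the curriculum context.
{
    "status": "success",
    "message": "Successfully generated and stored 3 MCQs.",
    "questions": [
        {
            "grade": 4, "is_arabic": False, "subject": "Science",
            "unit": "...", "concept": "...",
            "question_text": "...",
            "correct_answer": "...",
            "wrong_answer_1": "...", "wrong_answer_2": "...", "wrong_answer_3": "..."
        }
    ]
}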
self_hosted_env/voice_agent/process_pdf_pipline.py (view file @ 6c10aac1)

@@ -19,7 +19,7 @@ import json
import numpy as np
from openai import OpenAI
from typing import List, Dict, Union, Any, Optional, Tuple
from pydantic import BaseModel, Field, ValidationError
import google.generativeai as genai
from google.generativeai import types
import csv
...

(`ValidationError` is newly imported from pydantic.)

@@ -27,7 +27,7 @@ import logging
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# change level=logging.DEBUG for more verbose output during debugging
# =========================
# 1. Initialization and Setup
...

@@ -432,9 +432,6 @@ Output must be ONLY a valid JSON object (no markdown, no extra text) conforming
        )
        initial_json_string = resp_initial.text.strip()
        initial_data = json.loads(initial_json_string)
        book_structure = BookStructure(**initial_data)
        logging.info("✅ Gemini initial structure extraction successful.")
...

Removed from that hunk (the internal bookkeeping fields are no longer injected before validation):
        # Add internal fields
        initial_data['_page_count'] = pdf_total_pages
        initial_data['_source_file'] = "N/A"  # Placeholder

@@ -707,23 +704,54 @@ class EmbeddingProcessor:
            logging.error(f"❌ Error generating embedding for text: '{text[:100]}...' - {str(e)}", exc_info=True)
            return [0.0] * Config.OPENAI_EMBEDDING_DIMENSION

Removed, the per-character Arabic detector:

    def detect_arabic_text(self, text: str) -> bool:
        """Simple detection of Arabic text based on character ranges."""
        if not text or pd.isna(text):
            return False
        text = str(text)
        arabic_chars = 0
        total_chars = 0
        for char in text:
            if char.strip():
                total_chars += 1
                if ('\u0600' <= char <= '\u06FF') or ('\u0750' <= char <= '\u077F') or \
                   ('\u08A0' <= char <= '\u08FF') or ('\uFB50' <= char <= '\uFDFF') or \
                   ('\uFE70' <= char <= '\uFEFF'):
                    arabic_chars += 1
        return total_chars > 0 and (arabic_chars / total_chars) > Config.ARABIC_CHAR_THRESHOLD

Added in its place, a hierarchical page-to-context lookup:

    def find_context_for_page(self, page_num: int, structured_data: Dict[str, Any]) -> Optional[Dict[str, str]]:
        """
        Robust hierarchical lookup. It first tries to find a specific Lesson,
        then falls back to finding a Concept, and finally a Unit.
        """
        # --- Level 1: Try to find an exact LESSON match ---
        for unit_data in structured_data.get("units", []):
            unit_name = unit_data.get("unit_name", "Unknown Unit")
            for concept_data in unit_data.get("concepts", []):
                concept_name = concept_data.get("concept_name", "Unknown Concept")
                for lesson_data in concept_data.get("lessons", []):
                    pages = lesson_data.get("pages")
                    if pages and pages.get('start_page', 0) <= page_num <= pages.get('end_page', 0):
                        return {
                            'Unit': unit_name,
                            'Concept': concept_name,
                            'Lesson': lesson_data.get("lesson_name", "Unknown Lesson")
                        }

        # --- Level 2: If no lesson matched, try a CONCEPT match ---
        # This perfectly handles your scenario where Concept ranges are good but Lessons are bad.
        for unit_data in structured_data.get("units", []):
            unit_name = unit_data.get("unit_name", "Unknown Unit")
            for concept_data in unit_data.get("concepts", []):
                pages = concept_data.get("pages")
                # We check if the page is within the CONCEPT'S overall range
                if pages and pages.get('start_page', 0) <= page_num <= pages.get('end_page', 0):
                    return {
                        'Unit': unit_name,
                        'Concept': concept_data.get("concept_name", "Unknown Concept"),
                        # We label it as general content for this concept since it didn't match a specific lesson
                        'Lesson': f"General content for {concept_data.get('concept_name', 'Concept')}"
                    }

        # --- Level 3: If still nothing, try a UNIT match ---
        for unit_data in structured_data.get("units", []):
            pages = unit_data.get("pages")
            if pages and pages.get('start_page', 0) <= page_num <= pages.get('end_page', 0):
                return {
                    'Unit': unit_data.get("unit_name", "Unknown Unit"),
                    'Concept': "General Unit Content",
                    'Lesson': f"General content for {unit_data.get('unit_name', 'Unit')}"
                }

        return None  # Page completely outside any known structure
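To make the three-level fallback concrete, here is a small illustrative walk-through. The structure dict and page numbers are invented for the example, and `processor` is assumed to be an EmbeddingProcessor instance; only find_context_for_page itself comes from the file above:

# Illustrative only: one unit (pages 1-30), one concept (pages 1-15), one lesson (pages 1-5).
sample_structure = {
    "units": [{
        "unit_name": "Unit 1",
        "pages": {"start_page": 1, "end_page": 30},
        "concepts": [{
            "concept_name": "Concept 1.1",
            "pages": {"start_page": 1, "end_page": 15},
            "lessons": [{
                "lesson_name": "Lesson 1",
                "pages": {"start_page": 1, "end_page": 5},
            }],
        }],
    }]
}

processor.find_context_for_page(3, sample_structure)
# Level 1 hit  -> {'Unit': 'Unit 1', 'Concept': 'Concept 1.1', 'Lesson': 'Lesson 1'}
processor.find_context_for_page(10, sample_structure)
# Level 2 hit  -> Lesson becomes "General content for Concept 1.1"
processor.find_context_for_page(25, sample_structure)
# Level 3 hit  -> Concept becomes "General Unit Content"
processor.find_context_for_page(99, sample_structure)
# -> None (page outside any known structure)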
@@ -733,75 +761,70 @@ class EmbeddingProcessor:

The lesson-based embedding routine is removed and replaced by a page-based, gapless version.

Removed:

    def process_structured_data_for_embeddings(
        self,
        structured_data: Dict[str, Any],
        page_texts: Dict[int, str],
        ...
        output_csv_path: str
    ):
        """
        Takes Gemini-extracted data and generates embeddings.
        This version has a more robust repair logic for missing page numbers.
        """
        if not structured_data or not structured_data.get("units"):
            logging.warning("❌ No structured units found for embedding.")
            return

        logging.info(f"Generating robust embeddings for grade {grade}, {lang} content...")
        output_rows = []
        is_arabic = (lang == "arabic")
        total_pages = max(page_texts.keys()) if page_texts else 0
        last_known_page = 0

        for unit_data in tqdm(structured_data.get("units", []), desc="Processing Units for Embeddings"):
            unit_name = unit_data.get("unit_name", "Unknown Unit")
            for concept_data in unit_data.get("concepts", []):
                concept_name = concept_data.get("concept_name", "Unknown Concept")
                for lesson_data in concept_data.get("lessons", []):
                    lesson_name = lesson_data.get("lesson_name", "Unknown Lesson")
                    pages = lesson_data.get("pages")
                    start_page, end_page = 0, 0
                    if pages and pages.get('start_page', 0) > 0 and pages.get('end_page', 0) > 0:
                        # This is the "happy path": Gemini gave us valid pages
                        start_page, end_page = pages['start_page'], pages['end_page']
                    else:
                        # --- THIS IS THE NEW, SMARTER REPAIR LOGIC ---
                        # Suggest a start page based on the last known page
                        suggested_start = last_known_page + 1
                        # If our suggestion is already past the end of the book, we can't continue.
                        if suggested_start > total_pages:
                            logging.warning(f" -> Skipping Lesson '{lesson_name}': Suggested start ({suggested_start}) is beyond total pages ({total_pages}).")
                            continue  # Move to the next lesson
                        start_page = suggested_start
                        # Ensure the end page is at least the start page, and not past the end of the book.
                        end_page = min(start_page + 4, total_pages)
                        logging.warning(f" -> Repairing Lesson '{lesson_name}': Applying default pages [{start_page}-{end_page}]")

                    # Final safety check, although the logic above should prevent this.
                    if start_page > end_page:
                        logging.error(f" -> CRITICAL SKIP for Lesson '{lesson_name}': Invalid final page range [{start_page}-{end_page}]")
                        continue

                    # Update the tracker for the next iteration with a valid page number
                    last_known_page = end_page

                    lesson_full_text = " ".join([page_texts.get(p, "") for p in range(start_page, end_page + 1)])
                    for chunk_idx, chunk_text in enumerate(self.chunk_text(lesson_full_text, is_arabic)):
                        if not chunk_text:
                            continue
                        output_rows.append({
                            'Grade': grade,
                            'Subject': subject,
                            'Unit': unit_name,
                            'Concept': concept_name,
                            'Lesson': lesson_name,
                            'From page': start_page,
                            'To page': end_page,
                            'Chunk index': chunk_idx,
                            'Chunk text': chunk_text,
                            'Is Arabic': is_arabic,
                            'Embedding': json.dumps(self.get_embedding(chunk_text))
                        })

        df = pd.DataFrame(output_rows)
        df.to_csv(output_csv_path, index=False, quoting=csv.QUOTE_MINIMAL, encoding="utf-8-sig")
        logging.info(f"✅ Embeddings saved to: {output_csv_path} ({len(output_rows)} chunks generated)")

Added:

    def process_all_pages_for_embeddings_gapless(
        self,
        structured_data: Dict[str, Any],
        page_texts: Dict[int, str],
        ...
        output_csv_path: str
    ):
        """
        Processes EVERY page from the OCR output to guarantee 100% content coverage.
        It uses the Gemini structure as a lookup to label pages, and applies the last
        known label to any pages that fall in gaps within the structure.
        """
        if not page_texts:
            logging.warning("❌ No page texts available to process for embeddings.")
            return

        logging.info(f"🚀 Starting GAPLESS embedding generation for Grade {grade}, Subject {subject}.")
        output_rows = []
        is_arabic = (lang == "arabic")

        # Initialize a fallback context for pages that are not in the structure (gaps)
        last_known_context = {
            'Unit': 'Uncategorized',
            'Concept': 'Uncategorized',
            'Lesson': 'Uncategorized'
        }

        # The main loop iterates through ALL pages, guaranteeing no gaps.
        for page_num in tqdm(sorted(page_texts.keys()), desc="Embedding All Pages"):
            page_text = page_texts[page_num]
            if not page_text.strip():
                continue  # Skip empty pages

            # Find the context for the current page from Gemini's structure
            context = self.find_context_for_page(page_num, structured_data)
            if context:
                # If found, this is our new "last known" good context
                last_known_context = context
                current_context = context
            else:
                # If not found (page is in a gap), use the last valid context we saw.
                logging.warning(f" -> Page {page_num} not in structure. Applying last known context: '{last_known_context['Lesson']}'")
                current_context = last_known_context

            # Chunk the text of the single page
            for chunk_idx, chunk_text in enumerate(self.chunk_text(page_text, is_arabic)):
                if not chunk_text:
                    continue
                output_rows.append({
                    'Grade': grade,
                    'Subject': subject,
                    'Unit': current_context['Unit'],
                    'Concept': current_context['Concept'],
                    'Lesson': current_context['Lesson'],
                    'From page': page_num,  # Metadata is now per-page
                    'To page': page_num,
                    'Chunk index': chunk_idx,
                    'Chunk text': chunk_text,
                    'Is Arabic': is_arabic,
                    'Embedding': json.dumps(self.get_embedding(chunk_text))
                })

        df = pd.DataFrame(output_rows)
        df.to_csv(output_csv_path, index=False, quoting=csv.QUOTE_MINIMAL, encoding="utf-8-sig")
        logging.info(f"✅ Gapless embeddings saved to: {output_csv_path} ({len(output_rows)} chunks generated from {len(page_texts)} pages)")
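The loop above relies on EmbeddingProcessor.chunk_text and get_embedding, which are not part of this diff. As a rough mental model only, and explicitly not the repository's actual implementation, a page-level chunker could look like this:

# Hypothetical sketch, NOT the project's chunk_text: split a page into
# fixed-size word windows so each chunk stays within embedding input limits.
def chunk_text_sketch(text: str, max_words: int = 200) -> list[str]:
    words = text.split()
    return [
        " ".join(words[i:i + max_words])
        for i in range(0, len(words), max_words)
    ]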
# process_pdf_pipline.py
# =========================
# 8. Main Pipeline Function
# =========================
def repair_and_enrich_structure(gemini_output: Dict, lang: str, grade: str, total_pages: int) -> Dict[str, Any]:
    logging.warning("🔧 Sanitizing and repairing Gemini's output...")
...

(The "8. Main Pipeline Function" banner now precedes repair_and_enrich_structure; it previously sat just before run_full_pipeline.)

@@ -831,16 +854,15 @@ def repair_and_enrich_structure(gemini_output: Dict, lang: str, grade: str, tota
    return gemini_output

def run_full_pipeline(pdf_path: str, grade: int, subject: str, output_json_path: str, output_embeddings_csv_path: str, remove_lessons: bool = False):
    logging.info(f"\n--- Starting Pipeline for {pdf_path} (Grade: {grade}, Subject: {subject}) ---")
    gemini_raw_output = {}
    try:
        page_texts, lang, _, tracked_titles = process_pdf_to_text(pdf_path)
        if not page_texts:
            logging.critical("❌ CRITICAL: No text could be extracted from the PDF. Aborting.")
            return
        pdf_total_pages = max(page_texts.keys())
        toc_contents = extract_toc_pages_from_first_n(page_texts, lang)
...

(New in this hunk: gemini_raw_output is initialized, and the empty page_texts case now logs a critical message before returning instead of returning silently.)

@@ -871,7 +893,9 @@ def run_full_pipeline(pdf_path: str, grade: int, subject: str, output_json_path:
        if openai_client:
            embedding_processor = EmbeddingProcessor(client=openai_client)
            # --- THIS IS THE FINAL, ROBUST CALL ---
            embedding_processor.process_all_pages_for_embeddings_gapless(
                gemini_structured_data,
                page_texts,
                lang,
...

(The pipeline previously called process_structured_data_for_embeddings here.)

@@ -879,7 +903,8 @@ def run_full_pipeline(pdf_path: str, grade: int, subject: str, output_json_path:
                subject,
                output_embeddings_csv_path
            )
    except Exception as e:
        logging.critical(f"Pipeline error: {e}", exc_info=True)

    logging.info(f"\n--- Pipeline finished for {pdf_path} ---")
(no newline at end of file)
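For orientation, a call into the pipeline with the signature shown above might look like the sketch below. The paths and grade/subject values are illustrative only, not taken from the repository:

# Illustrative invocation; file paths and values are hypothetical.
run_full_pipeline(
    pdf_path="books/grade4_science.pdf",
    grade=4,
    subject="Science",
    output_json_path="out/grade4_science_structure.json",
    output_embeddings_csv_path="out/grade4_science_embeddings.csv",
    remove_lessons=False,
)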
self_hosted_env/voice_agent/services/agent_service.py (view file @ 6c10aac1)

@@ -3,6 +3,7 @@ import os
from typing import List, Dict, Optional
from fastapi import HTTPException
import sys
import json
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from core import StudentNationality, Models
...

(`import json` is the newly added line.)

@@ -94,3 +95,132 @@ class AgentService:
        except Exception as e:
            logger.error(f"Error closing connection pools: {e}")

    def generate_and_store_mcqs(self, grade: int, subject: str, unit: str, concept: str, is_arabic: bool, num_questions: int = 5) -> List[Dict]:
        """
        Generates NEW, UNIQUE MCQs for a topic by first retrieving existing ones
        and instructing the AI to avoid generating duplicates.
        """
        if not self.pgvector:
            raise HTTPException(status_code=503, detail="Vector service is not available for context retrieval.")

        # === STEP 1: RETRIEVE EXISTING QUESTIONS ===
        logger.info(f"Checking for existing questions for: {grade}/{subject}/{unit}/{concept}")
        existing_questions = self.pgvector.get_mcqs(
            grade, subject, unit, concept, is_arabic,
            limit=None  # Fetch ALL existing questions
        )

        existing_questions_text = "No existing questions found."
        if existing_questions:
            # Format the existing questions into a simple list for the prompt
            q_list = [f"- {q['question_text']}" for q in existing_questions]
            existing_questions_text = "\n".join(q_list)
            logger.info(f"Found {len(existing_questions)} existing questions. Will instruct AI to generate different ones.")

        # === STEP 2: RETRIEVE CURRICULUM CONTEXT ===
        search_query = f"summary of {concept} in {unit}"
        query_embedding = self.openai_service.generate_embedding(search_query)
        context_chunks = self.pgvector.search_filtered_nearest(query_embedding, grade, subject, is_arabic, limit=10)
        if not context_chunks:
            raise HTTPException(status_code=404, detail="No curriculum context found for this topic in the specified language.")
        full_context = "\n---\n".join([chunk['chunk_text'] for chunk in context_chunks])

        # === STEP 3: CREATE THE ADVANCED, AWARE PROMPT ===
        if is_arabic:
            # Arabic prompt: same structure as the English prompt below. It lists the existing
            # questions, supplies the retrieved curriculum context, and asks for {num_questions}
            # new, completely different MCQs returned as a JSON array with the same keys.
            prompt = f"""
أنت خبير في تطوير المناهج ومهمتك إنشاء أسئلة اختيار من متعدد جديدة ومختلفة.

هذه هي الأسئلة الموجودة حاليًا في قاعدة البيانات حول المفهوم "{concept}":
--- الأسئلة الحالية ---
{existing_questions_text}
--- نهاية الأسئلة الحالية ---

اعتمادًا فقط على السياق التالي من المنهج:
--- السياق ---
{full_context}
--- نهاية السياق ---

يرجى توليد {num_questions} من أسئلة الاختيار من متعدد **الجديدة والمختلفة تمامًا** عن الأسئلة الموجودة أعلاه.
يجب أن تكون كل الأسئلة قابلة للإجابة مباشرة من السياق المقدم.
يجب أن يكون ردك مصفوفة JSON صحيحة. كل كائن يجب أن يحتوي على المفاتيح التالية:
- "question_text": نص السؤال.
- "correct_answer": الإجابة الصحيحة.
- "wrong_answer_1": إجابة خاطئة.
- "wrong_answer_2": إجابة خاطئة.
- "wrong_answer_3": إجابة خاطئة.
لا تكتب أي نص أو شرح خارج مصفوفة الـ JSON.
"""
        else:
            prompt = f"""
You are an expert curriculum developer tasked with creating new and unique multiple-choice questions.

Here are the questions that ALREADY EXIST in the database for the concept "{concept}":
--- EXISTING QUESTIONS ---
{existing_questions_text}
--- END EXISTING QUESTIONS ---

Based ONLY on the following context from the curriculum:
--- CONTEXT ---
{full_context}
--- END CONTEXT ---

Please generate {num_questions} NEW and COMPLETELY DIFFERENT multiple-choice questions from the list of existing ones above.
Each question must be answerable directly from the provided context. The questions and all answers MUST be in English.
Your response MUST be a valid JSON array of objects with these keys:
- "question_text"
- "correct_answer"
- "wrong_answer_1"
- "wrong_answer_2"
- "wrong_answer_3"
Do not include any text outside of the JSON array.
"""

        # === STEP 4 & 5: CALL LLM, PARSE, and STORE (No changes here) ===
        try:
            # ... (The entire try/except block for calling the LLM remains exactly the same)
            response = self.openai_service.client.chat.completions.create(
                model=Models.chat,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.5,  # Slightly higher temp for more creativity
                response_format={"type": "json_object"}
            )
            response_content = response.choices[0].message.content
            json_response = json.loads(response_content)

            generated_questions = []
            for key, value in json_response.items():
                if isinstance(value, list):
                    generated_questions = value
                    break
            if not generated_questions:
                raise ValueError("LLM did not return a list of questions in the JSON response.")
        except (json.JSONDecodeError, ValueError, KeyError) as e:
            logger.error(f"Failed to parse MCQ response from LLM: {e}\nRaw Response: {response_content}")
            raise HTTPException(status_code=500, detail="Failed to generate or parse MCQs from AI.")

        mcqs_to_store = []
        for q in generated_questions:
            mcqs_to_store.append({
                "grade": grade,
                "is_arabic": is_arabic,
                "subject": subject,
                "unit": unit,
                "concept": concept,
                "question_text": q["question_text"],
                "correct_answer": q["correct_answer"],
                "wrong_answer_1": q["wrong_answer_1"],
                "wrong_answer_2": q["wrong_answer_2"],
                "wrong_answer_3": q["wrong_answer_3"],
            })

        self.pgvector.insert_mcqs(mcqs_to_store)
        return mcqs_to_store
(no newline at end of file)
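Because the chat call uses response_format={"type": "json_object"}, the model returns an object rather than a bare array, so the parsing loop above takes the first list-valued key it finds. A toy illustration; the wrapper key name "questions" is only an example, since the code accepts any key whose value is a list:

# Toy illustration of the unwrapping step; values are placeholders.
json_response = {
    "questions": [
        {"question_text": "...", "correct_answer": "...",
         "wrong_answer_1": "...", "wrong_answer_2": "...", "wrong_answer_3": "..."}
    ]
}
generated_questions = []
for key, value in json_response.items():
    if isinstance(value, list):
        generated_questions = value  # the list of question dicts
        break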
self_hosted_env/voice_agent/services/pgvector_service.py (view file @ 6c10aac1)

@@ -523,4 +523,60 @@ class PGVectorService:
            print("=" * 50)
        except Exception as e:
            print(f"❌ Database verification failed: {e}")

    def insert_mcqs(self, mcq_list: List[Dict]):
        """
        Inserts a batch of MCQs, now including the language flag.
        """
        if not mcq_list:
            return
        with self.pool_handler.get_connection() as conn:
            with conn.cursor() as cur:
                # --- UPDATED QUERY ---
                insert_query = """
                    INSERT INTO mcq_questions (
                        grade, is_arabic, subject, unit, concept, question_text,
                        correct_answer, wrong_answer_1, wrong_answer_2, wrong_answer_3
                    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
                """
                # --- UPDATED DATA PREPARATION ---
                data_to_insert = [
                    (
                        q['grade'], q['is_arabic'], q['subject'], q['unit'], q['concept'],
                        q['question_text'], q['correct_answer'],
                        q['wrong_answer_1'], q['wrong_answer_2'], q['wrong_answer_3']
                    )
                    for q in mcq_list
                ]
                cur.executemany(insert_query, data_to_insert)
                conn.commit()
        logger.info(f"Successfully inserted {len(mcq_list)} MCQs into the database.")

    def get_mcqs(self, grade: int, subject: str, unit: str, concept: str, is_arabic: bool, limit: Optional[int] = 10) -> List[Dict]:
        """
        Retrieves MCQs for a specific topic and language.
        If limit is None, it retrieves all matching questions.
        """
        with self.pool_handler.get_connection() as conn:
            with conn.cursor(cursor_factory=RealDictCursor) as cur:
                # Dynamically build the query based on the limit
                query = """
                    SELECT id, question_text, correct_answer, wrong_answer_1, wrong_answer_2, wrong_answer_3
                    FROM mcq_questions
                    WHERE grade = %s AND subject = %s AND unit = %s AND concept = %s AND is_arabic = %s
                    ORDER BY created_at DESC
                """
                params = (grade, subject, unit, concept, is_arabic)
                if limit is not None:
                    query += " LIMIT %s;"
                    params += (limit,)
                else:
                    query += ";"
                cur.execute(query, params)
                return cur.fetchall()
(no newline at end of file)
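The limit handling produces one of two final statements: with a numeric limit the query ends in "ORDER BY created_at DESC LIMIT %s;" and the limit is appended to the parameter tuple, with limit=None it simply ends in ";" and every matching row comes back. A call might look like the sketch below; the topic values are illustrative and pgvector_service is assumed to be a PGVectorService instance:

# Illustrative call; each returned row is a RealDictCursor dict with the selected columns.
questions = pgvector_service.get_mcqs(
    grade=4, subject="Science",
    unit="Unit 1", concept="Concept 1.1",
    is_arabic=False, limit=5,
)
# e.g. questions[0]["question_text"], questions[0]["correct_answer"], ...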
self_hosted_env/voice_agent/setup_mcq_table.py (new file, 0 → 100644, view file @ 6c10aac1)

import psycopg2
import os
from dotenv import load_dotenv

load_dotenv()

def setup_mcq_table(drop_existing_table: bool = False):
    """
    Sets up the mcq_questions table in the database.
    """
    try:
        conn = psycopg2.connect(
            host=os.getenv("POSTGRES_HOST", "localhost"),
            port=os.getenv("POSTGRES_PORT", "5432"),
            user=os.getenv("POSTGRES_USER"),
            password=os.getenv("POSTGRES_PASSWORD"),
            dbname=os.getenv("POSTGRES_DB")
        )
        conn.autocommit = True
        with conn.cursor() as cur:
            if drop_existing_table:
                print("Dropping existing mcq_questions table...")
                cur.execute("DROP TABLE IF EXISTS mcq_questions CASCADE;")
                print("Table dropped.")

            print("Creating mcq_questions table...")
            # --- THIS IS THE UPDATED TABLE SCHEMA ---
            cur.execute("""
                CREATE TABLE IF NOT EXISTS mcq_questions (
                    id SERIAL PRIMARY KEY,
                    grade INTEGER NOT NULL,
                    is_arabic BOOLEAN NOT NULL, -- <-- ADDED THIS LINE
                    subject TEXT NOT NULL,
                    unit TEXT NOT NULL,
                    concept TEXT NOT NULL,
                    question_text TEXT NOT NULL,
                    correct_answer TEXT NOT NULL,
                    wrong_answer_1 TEXT NOT NULL,
                    wrong_answer_2 TEXT NOT NULL,
                    wrong_answer_3 TEXT NOT NULL,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                );
            """)

            print("Creating indexes on mcq_questions table...")
            # --- THIS IS THE UPDATED INDEX ---
            cur.execute("""
                CREATE INDEX IF NOT EXISTS idx_mcq_topic
                ON mcq_questions(grade, is_arabic, subject, unit, concept); -- <-- ADDED is_arabic
            """)

            print("MCQ table setup complete.")
    except Exception as e:
        print(f"An error occurred during MCQ table setup: {e}")
    finally:
        if 'conn' in locals() and conn:
            conn.close()
            print("Database connection closed.")

if __name__ == "__main__":
    # To apply the changes, it's best to drop and recreate the table.
    # Be careful if you have existing data you want to keep!
    print("Creating MCQ table...")
    setup_mcq_table(drop_existing_table=False)
(no newline at end of file)
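As the closing comment notes, a schema change such as the new is_arabic column only reaches an already-existing table if it is dropped and recreated; the CREATE TABLE IF NOT EXISTS above will not alter it. When losing the stored questions is acceptable, the drop flag can be passed explicitly:

# Destructive: drops mcq_questions (and any stored questions) before recreating it with the new schema.
setup_mcq_table(drop_existing_table=True)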
self_hosted_env/voice_agent/start.sh (view file @ 6c10aac1)

The two new lines run the MCQ table setup during container startup.

@@ -10,6 +10,8 @@ python insert_csv_embeddings.py
echo "Database setup complete."
python curriculum_structure.py
echo "Curriculum structure setup complete."
python setup_mcq_table.py
echo "MCQ table setup complete."
sleep 5
# Start the web server and keep it as the main process
...
test_cases/msq_test.py (new file, 0 → 100644, view file @ 6c10aac1)

"""
======================================================================
MCQ API Cookbook & Test Script
======================================================================

Purpose:
This script serves as both a live integration test and a practical guide ("cookbook")
for using the Multiple-Choice Question (MCQ) generation and retrieval API endpoints.

It demonstrates how to:
1. Generate and store new MCQs for a specific curriculum topic.
2. Retrieve existing MCQs from the database for that same topic.

----------------------------------------------------------------------
API Endpoints Guide
----------------------------------------------------------------------
There are two main endpoints for the MCQ feature:

1. Generate Questions (POST /mcq/generate)
------------------------------------------
This is the "creator" endpoint. It uses an AI model to generate a new set of questions
based on the curriculum content stored in the vector database. It then saves these
new questions to the `mcq_questions` table for future use.

- Method: POST
- URL: [BASE_URL]/mcq/generate
- Data Format: Must be sent as `application/x-www-form-urlencoded` (form data).

Parameters (Form Data):
- grade (int, required): The grade level of the curriculum (e.g., 4).
- subject (str, required): The subject of the curriculum (e.g., "Science").
- unit (str, required): The exact name of the unit.
- concept (str, required): The exact name of the concept.
- is_arabic (bool, required): Set to `true` for Arabic curriculum, `false` for English.
- count (int, optional, default=5): The number of new questions to generate.

Example Usage (using cURL):
curl -X POST [BASE_URL]/mcq/generate \
  -F "grade=4" \
  -F "subject=Science" \
  -F "unit=الوحدة الأولى: الأنظمة الحية" \
  -F "concept=المفهوم الأول: التكيف والبقاء" \
  -F "is_arabic=true" \
  -F "count=3"

2. Retrieve Questions (GET /mcq)
---------------------------------
This is the "reader" endpoint. It quickly and cheaply retrieves questions that have
already been generated and stored in the database. It does NOT call the AI model.

- Method: GET
- URL: [BASE_URL]/mcq

Parameters (URL Query Parameters):
- grade (int, required): The grade level.
- subject (str, required): The subject.
- unit (str, required): The unit name.
- concept (str, required): The concept name.
- is_arabic (bool, required): `true` for Arabic, `false` for English.
- limit (int, optional, default=None): The maximum number of questions to retrieve.
  If omitted, it will retrieve ALL questions for that topic.

Example Usage (using cURL):
# Get the 5 most recent questions for a topic
curl "[BASE_URL]/mcq?grade=4&subject=Science&unit=...&concept=...&is_arabic=true&limit=5"
# Get ALL questions for a topic
curl "[BASE_URL]/mcq?grade=4&subject=Science&unit=...&concept=...&is_arabic=true"

----------------------------------------------------------------------
How to Run This Script
----------------------------------------------------------------------
1. Ensure your FastAPI server is running.
2. Make sure the BASE_URL variable below is set to your server's address.
3. Run the script from your terminal: python3 msq_test.py
"""
import requests
import json
import time
from typing import Optional

# The base URL of your API server.
BASE_URL = "https://voice-agent.caprover.al-arcade.com"


def test_mcq_generation(grade: int, subject: str, unit: str, concept: str, is_arabic: bool, count: int):
    """
    Tests the POST /mcq/generate endpoint.
    """
    endpoint = f"{BASE_URL}/mcq/generate"
    payload = {
        "grade": grade,
        "subject": subject,
        "unit": unit,
        "concept": concept,
        "is_arabic": is_arabic,
        "count": count,
    }
    print(f">> Attempting to GENERATE {count} new questions for:")
    print(f"   Topic: Grade {grade} {subject} -> {unit} -> {concept}")
    print(f"   Language: {'Arabic' if is_arabic else 'English'}")
    try:
        response = requests.post(endpoint, data=payload, timeout=120)
        if response.status_code == 200:
            print(f"SUCCESS: API returned status code {response.status_code}")
            data = response.json()
            print(f"   Message: {data.get('message')}")
            if 'questions' in data and data['questions']:
                print("\n--- Details of Generated Questions ---")
                for i, q in enumerate(data['questions'], 1):
                    print(f"   {i}. Question: {q['question_text']}")
                    print(f"      Correct: {q['correct_answer']}")
                    print(f"      Wrong 1: {q['wrong_answer_1']}")
                    print(f"      Wrong 2: {q['wrong_answer_2']}")
                    print(f"      Wrong 3: {q['wrong_answer_3']}\n")
            return True
        else:
            print(f"FAILED: API returned status code {response.status_code}")
            try:
                error_data = response.json()
                print(f"   Error Detail: {error_data.get('detail', 'No detail provided.')}")
            except json.JSONDecodeError:
                print(f"   Response was not valid JSON: {response.text}")
            return False
    except requests.exceptions.RequestException as e:
        print(f"FAILED: An error occurred while making the request: {e}")
        return False


def test_mcq_retrieval(grade: int, subject: str, unit: str, concept: str, is_arabic: bool, limit: Optional[int]):
    """
    Tests the GET /mcq endpoint with detailed output.
    """
    endpoint = f"{BASE_URL}/mcq"
    params = {
        "grade": grade,
        "subject": subject,
        "unit": unit,
        "concept": concept,
        "is_arabic": is_arabic,
    }
    if limit is not None:
        params["limit"] = limit
    limit_str = f"up to {limit}" if limit is not None else "ALL"
    print(f">> Attempting to RETRIEVE {limit_str} stored questions for the same topic...")
    try:
        response = requests.get(endpoint, params=params, timeout=30)
        if response.status_code == 200:
            print(f"SUCCESS: API returned status code {response.status_code}")
            data = response.json()
            print(f"   Found {data.get('count')} stored questions in the database.")
            if 'questions' in data and data['questions']:
                print("\n--- Details of Retrieved Questions ---")
                for i, q in enumerate(data['questions'], 1):
                    print(f"   {i}. Question: {q['question_text']}")
                    print(f"      Correct: {q['correct_answer']}")
                    print(f"      Wrong 1: {q['wrong_answer_1']}")
                    print(f"      Wrong 2: {q['wrong_answer_2']}")
                    print(f"      Wrong 3: {q['wrong_answer_3']}\n")
            elif data.get('count') == 0:
                print("   (This is expected if this is the first time generating questions for this topic)")
            return True
        else:
            print(f"FAILED: API returned status code {response.status_code}")
            try:
                error_data = response.json()
                print(f"   Error Detail: {error_data.get('detail', 'No detail provided.')}")
            except json.JSONDecodeError:
                print(f"   Response was not valid JSON: {response.text}")
            return False
    except requests.exceptions.RequestException as e:
        print(f"FAILED: An error occurred while making the request: {e}")
        return False


if __name__ == "__main__":
    print("\n" + "=" * 50)
    print("STARTING TEST 1: ARABIC MCQ GENERATION & RETRIEVAL")
    print("=" * 50)
    # IMPORTANT: Use actual Unit/Concept names from your database for the best results.
    arabic_test_data = {
        "grade": 4,
        "subject": "Science",
        "unit": "الوحدة الأولى: الأنظمة الحية",
        "concept": "المفهوم الأول: التكيف والبقاء",
        "is_arabic": True,
        "count": 3
    }
    generation_successful = test_mcq_generation(**arabic_test_data)
    if generation_successful:
        print("-" * 25)
        time.sleep(2)
        test_mcq_retrieval(limit=None, **{k: v for k, v in arabic_test_data.items() if k != 'count'})

    print("\n" + "=" * 50)
    print("STARTING TEST 2: ENGLISH MCQ GENERATION & RETRIEVAL")
    print("=" * 50)
    english_test_data = {
        "grade": 5,
        "subject": "Science",
        "unit": "Unit 1: Matter and Energy in Ecosystems",
        "concept": "Concept 1.1: Properties of Matter",
        "is_arabic": False,
        "count": 2
    }
    generation_successful = test_mcq_generation(**english_test_data)
    if generation_successful:
        print("-" * 25)
        time.sleep(2)
        test_mcq_retrieval(limit=None, **{k: v for k, v in english_test_data.items() if k != 'count'})

    print("\n" + "=" * 50)
    print("All tests complete.")
    print("=" * 50)
(no newline at end of file)