Commit 7ecadde7 authored by arwa mohamed

Added extracted PDF data for Arabic and English curricula

parent ade545c7
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"gpuType": "T4"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"source": [
"\n",
"\n",
">prime4\n",
"\n"
],
"metadata": {
"id": "gJWbo3uD8u_4"
}
},
{
"cell_type": "code",
"source": [
"!apt-get install tesseract-ocr\n",
"!pip install pdfplumber pdf2image pytesseract\n",
"!apt-get install -y poppler-utils\n",
"!sudo apt install tesseract-ocr -y\n",
"!sudo apt install tesseract-ocr-ara -y\n",
"!sudo apt install libtesseract-dev -y\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "mXFZpTwLOjTZ",
"outputId": "a07e982d-7589-473a-c07e-baef61a227ed"
},
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Reading package lists... Done\n",
"Building dependency tree... Done\n",
"Reading state information... Done\n",
"tesseract-ocr is already the newest version (4.1.1-2.1build1).\n",
"0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.\n",
"Collecting pdfplumber\n",
" Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m42.8/42.8 kB\u001b[0m \u001b[31m1.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting pdf2image\n",
" Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)\n",
"Collecting pytesseract\n",
" Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)\n",
"Collecting pdfminer.six==20250506 (from pdfplumber)\n",
" Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)\n",
"Requirement already satisfied: Pillow>=9.1 in /usr/local/lib/python3.12/dist-packages (from pdfplumber) (11.3.0)\n",
"Collecting pypdfium2>=4.18.0 (from pdfplumber)\n",
" Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m48.5/48.5 kB\u001b[0m \u001b[31m2.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: charset-normalizer>=2.0.0 in /usr/local/lib/python3.12/dist-packages (from pdfminer.six==20250506->pdfplumber) (3.4.3)\n",
"Requirement already satisfied: cryptography>=36.0.0 in /usr/local/lib/python3.12/dist-packages (from pdfminer.six==20250506->pdfplumber) (43.0.3)\n",
"Requirement already satisfied: packaging>=21.3 in /usr/local/lib/python3.12/dist-packages (from pytesseract) (25.0)\n",
"Requirement already satisfied: cffi>=1.12 in /usr/local/lib/python3.12/dist-packages (from cryptography>=36.0.0->pdfminer.six==20250506->pdfplumber) (1.17.1)\n",
"Requirement already satisfied: pycparser in /usr/local/lib/python3.12/dist-packages (from cffi>=1.12->cryptography>=36.0.0->pdfminer.six==20250506->pdfplumber) (2.22)\n",
"Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.0/60.0 kB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m37.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading pdf2image-1.17.0-py3-none-any.whl (11 kB)\n",
"Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)\n",
"Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.8 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.8/2.8 MB\u001b[0m \u001b[31m77.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hInstalling collected packages: pytesseract, pypdfium2, pdf2image, pdfminer.six, pdfplumber\n",
"Successfully installed pdf2image-1.17.0 pdfminer.six-20250506 pdfplumber-0.11.7 pypdfium2-4.30.0 pytesseract-0.3.13\n",
"Reading package lists... Done\n",
"Building dependency tree... Done\n",
"Reading state information... Done\n",
"The following NEW packages will be installed:\n",
" poppler-utils\n",
"0 upgraded, 1 newly installed, 0 to remove and 35 not upgraded.\n",
"Need to get 186 kB of archives.\n",
"After this operation, 697 kB of additional disk space will be used.\n",
"Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.10 [186 kB]\n",
"Fetched 186 kB in 1s (127 kB/s)\n",
"Selecting previously unselected package poppler-utils.\n",
"(Reading database ... 126374 files and directories currently installed.)\n",
"Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.10_amd64.deb ...\n",
"Unpacking poppler-utils (22.02.0-2ubuntu0.10) ...\n",
"Setting up poppler-utils (22.02.0-2ubuntu0.10) ...\n",
"Processing triggers for man-db (2.10.2-1) ...\n",
"Reading package lists... Done\n",
"Building dependency tree... Done\n",
"Reading state information... Done\n",
"tesseract-ocr is already the newest version (4.1.1-2.1build1).\n",
"0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.\n",
"Reading package lists... Done\n",
"Building dependency tree... Done\n",
"Reading state information... Done\n",
"The following NEW packages will be installed:\n",
" tesseract-ocr-ara\n",
"0 upgraded, 1 newly installed, 0 to remove and 35 not upgraded.\n",
"Need to get 645 kB of archives.\n",
"After this operation, 1,447 kB of additional disk space will be used.\n",
"Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-ara all 1:4.00~git30-7274cfa-1.1 [645 kB]\n",
"Fetched 645 kB in 2s (410 kB/s)\n",
"debconf: unable to initialize frontend: Dialog\n",
"debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 1.)\n",
"debconf: falling back to frontend: Readline\n",
"debconf: unable to initialize frontend: Readline\n",
"debconf: (This frontend requires a controlling tty.)\n",
"debconf: falling back to frontend: Teletype\n",
"dpkg-preconfigure: unable to re-open stdin: \n",
"Selecting previously unselected package tesseract-ocr-ara.\n",
"(Reading database ... 126404 files and directories currently installed.)\n",
"Preparing to unpack .../tesseract-ocr-ara_1%3a4.00~git30-7274cfa-1.1_all.deb ...\n",
"Unpacking tesseract-ocr-ara (1:4.00~git30-7274cfa-1.1) ...\n",
"Setting up tesseract-ocr-ara (1:4.00~git30-7274cfa-1.1) ...\n",
"Reading package lists... Done\n",
"Building dependency tree... Done\n",
"Reading state information... Done\n",
"The following additional packages will be installed:\n",
" libarchive-dev libleptonica-dev\n",
"The following NEW packages will be installed:\n",
" libarchive-dev libleptonica-dev libtesseract-dev\n",
"0 upgraded, 3 newly installed, 0 to remove and 35 not upgraded.\n",
"Need to get 3,743 kB of archives.\n",
"After this operation, 16.0 MB of additional disk space will be used.\n",
"Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 libarchive-dev amd64 3.6.0-1ubuntu1.5 [581 kB]\n",
"Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libleptonica-dev amd64 1.82.0-3build1 [1,562 kB]\n",
"Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libtesseract-dev amd64 4.1.1-2.1build1 [1,600 kB]\n",
"Fetched 3,743 kB in 2s (1,635 kB/s)\n",
"debconf: unable to initialize frontend: Dialog\n",
"debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 3.)\n",
"debconf: falling back to frontend: Readline\n",
"debconf: unable to initialize frontend: Readline\n",
"debconf: (This frontend requires a controlling tty.)\n",
"debconf: falling back to frontend: Teletype\n",
"dpkg-preconfigure: unable to re-open stdin: \n",
"Selecting previously unselected package libarchive-dev:amd64.\n",
"(Reading database ... 126408 files and directories currently installed.)\n",
"Preparing to unpack .../libarchive-dev_3.6.0-1ubuntu1.5_amd64.deb ...\n",
"Unpacking libarchive-dev:amd64 (3.6.0-1ubuntu1.5) ...\n",
"Selecting previously unselected package libleptonica-dev.\n",
"Preparing to unpack .../libleptonica-dev_1.82.0-3build1_amd64.deb ...\n",
"Unpacking libleptonica-dev (1.82.0-3build1) ...\n",
"Selecting previously unselected package libtesseract-dev:amd64.\n",
"Preparing to unpack .../libtesseract-dev_4.1.1-2.1build1_amd64.deb ...\n",
"Unpacking libtesseract-dev:amd64 (4.1.1-2.1build1) ...\n",
"Setting up libleptonica-dev (1.82.0-3build1) ...\n",
"Setting up libarchive-dev:amd64 (3.6.0-1ubuntu1.5) ...\n",
"Setting up libtesseract-dev:amd64 (4.1.1-2.1build1) ...\n",
"Processing triggers for man-db (2.10.2-1) ...\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import os\n",
"import cv2\n",
"import pytesseract\n",
"import pandas as pd\n",
"from pdf2image import convert_from_path\n",
"from tqdm import tqdm\n",
"import re\n"
],
"metadata": {
"id": "OlXc31-OPKrF"
},
"execution_count": 2,
"outputs": []
},
{
"cell_type": "code",
"source": [
"pdf_path = \"/content/Science_AR_prim4_TR1.pdf\"\n",
"output_csv = \"output_text_prime4_ar.csv\"\n",
"output_stats = \"output_stats_prime4_ar.csv\"\n",
"output_txt = \"output_text_prime4_ar.txt\"\n",
"images_dir = \"pdf_images_prime4_ar\"\n",
"batch_size = 50\n",
"low_word_threshold = 10\n",
"\n",
"os.makedirs(images_dir, exist_ok=True)"
],
"metadata": {
"id": "TO9y9WyOPN5l"
},
"execution_count": 3,
"outputs": []
},
{
"cell_type": "code",
"source": [
"print(\"📄 Converting PDF to images...\")\n",
"pages = convert_from_path(pdf_path, dpi=300)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "FERBX8NtPUbM",
"outputId": "a1798c91-64d3-407e-9b6c-6d48de46b050"
},
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"📄 Converting PDF to images...\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"image_paths = []\n",
"for i, page in enumerate(pages, start=1):\n",
" img_path = os.path.join(images_dir, f\"page_{i}.png\")\n",
" page.save(img_path, \"PNG\")\n",
" image_paths.append(img_path)\n"
],
"metadata": {
"id": "y0UD8m4oPXDz"
},
"execution_count": 5,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def clean_text(text):\n",
" text = str(text)\n",
"\n",
" text = re.sub(r\"[^ \\u0600-\\u06FFa-zA-Z0-9\\.\\,\\?\\!\\:\\;\\-\\(\\)\\/%]\", \" \", text)\n",
"\n",
" text = re.sub(r\"[\\.]{2,}\", \".\", text)\n",
" text = re.sub(r\"[\\-]{2,}\", \"-\", text)\n",
" text = re.sub(r\"[_]{2,}\", \" \", text)\n",
"\n",
" text = re.sub(r\"\\s+\", \" \", text)\n",
"\n",
" words = text.strip().split()\n",
" if len(words) <= 2:\n",
" return \"\"\n",
"\n",
" return text.strip()\n"
],
"metadata": {
"id": "o7u4lhRN1OjG"
},
"execution_count": 6,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def deep_clean_text(text):\n",
"\n",
" text = re.sub(r\"\\b\\d{4,}\\b\", \" \", text)\n",
"\n",
"\n",
" text = re.sub(r\"(.)\\1{2,}\", r\"\\1\\1\", text)\n",
"\n",
" lines = text.split()\n",
" lines = [line for line in lines if not re.fullmatch(r\"[\\d\\W]+\", line)]\n",
"\n",
" return \" \".join(lines).strip()\n"
],
"metadata": {
"id": "ZNg2aeZ032qR"
},
"execution_count": 7,
"outputs": []
},
{
"cell_type": "code",
"source": [
"print(\"🔎 Running OCR on pages... (Batch mode)\")\n",
"results = []\n",
"\n",
"for batch_start in range(0, len(image_paths), batch_size):\n",
" batch_paths = image_paths[batch_start:batch_start+batch_size]\n",
" for i, img_path in enumerate(tqdm(batch_paths, desc=f\"Batch {batch_start//batch_size+1}\")):\n",
" page_number = batch_start + i + 1\n",
"\n",
" img = cv2.imread(img_path)\n",
"\n",
" gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)\n",
"\n",
" _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)\n",
"\n",
" text = pytesseract.image_to_string(thresh, lang=\"ara\")\n",
"\n",
" cleaned_text = clean_text(text)\n",
" cleaned_text = deep_clean_text(cleaned_text)\n",
"\n",
"\n",
" if cleaned_text:\n",
" results.append({\"page\": page_number, \"text\": cleaned_text})"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "_51dWZ7c1Pyv",
"outputId": "221396c0-6b91-43bb-8051-4941f78d6416"
},
"execution_count": 8,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"🔎 Running OCR on pages... (Batch mode)\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"Batch 1: 100%|██████████| 50/50 [04:00<00:00, 4.82s/it]\n",
"Batch 2: 100%|██████████| 50/50 [03:32<00:00, 4.24s/it]\n",
"Batch 3: 100%|██████████| 28/28 [02:02<00:00, 4.36s/it]\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"\n",
"print(\"💾 Saving results...\")\n",
"\n",
"df = pd.DataFrame(results)\n",
"df.to_csv(output_csv, index=False, encoding=\"utf-8-sig\")\n",
"\n",
"with open(output_txt, \"w\", encoding=\"utf-8\") as f:\n",
" for _, row in df.iterrows():\n",
" f.write(f\"\\n--- الصفحة {row['page']} ---\\n{row['text']}\\n\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "jD3jAZIG1far",
"outputId": "ab2c84ae-a077-4e9d-95be-5077e4552df8"
},
"execution_count": 9,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"💾 Saving results...\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"df[\"word_count\"] = df[\"text\"].apply(lambda x: len(x.split()))\n",
"stats = {\n",
" \"total_pages\": len(df),\n",
" \"total_words\": df[\"word_count\"].sum(),\n",
" \"avg_words_per_page\": round(df[\"word_count\"].mean(), 2)\n",
"}\n",
"stats_df = pd.DataFrame(list(stats.items()), columns=[\"metric\", \"value\"])\n",
"stats_df.to_csv(output_stats, index=False, encoding=\"utf-8-sig\")\n",
"\n",
"print(\"✅ انتهى OCR للعربي!\")\n",
"print(f\"📂 ملف CSV: {output_csv}\")\n",
"print(f\"📂 ملف TXT: {output_txt}\")\n",
"print(f\"📊 ملف الإحصائيات: {output_stats}\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Uq0XYAw11hZY",
"outputId": "67d8ff69-e9ff-4d93-ccf9-ba122f8ec7de"
},
"execution_count": 10,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"✅ انتهى OCR للعربي!\n",
"📂 ملف CSV: output_text_prime4_ar.csv\n",
"📂 ملف TXT: output_text_prime4_ar.txt\n",
"📊 ملف الإحصائيات: output_stats_prime4_ar.csv\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import pandas as pd\n",
"import re\n",
"\n",
"def super_clean_text(text):\n",
" text = str(text)\n",
"\n",
" text = re.sub(r\"[^ \\u0600-\\u06FFa-zA-Z0-9\\.\\,\\?\\!\\:\\;\\-\\(\\)\\/%]\", \" \", text)\n",
"\n",
" text = re.sub(r\"\\b\\d{3,}\\b\", \" \", text)\n",
"\n",
" text = re.sub(r\"[\\.]{2,}\", \".\", text)\n",
" text = re.sub(r\"[\\-]{2,}\", \"-\", text)\n",
" text = re.sub(r\"[_]{2,}\", \" \", text)\n",
" text = re.sub(r\"(.)\\1{2,}\", r\"\\1\\1\", text)\n",
"\n",
" words = text.split()\n",
" words = [w for w in words if not re.fullmatch(r\"[\\d\\W]+\", w)]\n",
"\n",
" text = \" \".join(words)\n",
" text = re.sub(r\"\\s+\", \" \", text).strip()\n",
"\n",
" return text\n",
"\n",
"df = pd.read_csv(\"/content/output_text_prime4_ar.csv\")\n",
"\n",
"df_cleaned = pd.DataFrame()\n",
"df_cleaned[\"text\"] = df[\"text\"].apply(super_clean_text)\n",
"\n",
"df_cleaned.to_csv(\"output_text_ar_cleaned_prime4.csv\", index=False, encoding=\"utf-8-sig\")\n",
"\n",
"print(\"✅ النصوص اتنضفت وحُفظت في output_text_ar_cleaned.csv (عمود واحد بس)\")\n"
],
"metadata": {
"id": "tNcXdjtL6PWe",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "f29c6c84-710a-47d2-99b4-9ff310b5f302"
},
"execution_count": 11,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"✅ النصوص اتنضفت وحُفظت في output_text_ar_cleaned.csv (عمود واحد بس)\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import pandas as pd\n",
"\n",
"structure = {\n",
" \"الوحدة الأولى: الأنظمة الحية\": {\n",
" \"مقدمة\": (9, 12),\n",
" \"المفاهيم\": {\n",
" \"المفهوم 1.1: التكيف والبقاء\": {\n",
" \"الدرس الأول\": (13, 16),\n",
" \"الدرس الثاني\": (17, 22),\n",
" \"الدرس الثالث\": (23, 29),\n",
" \"الدرس الرابع\": (30, 31),\n",
" \"الدرس الخامس\": (32, 34),\n",
" },\n",
" \"المفهوم 2.1: كيف تعمل الحواس؟\": {\n",
" \"الدرس الأول\": (35, 38),\n",
" \"الدرس الثاني\": (39, 41),\n",
" \"الدرس الثالث\": (42, 44),\n",
" \"الدرس الرابع\": (45, 48),\n",
" },\n",
" \"المفهوم 3.1: الضوء وحاسة البصر\": {\n",
" \"الدرس الأول\": (49, 54),\n",
" \"الدرس الثاني\": (55, 58),\n",
" \"الدرس الثالث\": (59, 60),\n",
" \"الدرس الرابع\": (61, 62),\n",
" }\n",
" },\n",
" \"مشروع الوحدة\": (63, 64),\n",
" \"المشروع بيني التخصصات\": (65, 72),\n",
" \"قيم تعلمك\": (73, 74)\n",
" },\n",
" \"الوحدة الثانية: الحركة والطاقة\": {\n",
" \"مقدمة\": (75, 78),\n",
" \"المفاهيم\": {\n",
" \"المفهوم 1.2: الحركة والتوقف\": {\n",
" \"الدرس الأول\": (79, 82),\n",
" \"الدرس الثاني\": (83, 85),\n",
" \"الدرس الثالث\": (86, 89),\n",
" \"الدرس الرابع\": (90, 92),\n",
" },\n",
" \"المفهوم 2.2: الطاقة والحركة\": {\n",
" \"الدرس الأول\": (93, 96),\n",
" \"الدرس الثاني\": (97, 98),\n",
" \"الدرس الثالث\": (99, 102),\n",
" \"الدرس الرابع\": (103, 104),\n",
" },\n",
" \"المفهوم 3.2: الطاقة والتصادم\": {\n",
" \"الدرس الأول\": (105, 109),\n",
" \"الدرس الثاني\": (110, 113),\n",
" \"الدرس الثالث\": (114, 115),\n",
" \"الدرس الرابع\": (116, 120),\n",
" }\n",
"\n",
" },\n",
" \"مشروع الوحدة\": (121, 122),\n",
" \"قيم تعلمك\": (123, 124),\n",
" \"السلامة في فصول العلوم\": (125, 126)\n",
" }\n",
"}\n",
"\n",
"df = pd.read_csv(\"/content/output_text_ar_cleaned_prime4.csv\")\n",
"df_raw = pd.read_csv(\"/content/output_text_prime4_ar.csv\")\n",
"df[\"page\"] = df_raw[\"page\"]\n",
"\n",
"\n",
"rows = []\n",
"\n",
"\n",
"df = pd.read_csv(\"/content/output_text_ar_cleaned_prime4.csv\")\n",
"df_raw = pd.read_csv(\"/content/output_text_prime4_ar.csv\")\n",
"df[\"page\"] = df_raw[\"page\"]\n",
"\n",
"rows = []\n",
"\n",
"for unit, udata in structure.items():\n",
" if \"مقدمة\" in udata:\n",
" start, end = udata[\"مقدمة\"]\n",
" text = \" \".join(df[(df[\"page\"] >= start) & (df[\"page\"] <= end)][\"text\"])\n",
" rows.append({\n",
" \"الوحدة\": unit,\n",
" \"المفهوم\": \"مقدمة\",\n",
" \"الدرس\": \"مقدمة\",\n",
" \"من صفحة\": start,\n",
" \"إلى صفحة\": end,\n",
" \"النص\": text\n",
" })\n",
"\n",
" if \"المفاهيم\" in udata:\n",
" for concept, lessons in udata[\"المفاهيم\"].items():\n",
" for lesson, (ls, le) in lessons.items():\n",
" lesson_text = \" \".join(df[(df[\"page\"] >= ls) & (df[\"page\"] <= le)][\"text\"])\n",
" rows.append({\n",
" \"الوحدة\": unit,\n",
" \"المفهوم\": concept,\n",
" \"الدرس\": lesson,\n",
" \"من صفحة\": ls,\n",
" \"إلى صفحة\": le,\n",
" \"النص\": lesson_text\n",
" })\n",
"\n",
" for section in [\"مشروع الوحدة\", \"المشروع بيني التخصصات\", \"قيم تعلمك\", \"السلامة في فصول العلوم\"]:\n",
" if section in udata:\n",
" start, end = udata[section]\n",
" text = \" \".join(df[(df[\"page\"] >= start) & (df[\"page\"] <= end)][\"text\"])\n",
" rows.append({\n",
" \"الوحدة\": unit,\n",
" \"المفهوم\": section,\n",
" \"الدرس\": section,\n",
" \"من صفحة\": start,\n",
" \"إلى صفحة\": end,\n",
" \"النص\": text\n",
" })\n",
"\n",
"\n",
"df_out = pd.DataFrame(rows)\n",
"df_out.to_csv(\"output_units_lessons_prime4.csv\", index=False, encoding=\"utf-8-sig\")\n",
"\n",
"print(\"✅ تم تقسيم النصوص للوحدات والدروس وحفظها في output_units_lessons_prime4.csv\")\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "7tm5OFaHFL_c",
"outputId": "f788fe87-079d-47b0-f262-f0a94d4fc381"
},
"execution_count": 12,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"✅ تم تقسيم النصوص للوحدات والدروس وحفظها في output_units_lessons_prime4.csv\n"
]
}
]
}
]
}
\ No newline at end of file
output_stats_prime4_ar.csv:
metric,value
total_pages,126.0
total_words,17123.0
avg_words_per_page,135.9
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"source": [
"prime6"
],
"metadata": {
"id": "ksTOxAhBBfel"
}
},
{
"cell_type": "code",
"source": [
"!apt-get install tesseract-ocr\n",
"!pip install pdfplumber pdf2image pytesseract\n",
"!apt-get install -y poppler-utils\n",
"!sudo apt install tesseract-ocr -y\n",
"!sudo apt install tesseract-ocr-ara -y\n",
"!sudo apt install libtesseract-dev -y\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "mXFZpTwLOjTZ",
"outputId": "c5f27dcf-36ab-4d1b-fde5-18a645d0c70e"
},
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Reading package lists... Done\n",
"Building dependency tree... Done\n",
"Reading state information... Done\n",
"tesseract-ocr is already the newest version (4.1.1-2.1build1).\n",
"0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.\n",
"Collecting pdfplumber\n",
" Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m42.8/42.8 kB\u001b[0m \u001b[31m1.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting pdf2image\n",
" Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)\n",
"Collecting pytesseract\n",
" Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)\n",
"Collecting pdfminer.six==20250506 (from pdfplumber)\n",
" Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)\n",
"Requirement already satisfied: Pillow>=9.1 in /usr/local/lib/python3.12/dist-packages (from pdfplumber) (11.3.0)\n",
"Collecting pypdfium2>=4.18.0 (from pdfplumber)\n",
" Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m48.5/48.5 kB\u001b[0m \u001b[31m2.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: charset-normalizer>=2.0.0 in /usr/local/lib/python3.12/dist-packages (from pdfminer.six==20250506->pdfplumber) (3.4.3)\n",
"Requirement already satisfied: cryptography>=36.0.0 in /usr/local/lib/python3.12/dist-packages (from pdfminer.six==20250506->pdfplumber) (43.0.3)\n",
"Requirement already satisfied: packaging>=21.3 in /usr/local/lib/python3.12/dist-packages (from pytesseract) (25.0)\n",
"Requirement already satisfied: cffi>=1.12 in /usr/local/lib/python3.12/dist-packages (from cryptography>=36.0.0->pdfminer.six==20250506->pdfplumber) (1.17.1)\n",
"Requirement already satisfied: pycparser in /usr/local/lib/python3.12/dist-packages (from cffi>=1.12->cryptography>=36.0.0->pdfminer.six==20250506->pdfplumber) (2.22)\n",
"Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.0/60.0 kB\u001b[0m \u001b[31m3.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m38.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading pdf2image-1.17.0-py3-none-any.whl (11 kB)\n",
"Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)\n",
"Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.8 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.8/2.8 MB\u001b[0m \u001b[31m77.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hInstalling collected packages: pytesseract, pypdfium2, pdf2image, pdfminer.six, pdfplumber\n",
"Successfully installed pdf2image-1.17.0 pdfminer.six-20250506 pdfplumber-0.11.7 pypdfium2-4.30.0 pytesseract-0.3.13\n",
"Reading package lists... Done\n",
"Building dependency tree... Done\n",
"Reading state information... Done\n",
"The following NEW packages will be installed:\n",
" poppler-utils\n",
"0 upgraded, 1 newly installed, 0 to remove and 35 not upgraded.\n",
"Need to get 186 kB of archives.\n",
"After this operation, 697 kB of additional disk space will be used.\n",
"Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.10 [186 kB]\n",
"Fetched 186 kB in 1s (137 kB/s)\n",
"Selecting previously unselected package poppler-utils.\n",
"(Reading database ... 126374 files and directories currently installed.)\n",
"Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.10_amd64.deb ...\n",
"Unpacking poppler-utils (22.02.0-2ubuntu0.10) ...\n",
"Setting up poppler-utils (22.02.0-2ubuntu0.10) ...\n",
"Processing triggers for man-db (2.10.2-1) ...\n",
"Reading package lists... Done\n",
"Building dependency tree... Done\n",
"Reading state information... Done\n",
"tesseract-ocr is already the newest version (4.1.1-2.1build1).\n",
"0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.\n",
"Reading package lists... Done\n",
"Building dependency tree... Done\n",
"Reading state information... Done\n",
"The following NEW packages will be installed:\n",
" tesseract-ocr-ara\n",
"0 upgraded, 1 newly installed, 0 to remove and 35 not upgraded.\n",
"Need to get 645 kB of archives.\n",
"After this operation, 1,447 kB of additional disk space will be used.\n",
"Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-ara all 1:4.00~git30-7274cfa-1.1 [645 kB]\n",
"Fetched 645 kB in 2s (324 kB/s)\n",
"debconf: unable to initialize frontend: Dialog\n",
"debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 1.)\n",
"debconf: falling back to frontend: Readline\n",
"debconf: unable to initialize frontend: Readline\n",
"debconf: (This frontend requires a controlling tty.)\n",
"debconf: falling back to frontend: Teletype\n",
"dpkg-preconfigure: unable to re-open stdin: \n",
"Selecting previously unselected package tesseract-ocr-ara.\n",
"(Reading database ... 126404 files and directories currently installed.)\n",
"Preparing to unpack .../tesseract-ocr-ara_1%3a4.00~git30-7274cfa-1.1_all.deb ...\n",
"Unpacking tesseract-ocr-ara (1:4.00~git30-7274cfa-1.1) ...\n",
"Setting up tesseract-ocr-ara (1:4.00~git30-7274cfa-1.1) ...\n",
"Reading package lists... Done\n",
"Building dependency tree... Done\n",
"Reading state information... Done\n",
"The following additional packages will be installed:\n",
" libarchive-dev libleptonica-dev\n",
"The following NEW packages will be installed:\n",
" libarchive-dev libleptonica-dev libtesseract-dev\n",
"0 upgraded, 3 newly installed, 0 to remove and 35 not upgraded.\n",
"Need to get 3,743 kB of archives.\n",
"After this operation, 16.0 MB of additional disk space will be used.\n",
"Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 libarchive-dev amd64 3.6.0-1ubuntu1.5 [581 kB]\n",
"Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libleptonica-dev amd64 1.82.0-3build1 [1,562 kB]\n",
"Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libtesseract-dev amd64 4.1.1-2.1build1 [1,600 kB]\n",
"Fetched 3,743 kB in 3s (1,166 kB/s)\n",
"debconf: unable to initialize frontend: Dialog\n",
"debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 3.)\n",
"debconf: falling back to frontend: Readline\n",
"debconf: unable to initialize frontend: Readline\n",
"debconf: (This frontend requires a controlling tty.)\n",
"debconf: falling back to frontend: Teletype\n",
"dpkg-preconfigure: unable to re-open stdin: \n",
"Selecting previously unselected package libarchive-dev:amd64.\n",
"(Reading database ... 126408 files and directories currently installed.)\n",
"Preparing to unpack .../libarchive-dev_3.6.0-1ubuntu1.5_amd64.deb ...\n",
"Unpacking libarchive-dev:amd64 (3.6.0-1ubuntu1.5) ...\n",
"Selecting previously unselected package libleptonica-dev.\n",
"Preparing to unpack .../libleptonica-dev_1.82.0-3build1_amd64.deb ...\n",
"Unpacking libleptonica-dev (1.82.0-3build1) ...\n",
"Selecting previously unselected package libtesseract-dev:amd64.\n",
"Preparing to unpack .../libtesseract-dev_4.1.1-2.1build1_amd64.deb ...\n",
"Unpacking libtesseract-dev:amd64 (4.1.1-2.1build1) ...\n",
"Setting up libleptonica-dev (1.82.0-3build1) ...\n",
"Setting up libarchive-dev:amd64 (3.6.0-1ubuntu1.5) ...\n",
"Setting up libtesseract-dev:amd64 (4.1.1-2.1build1) ...\n",
"Processing triggers for man-db (2.10.2-1) ...\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import os\n",
"import cv2\n",
"import pytesseract\n",
"import pandas as pd\n",
"from pdf2image import convert_from_path\n",
"from tqdm import tqdm\n",
"import re\n"
],
"metadata": {
"id": "OlXc31-OPKrF"
},
"execution_count": 2,
"outputs": []
},
{
"cell_type": "code",
"source": [
"pdf_path = \"/content/Science_AR_prim6_TR1.pdf\"\n",
"output_csv = \"output_text_prime6_ar.csv\"\n",
"output_stats = \"output_stats_prime6_ar.csv\"\n",
"output_txt = \"output_text_prime6_ar.txt\"\n",
"images_dir = \"pdf_images_prime6_ar\"\n",
"batch_size = 50\n",
"low_word_threshold = 10\n",
"\n",
"os.makedirs(images_dir, exist_ok=True)"
],
"metadata": {
"id": "104x_y2OBp7N"
},
"execution_count": 3,
"outputs": []
},
{
"cell_type": "code",
"source": [
"print(\"📄 Converting PDF to images...\")\n",
"pages = convert_from_path(pdf_path, dpi=300)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "50fd1372-0159-4de7-a7fc-e4b2d73a1965",
"id": "sCdYDu_EB1aV"
},
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"📄 Converting PDF to images...\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"image_paths = []\n",
"for i, page in enumerate(pages, start=1):\n",
" img_path = os.path.join(images_dir, f\"page_{i}.png\")\n",
" page.save(img_path, \"PNG\")\n",
" image_paths.append(img_path)\n"
],
"metadata": {
"id": "MpPApbwEB5FG"
},
"execution_count": 5,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def clean_text(text):\n",
" text = str(text)\n",
"\n",
" text = re.sub(r\"[^ \\u0600-\\u06FFa-zA-Z0-9\\.\\,\\?\\!\\:\\;\\-\\(\\)\\/%]\", \" \", text)\n",
"\n",
" text = re.sub(r\"[\\.]{2,}\", \".\", text)\n",
" text = re.sub(r\"[\\-]{2,}\", \"-\", text)\n",
" text = re.sub(r\"[_]{2,}\", \" \", text)\n",
"\n",
" text = re.sub(r\"\\s+\", \" \", text)\n",
"\n",
" words = text.strip().split()\n",
" if len(words) <= 2:\n",
" return \"\"\n",
"\n",
" return text.strip()\n"
],
"metadata": {
"id": "UEerCWkVB8tF"
},
"execution_count": 6,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def deep_clean_text(text):\n",
" text = re.sub(r\"\\b\\d{4,}\\b\", \" \", text)\n",
"\n",
" text = re.sub(r\"(.)\\1{2,}\", r\"\\1\\1\", text)\n",
"\n",
" lines = text.split()\n",
" lines = [line for line in lines if not re.fullmatch(r\"[\\d\\W]+\", line)]\n",
"\n",
" return \" \".join(lines).strip()\n"
],
"metadata": {
"id": "PjArDwlKCAVu"
},
"execution_count": 7,
"outputs": []
},
{
"cell_type": "code",
"source": [
"print(\"🔎 Running OCR on pages... (Batch mode)\")\n",
"results = []\n",
"\n",
"for batch_start in range(0, len(image_paths), batch_size):\n",
" batch_paths = image_paths[batch_start:batch_start+batch_size]\n",
" for i, img_path in enumerate(tqdm(batch_paths, desc=f\"Batch {batch_start//batch_size+1}\")):\n",
" page_number = batch_start + i + 1\n",
"\n",
" img = cv2.imread(img_path)\n",
"\n",
" gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)\n",
"\n",
" _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)\n",
"\n",
" text = pytesseract.image_to_string(thresh, lang=\"ara\")\n",
"\n",
" cleaned_text = clean_text(text)\n",
" cleaned_text = deep_clean_text(cleaned_text)\n",
"\n",
"\n",
" if cleaned_text:\n",
" results.append({\"page\": page_number, \"text\": cleaned_text})"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "8ef4f4a3-e725-4c2b-8a05-4bde004fc152",
"id": "uHC2MD9XCHq1"
},
"execution_count": 8,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"🔎 Running OCR on pages... (Batch mode)\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"Batch 1: 100%|██████████| 50/50 [04:11<00:00, 5.03s/it]\n",
"Batch 2: 100%|██████████| 50/50 [04:25<00:00, 5.32s/it]\n",
"Batch 3: 100%|██████████| 37/37 [03:40<00:00, 5.96s/it]\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"\n",
"print(\"💾 Saving results...\")\n",
"\n",
"df = pd.DataFrame(results)\n",
"df.to_csv(output_csv, index=False, encoding=\"utf-8-sig\")\n",
"\n",
"with open(output_txt, \"w\", encoding=\"utf-8\") as f:\n",
" for _, row in df.iterrows():\n",
" f.write(f\"\\n--- الصفحة {row['page']} ---\\n{row['text']}\\n\")"
],
"metadata": {
"id": "x1rPk23yCNzN",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "943af302-80c8-4ca7-9a48-7ad802a38cd4"
},
"execution_count": 9,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"💾 Saving results...\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"df[\"word_count\"] = df[\"text\"].apply(lambda x: len(x.split()))\n",
"stats = {\n",
" \"total_pages\": len(df),\n",
" \"total_words\": df[\"word_count\"].sum(),\n",
" \"avg_words_per_page\": round(df[\"word_count\"].mean(), 2)\n",
"}\n",
"stats_df = pd.DataFrame(list(stats.items()), columns=[\"metric\", \"value\"])\n",
"stats_df.to_csv(output_stats, index=False, encoding=\"utf-8-sig\")\n",
"\n",
"print(\"✅ انتهى OCR للعربي!\")\n",
"print(f\"📂 ملف CSV: {output_csv}\")\n",
"print(f\"📂 ملف TXT: {output_txt}\")\n",
"print(f\"📊 ملف الإحصائيات: {output_stats}\")"
],
"metadata": {
"id": "PZ0AB3cGCQ2G",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "a037f0ad-5a1a-4a42-9233-6c2a42673178"
},
"execution_count": 10,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"✅ انتهى OCR للعربي!\n",
"📂 ملف CSV: output_text_prime6_ar.csv\n",
"📂 ملف TXT: output_text_prime6_ar.txt\n",
"📊 ملف الإحصائيات: output_stats_prime6_ar.csv\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import pandas as pd\n",
"import re\n",
"\n",
"def super_clean_text(text):\n",
" text = str(text)\n",
"\n",
" text = re.sub(r\"[^ \\u0600-\\u06FFa-zA-Z0-9\\.\\,\\?\\!\\:\\;\\-\\(\\)\\/%]\", \" \", text)\n",
"\n",
" text = re.sub(r\"\\b\\d{3,}\\b\", \" \", text)\n",
"\n",
" text = re.sub(r\"[\\.]{2,}\", \".\", text)\n",
" text = re.sub(r\"[\\-]{2,}\", \"-\", text)\n",
" text = re.sub(r\"[_]{2,}\", \" \", text)\n",
" text = re.sub(r\"(.)\\1{2,}\", r\"\\1\\1\", text)\n",
"\n",
" words = text.split()\n",
" words = [w for w in words if not re.fullmatch(r\"[\\d\\W]+\", w)]\n",
"\n",
" text = \" \".join(words)\n",
" text = re.sub(r\"\\s+\", \" \", text).strip()\n",
"\n",
" return text\n",
"\n",
"df = pd.read_csv(\"/content/output_text_prime6_ar.csv\")\n",
"df_cleaned = pd.DataFrame()\n",
"df_cleaned[\"text\"] = df[\"text\"].apply(super_clean_text)\n",
"\n",
"df_cleaned.to_csv(\"output_text_ar_cleaned_prime6.csv\", index=False, encoding=\"utf-8-sig\")\n",
"\n",
"print(\"✅ النصوص اتنضفت وحُفظت في output_text_ar_cleaned.csv (عمود واحد بس)\")\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "c4e0009f-ee32-4bde-de03-1c2d3ee1aa4e",
"id": "ZQHx0Z6VCTkP"
},
"execution_count": 11,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"✅ النصوص اتنضفت وحُفظت في output_text_ar_cleaned.csv (عمود واحد بس)\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import pandas as pd\n",
"\n",
"structure = {\n",
" \"الوحدة الأولى: ما النظام؟\": {\n",
" \"مقدمة\": (8, 11),\n",
" \"المفاهيم\": {\n",
" \"المفهوم 1.1: الخليه كنظام\": {\n",
" \"الدرس الأول\": (12, 16),\n",
" \"الدرس الثاني\": (17, 19),\n",
" \"الدرس الثالث\": (20, 22),\n",
" \"الدرس الرابع\": (23, 26),\n",
" \"الدرس الخامس\": (27, 29),\n",
" \"الدرس السادس\": (30, 33)\n",
"\n",
" },\n",
" \"المفهوم 2.1: الجسم كنظام\": {\n",
" \"الدرس الأول\": (34, 37),\n",
" \"الدرس الثاني\": (38, 40),\n",
" \"الدرس الثالث\": (41, 44),\n",
" \"الدرس الرابع\": (45, 47),\n",
" \"الدرس الخامس\": (48, 50),\n",
" \"الدرس السادس\": (51, 53)\n",
"\n",
" },\n",
" \"المفهوم 3.1: الطاقة كنظام\": {\n",
" \"الدرس الأول\": (54, 56),\n",
" \"الدرس الثاني\": (57, 59),\n",
" \"الدرس الثالث\": (60, 64),\n",
" \"الدرس الرابع\": (65, 66),\n",
" \"الدرس الخامس\": (67, 70),\n",
" \"الدرس السادس\": (71, 72)\n",
"\n",
" },\n",
" },\n",
" \"مشروع الوحدة\": (73, 75),\n",
" \"تقييم الوحدة\": (76, 79)\n",
" },\n",
"\n",
" \"الوحدة الثانية: الحصول على الطاقة\": {\n",
" \"مقدمة\": (80, 83),\n",
" \"المفاهيم\": {\n",
" \"المفهوم 1.2: الطاقة الحرارية وحالات المادة\": {\n",
" \"الدرس الأول\": (84, 88),\n",
" \"الدرس الثاني\": (89, 91),\n",
" \"الدرس الثالث\": (92, 93),\n",
" \"الدرس الرابع\": (94, 95),\n",
" \"الدرس الخامس\": (96, 98),\n",
" \"الدرس السادس\": (99, 101),\n",
"\n",
" },\n",
" \"المفهوم 2.2: انتقال الحرارة\": {\n",
" \"الدرس الأول\": (102, 105),\n",
" \"الدرس الثاني\": (106, 109),\n",
" \"الدرس الثالث\": (110, 111),\n",
" \"الدرس الرابع\": (112, 113),\n",
" \"الدرس الخامس\": (114, 116),\n",
" \"الدرس السادس\": (117, 119),\n",
"\n",
" }\n",
" },\n",
" \"مشروع الوحدة\": (120, 121),\n",
" \"المشروع بيني التخصصات\": (122, 130),\n",
" \"تقييم الوحدة\": (131, 134)\n",
" }\n",
"}\n",
"\n",
"df = pd.read_csv(\"/content/output_text_ar_cleaned_prime6.csv\")\n",
"df_raw = pd.read_csv(\"/content/output_text_prime6_ar.csv\")\n",
"df[\"page\"] = df_raw[\"page\"]\n",
"\n",
"rows = []\n",
"\n",
"rows = []\n",
"\n",
"for unit, udata in structure.items():\n",
" if \"مقدمة\" in udata:\n",
" start, end = udata[\"مقدمة\"]\n",
" text = \" \".join(df[(df[\"page\"] >= start) & (df[\"page\"] <= end)][\"text\"])\n",
" rows.append({\n",
" \"الوحدة\": unit,\n",
" \"المفهوم\": \"مقدمة\",\n",
" \"الدرس\": \"مقدمة\",\n",
" \"من صفحة\": start,\n",
" \"إلى صفحة\": end,\n",
" \"النص\": text\n",
" })\n",
"\n",
" if \"المفاهيم\" in udata:\n",
" for concept, lessons in udata[\"المفاهيم\"].items():\n",
" for lesson, (ls, le) in lessons.items():\n",
" lesson_text = \" \".join(df[(df[\"page\"] >= ls) & (df[\"page\"] <= le)][\"text\"])\n",
" rows.append({\n",
" \"الوحدة\": unit,\n",
" \"المفهوم\": concept,\n",
" \"الدرس\": lesson,\n",
" \"من صفحة\": ls,\n",
" \"إلى صفحة\": le,\n",
" \"النص\": lesson_text\n",
" })\n",
"\n",
" for section in [\"مشروع الوحدة\", \"المشروع بيني التخصصات\", \"قيم تعلمك\", \"السلامة في فصول العلوم\"]:\n",
" if section in udata:\n",
" start, end = udata[section]\n",
" text = \" \".join(df[(df[\"page\"] >= start) & (df[\"page\"] <= end)][\"text\"])\n",
" rows.append({\n",
" \"الوحدة\": unit,\n",
" \"المفهوم\": section,\n",
" \"الدرس\": section,\n",
" \"من صفحة\": start,\n",
" \"إلى صفحة\": end,\n",
" \"النص\": text\n",
" })\n",
"\n",
"\n",
"\n",
"df_out = pd.DataFrame(rows)\n",
"df_out.to_csv(\"output_units_lessons_prime6.csv\", index=False, encoding=\"utf-8-sig\")\n",
"\n",
"print(\"✅ تم تقسيم النصوص للوحدات والدروس (الصف السادس) وحفظها في output_units_lessons_prime6.csv\")\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "WWIa8R-aQ_Kd",
"outputId": "5057cd82-79fc-4af6-fe18-5dc6358cc0e5"
},
"execution_count": 13,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"✅ تم تقسيم النصوص للوحدات والدروس (الصف السادس) وحفظها في output_units_lessons_prime6.csv\n"
]
}
]
}
]
}
\ No newline at end of file
output_stats_prime6_ar.csv:
metric,value
total_pages,134.0
total_words,21334.0
avg_words_per_page,159.21
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"gpuType": "T4"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "code",
"source": [
"!apt-get install tesseract-ocr\n",
"!pip install pdfplumber pdf2image pytesseract\n",
"!apt-get install -y poppler-utils\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "mXFZpTwLOjTZ",
"outputId": "181e3c60-149b-4582-e882-17f6fbf16113"
},
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Reading package lists... Done\n",
"Building dependency tree... Done\n",
"Reading state information... Done\n",
"tesseract-ocr is already the newest version (4.1.1-2.1build1).\n",
"0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.\n",
"Collecting pdfplumber\n",
" Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m42.8/42.8 kB\u001b[0m \u001b[31m2.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting pdf2image\n",
" Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)\n",
"Collecting pytesseract\n",
" Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)\n",
"Collecting pdfminer.six==20250506 (from pdfplumber)\n",
" Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)\n",
"Requirement already satisfied: Pillow>=9.1 in /usr/local/lib/python3.12/dist-packages (from pdfplumber) (11.3.0)\n",
"Collecting pypdfium2>=4.18.0 (from pdfplumber)\n",
" Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m48.5/48.5 kB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: charset-normalizer>=2.0.0 in /usr/local/lib/python3.12/dist-packages (from pdfminer.six==20250506->pdfplumber) (3.4.3)\n",
"Requirement already satisfied: cryptography>=36.0.0 in /usr/local/lib/python3.12/dist-packages (from pdfminer.six==20250506->pdfplumber) (43.0.3)\n",
"Requirement already satisfied: packaging>=21.3 in /usr/local/lib/python3.12/dist-packages (from pytesseract) (25.0)\n",
"Requirement already satisfied: cffi>=1.12 in /usr/local/lib/python3.12/dist-packages (from cryptography>=36.0.0->pdfminer.six==20250506->pdfplumber) (1.17.1)\n",
"Requirement already satisfied: pycparser in /usr/local/lib/python3.12/dist-packages (from cffi>=1.12->cryptography>=36.0.0->pdfminer.six==20250506->pdfplumber) (2.22)\n",
"Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.0/60.0 kB\u001b[0m \u001b[31m5.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m79.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading pdf2image-1.17.0-py3-none-any.whl (11 kB)\n",
"Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)\n",
"Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.8 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.8/2.8 MB\u001b[0m \u001b[31m101.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hInstalling collected packages: pytesseract, pypdfium2, pdf2image, pdfminer.six, pdfplumber\n",
"Successfully installed pdf2image-1.17.0 pdfminer.six-20250506 pdfplumber-0.11.7 pypdfium2-4.30.0 pytesseract-0.3.13\n",
"Reading package lists... Done\n",
"Building dependency tree... Done\n",
"Reading state information... Done\n",
"The following NEW packages will be installed:\n",
" poppler-utils\n",
"0 upgraded, 1 newly installed, 0 to remove and 35 not upgraded.\n",
"Need to get 186 kB of archives.\n",
"After this operation, 697 kB of additional disk space will be used.\n",
"Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.10 [186 kB]\n",
"Fetched 186 kB in 1s (196 kB/s)\n",
"Selecting previously unselected package poppler-utils.\n",
"(Reading database ... 126374 files and directories currently installed.)\n",
"Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.10_amd64.deb ...\n",
"Unpacking poppler-utils (22.02.0-2ubuntu0.10) ...\n",
"Setting up poppler-utils (22.02.0-2ubuntu0.10) ...\n",
"Processing triggers for man-db (2.10.2-1) ...\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import os\n",
"import cv2\n",
"import pytesseract\n",
"import pandas as pd\n",
"from pdf2image import convert_from_path\n",
"from tqdm import tqdm\n",
"import re\n"
],
"metadata": {
"id": "OlXc31-OPKrF"
},
"execution_count": 2,
"outputs": []
},
{
"cell_type": "code",
"source": [
"pdf_path = \"/content/Science_EN_prim5_TR1 (1).pdf\"\n",
"output_csv = \"output_text.csv\"\n",
"output_stats = \"output_stats.csv\"\n",
"output_txt = \"output_text.txt\"\n",
"images_dir = \"pdf_images\"\n",
"batch_size = 50\n",
"low_word_threshold = 10\n",
"os.makedirs(images_dir, exist_ok=True)"
],
"metadata": {
"id": "u1W69-mMyhMG"
},
"execution_count": 3,
"outputs": []
},
{
"cell_type": "code",
"source": [
"print(\"📄 Converting PDF to images...\")\n",
"pages = convert_from_path(pdf_path, dpi=300)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "a8e9b878-0c10-4a6b-ea99-92fc7caa1d99",
"id": "C_LneoyryryP"
},
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"📄 Converting PDF to images...\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"image_paths = []\n",
"for i, page in enumerate(pages, start=1):\n",
" img_path = os.path.join(images_dir, f\"page_{i}.png\")\n",
" page.save(img_path, \"PNG\")\n",
" image_paths.append(img_path)\n"
],
"metadata": {
"id": "wgX68ISuyvIn"
},
"execution_count": 5,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def clean_text(text):\n",
" text = str(text)\n",
" text = re.sub(r\"\\s+\", \" \", text)\n",
" text = re.sub(r\"[^\\x00-\\x7F\\u0600-\\u06FF]+\", \" \", text)\n",
" return text.strip()"
],
"metadata": {
"id": "IRBX4I2bhZng"
},
"execution_count": 6,
"outputs": []
},
{
"cell_type": "code",
"source": [
"print(\"🔎 Running OCR on pages... (Batch mode)\")\n",
"results = []\n",
"\n",
"for batch_start in range(0, len(image_paths), batch_size):\n",
" batch_paths = image_paths[batch_start:batch_start+batch_size]\n",
" for i, img_path in enumerate(tqdm(batch_paths, desc=f\"Batch {batch_start//batch_size+1}\")):\n",
" page_number = batch_start + i + 1\n",
"\n",
" img = cv2.imread(img_path)\n",
"\n",
" gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)\n",
"\n",
" _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)\n",
"\n",
" text = pytesseract.image_to_string(thresh, lang=\"eng\")\n",
"\n",
" cleaned_text = clean_text(text)\n",
"\n",
" results.append({\"page\": page_number, \"text\": cleaned_text})\n"
],
"metadata": {
"id": "ZAwUNKFkyz7m",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "80adc825-c300-400f-9a8b-c52dc1f5c21b"
},
"execution_count": 7,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"🔎 Running OCR on pages... (Batch mode)\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"Batch 1: 100%|██████████| 50/50 [02:54<00:00, 3.50s/it]\n",
"Batch 2: 100%|██████████| 50/50 [02:51<00:00, 3.44s/it]\n",
"Batch 3: 100%|██████████| 29/29 [01:52<00:00, 3.89s/it]\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"print(\"💾 Saving results...\")\n",
"\n",
"df = pd.DataFrame(results)\n",
"df.to_csv(output_csv, index=False, encoding=\"utf-8-sig\")\n",
"\n",
"with open(output_txt, \"w\", encoding=\"utf-8\") as f:\n",
" for _, row in df.iterrows():\n",
" f.write(f\"\\n--- Page {row['page']} ---\\n{row['text']}\\n\")\n"
],
"metadata": {
"id": "wA9oIFuMy3gA",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "397832ee-5722-43aa-d66d-b85e5f1768c2"
},
"execution_count": 8,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"💾 Saving results...\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"df[\"word_count\"] = df[\"text\"].apply(lambda x: len(x.split()))\n",
"stats = {\n",
" \"total_pages\": len(df),\n",
" \"total_words\": df[\"word_count\"].sum(),\n",
" \"avg_words_per_page\": df[\"word_count\"].mean()\n",
"}\n",
"stats_df = pd.DataFrame(list(stats.items()), columns=[\"metric\", \"value\"])\n",
"stats_df.to_csv(output_stats, index=False, encoding=\"utf-8-sig\")\n",
"\n",
"print(\"✅ Done!\")\n",
"print(f\"📂 Text file saved: {output_csv}\")\n",
"print(f\"📂 Clean text file saved: {output_txt}\")\n",
"print(f\"📊 Stats file saved: {output_stats}\")"
],
"metadata": {
"id": "CPYgXgLMy6Xh",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "785517e6-145c-4ead-a012-353340643741"
},
"execution_count": 9,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"✅ Done!\n",
"📂 Text file saved: output_text.csv\n",
"📂 Clean text file saved: output_text.txt\n",
"📊 Stats file saved: output_stats.csv\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import pandas as pd\n",
"import re\n",
"\n",
"def super_clean_text_en(text):\n",
" text = str(text)\n",
"\n",
" text = re.sub(r\"[^a-zA-Z0-9\\s\\.\\,\\?\\!\\:\\;\\-\\(\\)\\/%]\", \" \", text)\n",
"\n",
" text = re.sub(r\"\\b(\\w+)( \\1){2,}\\b\", r\"\\1\", text)\n",
"\n",
" text = re.sub(r\"(.)\\1{2,}\", r\"\\1\\1\", text)\n",
"\n",
" text = re.sub(r\"(\\d)\\1{2,}\", r\"\\1\\1\\1\", text)\n",
"\n",
" words = text.split()\n",
" cleaned_words = []\n",
" for w in words:\n",
" if len(w) >= 15:\n",
" continue\n",
" cleaned_words.append(w)\n",
" text = \" \".join(cleaned_words)\n",
"\n",
" text = re.sub(r\"\\s+\", \" \", text)\n",
"\n",
" lines = text.split(\".\")\n",
" seen = set()\n",
" cleaned_lines = []\n",
" for line in lines:\n",
" l = line.strip()\n",
" if l and l not in seen:\n",
" cleaned_lines.append(l)\n",
" seen.add(l)\n",
" text = \". \".join(cleaned_lines)\n",
"\n",
" return text.strip()\n",
"\n",
"df = pd.read_csv(\"/content/output_text_prime5_en.csv\")\n",
"\n",
"df[\"text\"] = df[\"text\"].apply(super_clean_text_en)\n",
"\n",
"\n",
"df.to_csv(\"output_text_en_cleaned.csv\", index=False, encoding=\"utf-8-sig\")\n",
"\n",
"print(\"✅ النصوص اتنضفت وحُفظت في output_text_en_cleaned_prime5.csv\")\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "9f2f030a-f8ab-448f-fa95-b4b18a017fa1",
"id": "kko7q5b7VHP5"
},
"execution_count": 11,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"✅ النصوص اتنضفت وحُفظت في output_text_en_cleaned_prime5.csv\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import pandas as pd\n",
"\n",
"# ---------------- structure ----------------\n",
"structure = {\n",
" \"Unit 1: Interactions of Organisms\": {\n",
" \"Get Started\": (10, 13),\n",
" \"Concepts\": {\n",
" \"Concept 1.1 Plant Needs\": {\n",
" \"Lesson 1\": (14, 17),\n",
" \"Lesson 2\": (18, 21),\n",
" \"Lesson 3\": (22, 25),\n",
" \"Lesson 4\": (26, 29),\n",
" \"Lesson 5\": (30, 33),\n",
" },\n",
" \"Concept 1.2 Energy Flow in Ecosystems\": {\n",
" \"Lesson 1\": (34, 37),\n",
" \"Lesson 2\": (38, 40),\n",
" \"Lesson 3\": (41, 43),\n",
" \"Lesson 4\": (44, 45),\n",
" },\n",
" \"Concept 1.3 Changes in Food Webs\": {\n",
" \"Lesson 1\": (46, 50),\n",
" \"Lesson 2\": (51, 53),\n",
" \"Lesson 3\": (54, 55),\n",
" \"Lesson 4\": (56, 58),\n",
" }\n",
" },\n",
" \"Unit Project\": (59, 59),\n",
" \"Interdisciplinary project\": (60, 67),\n",
" \"Assess your learning\": (68, 69)\n",
" },\n",
" \"Unit 2: Particles in Motion\": {\n",
" \"Get Started\": (70, 73),\n",
" \"Concepts\": {\n",
" \"Concept 2.1 Matter in the World around Us\": {\n",
" \"Lesson 1\": (74, 76),\n",
" \"Lesson 2\": (77, 80),\n",
" \"Lesson 3\": (81, 83),\n",
" \"Lesson 4\": (84, 86),\n",
" \"Lesson 5\": (87, 89),\n",
" },\n",
" \"Concept 2.2 Describing and Measuring Matter\": {\n",
" \"Lesson 1\": (90, 93),\n",
" \"Lesson 2\": (94, 95),\n",
" \"Lesson 3\": (96, 100),\n",
" \"Lesson 4\": (101, 103),\n",
" },\n",
" \"Concept 2.3 Comparing Changes in Matter\": {\n",
" \"Lesson 1\": (104, 107),\n",
" \"Lesson 2\": (108, 111),\n",
" \"Lesson 3\": (112, 116),\n",
" \"Lesson 4\": (117, 120),\n",
" \"Lesson 5\": (121, 123),\n",
" }\n",
" },\n",
" \"Unit Project\": (124, 125),\n",
" \"Assess your learning\": (126, 127)\n",
" },\n",
"}\n",
"\n",
"# ---------------- load OCR text ----------------\n",
"df = pd.read_csv(\"/content/output_text_en_cleaned.csv\")\n",
"df_raw = pd.read_csv(\"/content/output_text_prime5_en.csv\")\n",
"df[\"page\"] = df_raw[\"page\"]\n",
"\n",
"rows = []\n",
"\n",
"# ---------------- split text ----------------\n",
"for unit, udata in structure.items():\n",
" # Get Started\n",
" if \"Get Started\" in udata:\n",
" start, end = udata[\"Get Started\"]\n",
" text = \" \".join(df[(df[\"page\"] >= start) & (df[\"page\"] <= end)][\"text\"])\n",
" rows.append({\n",
" \"Unit\": unit,\n",
" \"Concept\": \"Get Started\",\n",
" \"Lesson\": \"Get Started\",\n",
" \"From page\": start,\n",
" \"To page\": end,\n",
" \"Lesson text\": text\n",
" })\n",
"\n",
" # Concepts & Lessons\n",
" if \"Concepts\" in udata:\n",
" for concept, lessons in udata[\"Concepts\"].items():\n",
" for lesson, (ls, le) in lessons.items():\n",
" lesson_text = \" \".join(df[(df[\"page\"] >= ls) & (df[\"page\"] <= le)][\"text\"])\n",
" rows.append({\n",
" \"Unit\": unit,\n",
" \"Concept\": concept,\n",
" \"Lesson\": lesson,\n",
" \"From page\": ls,\n",
" \"To page\": le,\n",
" \"Lesson text\": lesson_text\n",
" })\n",
"\n",
" # Other sections\n",
" for section in [\"Unit Project\", \"Assess your learning\", \"Interdisciplinary project\"]:\n",
" if section in udata:\n",
" start, end = udata[section]\n",
" text = \" \".join(df[(df[\"page\"] >= start) & (df[\"page\"] <= end)][\"text\"])\n",
" rows.append({\n",
" \"Unit\": unit,\n",
" \"Concept\": section,\n",
" \"Lesson\": section,\n",
" \"From page\": start,\n",
" \"To page\": end,\n",
" \"Lesson text\": text\n",
" })\n",
"\n",
"# ---------------- save output ----------------\n",
"df_out = pd.DataFrame(rows)\n",
"df_out.to_csv(\"output_units_lessons_prime5_EN.csv\", index=False, encoding=\"utf-8-sig\")\n",
"\n",
"print(\"✅ Done! Lessons & units split and saved to output_units_lessons_prime5_EN.csv\")\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "pICh8K7rnZyd",
"outputId": "cc0d291d-e292-494c-fc15-310f5bb047e7"
},
"execution_count": 16,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"✅ Done! Lessons & units split and saved to output_units_lessons_prime5_EN.csv\n"
]
}
]
}
]
}
\ No newline at end of file
output_stats_prime5_en.csv:
metric,value
total_pages,129.0
total_words,24317.0
avg_words_per_page,188.50387596899225
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"gpuType": "T4"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "code",
"source": [
"!apt-get install tesseract-ocr\n",
"!pip install pdfplumber pdf2image pytesseract\n",
"!apt-get install -y poppler-utils\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "mXFZpTwLOjTZ",
"outputId": "7bea32f6-f53f-4946-c7ba-31b3b03059b7"
},
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Reading package lists... Done\n",
"Building dependency tree... Done\n",
"Reading state information... Done\n",
"tesseract-ocr is already the newest version (4.1.1-2.1build1).\n",
"0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.\n",
"Requirement already satisfied: pdfplumber in /usr/local/lib/python3.12/dist-packages (0.11.7)\n",
"Requirement already satisfied: pdf2image in /usr/local/lib/python3.12/dist-packages (1.17.0)\n",
"Requirement already satisfied: pytesseract in /usr/local/lib/python3.12/dist-packages (0.3.13)\n",
"Requirement already satisfied: pdfminer.six==20250506 in /usr/local/lib/python3.12/dist-packages (from pdfplumber) (20250506)\n",
"Requirement already satisfied: Pillow>=9.1 in /usr/local/lib/python3.12/dist-packages (from pdfplumber) (11.3.0)\n",
"Requirement already satisfied: pypdfium2>=4.18.0 in /usr/local/lib/python3.12/dist-packages (from pdfplumber) (4.30.0)\n",
"Requirement already satisfied: charset-normalizer>=2.0.0 in /usr/local/lib/python3.12/dist-packages (from pdfminer.six==20250506->pdfplumber) (3.4.3)\n",
"Requirement already satisfied: cryptography>=36.0.0 in /usr/local/lib/python3.12/dist-packages (from pdfminer.six==20250506->pdfplumber) (43.0.3)\n",
"Requirement already satisfied: packaging>=21.3 in /usr/local/lib/python3.12/dist-packages (from pytesseract) (25.0)\n",
"Requirement already satisfied: cffi>=1.12 in /usr/local/lib/python3.12/dist-packages (from cryptography>=36.0.0->pdfminer.six==20250506->pdfplumber) (1.17.1)\n",
"Requirement already satisfied: pycparser in /usr/local/lib/python3.12/dist-packages (from cffi>=1.12->cryptography>=36.0.0->pdfminer.six==20250506->pdfplumber) (2.22)\n",
"Reading package lists... Done\n",
"Building dependency tree... Done\n",
"Reading state information... Done\n",
"poppler-utils is already the newest version (22.02.0-2ubuntu0.10).\n",
"0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.\n"
]
}
]
},
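{
"cell_type": "markdown",
"source": [
"Quick environment check (a minimal sketch, assuming pytesseract >= 0.3): confirm the Tesseract binary and its language packs are visible before any OCR runs.\n"
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"import pytesseract\n",
"\n",
"# both calls shell out to the tesseract binary installed above\n",
"print(pytesseract.get_tesseract_version())\n",
"print(pytesseract.get_languages(config=\"\"))  # should include 'eng'\n"
],
"metadata": {},
"execution_count": null,
"outputs": []
},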
{
"cell_type": "code",
"source": [
"import os\n",
"import cv2\n",
"import pytesseract\n",
"import pandas as pd\n",
"from pdf2image import convert_from_path\n",
"from tqdm import tqdm\n",
"import re\n"
],
"metadata": {
"id": "OlXc31-OPKrF"
},
"execution_count": 5,
"outputs": []
},
{
"cell_type": "code",
"source": [
"pdf_path = \"/content/Science_E_prim6_TR1.pdf\"\n",
"output_csv = \"output_text_prime6_en.csv\"\n",
"output_stats = \"output_stats_prime6_en.csv\"\n",
"output_txt = \"output_text_prime6_en.txt\"\n",
"images_dir = \"pdf_images_prime6_en\"\n",
"batch_size = 50\n",
"low_word_threshold = 10\n",
"\n",
"os.makedirs(images_dir, exist_ok=True)"
],
"metadata": {
"id": "u1W69-mMyhMG"
},
"execution_count": 6,
"outputs": []
},
{
"cell_type": "code",
"source": [
"print(\"📄 Converting PDF to images...\")\n",
"pages = convert_from_path(pdf_path, dpi=300)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "2b682e60-e7e8-42de-f4c8-89bbb113cc45",
"id": "C_LneoyryryP"
},
"execution_count": 7,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"📄 Converting PDF to images...\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"image_paths = []\n",
"for i, page in enumerate(pages, start=1):\n",
" img_path = os.path.join(images_dir, f\"page_{i}.png\")\n",
" page.save(img_path, \"PNG\")\n",
" image_paths.append(img_path)\n"
],
"metadata": {
"id": "wgX68ISuyvIn"
},
"execution_count": 8,
"outputs": []
},
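{
"cell_type": "markdown",
"source": [
"Holding every 300-dpi page as a PIL image can strain Colab RAM on long PDFs. A hedged alternative using pdf2image's `output_folder` / `paths_only` options (available in 1.17): poppler writes the PNGs straight to disk and only the file paths come back. `output_file` fixes the filename prefix so pages sort predictably.\n"
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# memory-friendly variant of the two cells above; drop-in replacement for image_paths\n",
"image_paths_alt = convert_from_path(\n",
"    pdf_path, dpi=300, fmt=\"png\",\n",
"    output_folder=images_dir, output_file=\"page\", paths_only=True,\n",
")\n",
"print(len(image_paths_alt), \"pages written to\", images_dir)\n"
],
"metadata": {},
"execution_count": null,
"outputs": []
},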
{
"cell_type": "code",
"source": [
"def clean_text(text):\n",
" text = str(text)\n",
" text = re.sub(r\"\\s+\", \" \", text)\n",
" text = re.sub(r\"[^\\x00-\\x7F\\u0600-\\u06FF]+\", \" \", text)\n",
" return text.strip()"
],
"metadata": {
"id": "IRBX4I2bhZng"
},
"execution_count": 9,
"outputs": []
},
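{
"cell_type": "markdown",
"source": [
"A tiny check of what the filter keeps: ASCII and the Arabic block (U+0600–U+06FF) survive, while other symbols become spaces (sample string invented for illustration).\n"
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"sample = \"Energy  ★  طاقة\\n\\tflows →  through   systems\"\n",
"print(clean_text(sample))  # -> Energy طاقة flows through systems\n"
],
"metadata": {},
"execution_count": null,
"outputs": []
},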
{
"cell_type": "code",
"source": [
"print(\"🔎 Running OCR on pages... (Batch mode)\")\n",
"results = []\n",
"\n",
"for batch_start in range(0, len(image_paths), batch_size):\n",
" batch_paths = image_paths[batch_start:batch_start+batch_size]\n",
" for i, img_path in enumerate(tqdm(batch_paths, desc=f\"Batch {batch_start//batch_size+1}\")):\n",
" page_number = batch_start + i + 1\n",
"\n",
" img = cv2.imread(img_path)\n",
"\n",
" gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)\n",
"\n",
" _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)\n",
"\n",
" text = pytesseract.image_to_string(thresh, lang=\"eng\")\n",
"\n",
" cleaned_text = clean_text(text)\n",
"\n",
" results.append({\"page\": page_number, \"text\": cleaned_text})\n"
],
"metadata": {
"id": "ZAwUNKFkyz7m",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "3625b751-cd0e-426a-cb12-f9495058d18c"
},
"execution_count": 10,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"🔎 Running OCR on pages... (Batch mode)\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"Batch 1: 100%|██████████| 50/50 [03:36<00:00, 4.34s/it]\n",
"Batch 2: 100%|██████████| 50/50 [03:43<00:00, 4.47s/it]\n",
"Batch 3: 100%|██████████| 38/38 [02:54<00:00, 4.60s/it]\n"
]
}
]
},
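{
"cell_type": "markdown",
"source": [
"OCR quality varies a lot between scanned pages. One way to spot weak pages, sketched here with pytesseract's `image_to_data` (the 60-point cut-off is an arbitrary assumption), is to average Tesseract's per-word confidence:\n"
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"from pytesseract import Output\n",
"\n",
"def page_confidence(img):\n",
"    # word-level boxes; conf == -1 marks non-text boxes and is skipped\n",
"    data = pytesseract.image_to_data(img, lang=\"eng\", output_type=Output.DICT)\n",
"    confs = [float(c) for c in data[\"conf\"] if float(c) >= 0]\n",
"    return sum(confs) / len(confs) if confs else 0.0\n",
"\n",
"# example: check the first page of the book\n",
"if page_confidence(cv2.imread(image_paths[0])) < 60:\n",
"    print(\"⚠️ low-confidence OCR on page 1\")\n"
],
"metadata": {},
"execution_count": null,
"outputs": []
},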
{
"cell_type": "code",
"source": [
"print(\"💾 Saving results...\")\n",
"\n",
"df = pd.DataFrame(results)\n",
"df.to_csv(output_csv, index=False, encoding=\"utf-8-sig\")\n",
"\n",
"with open(output_txt, \"w\", encoding=\"utf-8\") as f:\n",
" for _, row in df.iterrows():\n",
" f.write(f\"\\n--- Page {row['page']} ---\\n{row['text']}\\n\")\n"
],
"metadata": {
"id": "wA9oIFuMy3gA",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "9a3bbab0-f1a0-4b82-b8f8-c6d3ce765eac"
},
"execution_count": 11,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"💾 Saving results...\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"df[\"word_count\"] = df[\"text\"].apply(lambda x: len(x.split()))\n",
"stats = {\n",
" \"total_pages\": len(df),\n",
" \"total_words\": df[\"word_count\"].sum(),\n",
" \"avg_words_per_page\": df[\"word_count\"].mean()\n",
"}\n",
"stats_df = pd.DataFrame(list(stats.items()), columns=[\"metric\", \"value\"])\n",
"stats_df.to_csv(output_stats, index=False, encoding=\"utf-8-sig\")\n",
"\n",
"print(\"✅ Done!\")\n",
"print(f\"📂 Text file saved: {output_csv}\")\n",
"print(f\"📂 Clean text file saved: {output_txt}\")\n",
"print(f\"📊 Stats file saved: {output_stats}\")"
],
"metadata": {
"id": "CPYgXgLMy6Xh",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "ddd6ce3b-e481-4c9d-f33a-f485ebbd7d27"
},
"execution_count": 12,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"✅ Done!\n",
"📂 Text file saved: output_text_prime6_en.csv\n",
"📂 Clean text file saved: output_text_prime6_en.txt\n",
"📊 Stats file saved: output_stats_prime6_en.csv\n"
]
}
]
},
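{
"cell_type": "markdown",
"source": [
"`low_word_threshold` was configured earlier but never used; a plausible completion of that step is to list pages whose OCR produced suspiciously few words:\n"
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# pages under the threshold are usually image-heavy or failed OCR\n",
"low_pages = df[df[\"word_count\"] < low_word_threshold][\"page\"].tolist()\n",
"print(f\"{len(low_pages)} pages below {low_word_threshold} words: {low_pages}\")\n"
],
"metadata": {},
"execution_count": null,
"outputs": []
},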
{
"cell_type": "code",
"source": [
"import pandas as pd\n",
"import re\n",
"\n",
"def super_clean_text_en(text):\n",
" text = str(text)\n",
"\n",
" text = re.sub(r\"[^a-zA-Z0-9\\s\\.\\,\\?\\!\\:\\;\\-\\(\\)\\/%]\", \" \", text)\n",
"\n",
" text = re.sub(r\"\\b(\\w+)( \\1){2,}\\b\", r\"\\1\", text)\n",
"\n",
" text = re.sub(r\"(.)\\1{2,}\", r\"\\1\\1\", text)\n",
"\n",
" text = re.sub(r\"(\\d)\\1{2,}\", r\"\\1\\1\\1\", text)\n",
"\n",
" words = text.split()\n",
" cleaned_words = []\n",
" for w in words:\n",
" if len(w) >= 15:\n",
" continue\n",
" cleaned_words.append(w)\n",
" text = \" \".join(cleaned_words)\n",
"\n",
" text = re.sub(r\"\\s+\", \" \", text)\n",
"\n",
" lines = text.split(\".\")\n",
" seen = set()\n",
" cleaned_lines = []\n",
" for line in lines:\n",
" l = line.strip()\n",
" if l and l not in seen:\n",
" cleaned_lines.append(l)\n",
" seen.add(l)\n",
" text = \". \".join(cleaned_lines)\n",
"\n",
" return text.strip()\n",
"\n",
"df = pd.read_csv(\"/content/output_text_prime6_en.csv\")\n",
"\n",
"df[\"text\"] = df[\"text\"].apply(super_clean_text_en)\n",
"\n",
"df.to_csv(\"output_text_en_cleaned.csv\", index=False, encoding=\"utf-8-sig\")\n",
"\n",
"print(\"✅ النصوص اتنضفت وحُفظت في output_text_en_cleaned_prime5.csv\")\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "37b3fbf7-f5dc-4d71-b09a-c4c5326a4321",
"id": "kko7q5b7VHP5"
},
"execution_count": 14,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"✅ النصوص اتنضفت وحُفظت في output_text_en_cleaned_prime5.csv\n"
]
}
]
},
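{
"cell_type": "markdown",
"source": [
"A quick check of the sentence de-duplication on a typical OCR artefact, a sentence repeated across a page (sample invented for illustration; note the splitter drops the final period):\n"
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"sample = \"Heat flows from hot to cold. Heat flows from hot to cold. Unit 2 Getting energy.\"\n",
"print(super_clean_text_en(sample))\n",
"# -> Heat flows from hot to cold. Unit 2 Getting energy\n"
],
"metadata": {},
"execution_count": null,
"outputs": []
},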
{
"cell_type": "code",
"source": [
"import pandas as pd\n",
"\n",
"# ---------------- structure ----------------\n",
"structure = {\n",
" \"Unit 1: What is system\": {\n",
" \"Get Started\": (9, 12),\n",
" \"Concepts\": {\n",
" \"Concept 1.1 The cell as a system\": {\n",
" \"Lesson 1\": (13, 17),\n",
" \"Lesson 2\": (18, 20),\n",
" \"Lesson 3\": (21, 23),\n",
" \"Lesson 4\": (24, 27),\n",
" \"Lesson 5\": (28, 30),\n",
" \"Lesson 6\": (31, 34),\n",
" },\n",
" \"Concept 1.2 The body as a system\": {\n",
" \"Lesson 1\": (35, 38),\n",
" \"Lesson 2\": (39, 41),\n",
" \"Lesson 3\": (42, 45),\n",
" \"Lesson 4\": (46, 48),\n",
" \"Lesson 5\": (49, 51),\n",
" \"Lesson 6\": (52, 54),\n",
" },\n",
" \"Concept 1.3 Energy as a system\": {\n",
" \"Lesson 1\": (55, 57),\n",
" \"Lesson 2\": (58, 60),\n",
" \"Lesson 3\": (61, 65),\n",
" \"Lesson 4\": (66, 67),\n",
" \"Lesson 5\": (68, 71),\n",
" \"Lesson 6\": (72, 73),\n",
" }\n",
" },\n",
" \"Unit Project\": (74, 76),\n",
" \"Unit assessment\": (77, 80),\n",
" },\n",
" \"Unit 2: Getting energy\": {\n",
" \"Get Started\": (81, 84),\n",
" \"Concepts\": {\n",
" \"Concept 2.1 Thermal energy and states of matter\": {\n",
" \"Lesson 1\": (85, 88),\n",
" \"Lesson 2\": (89, 91),\n",
" \"Lesson 3\": (92, 93),\n",
" \"Lesson 4\": (94, 95),\n",
" \"Lesson 5\": (96, 98),\n",
" \"Lesson 6\": (99, 100),\n",
" },\n",
" \"Concept 2.2 Heat transfer\": {\n",
" \"Lesson 1\": (101, 104),\n",
" \"Lesson 2\": (105, 108),\n",
" \"Lesson 3\": (109, 110),\n",
" \"Lesson 4\": (111, 113),\n",
" \"Lesson 5\": (114, 116),\n",
" \"Lesson 6\": (117, 120),\n",
" },\n",
" },\n",
" \"Unit Project\": (121, 122),\n",
" \"Interdisciplinary project\": (123, 131),\n",
" \"Unit assessment\": (132, 135) # توحيد الاسم\n",
" },\n",
"}\n",
"\n",
"# ---------------- load OCR text ----------------\n",
"df = pd.read_csv(\"/content/output_text_en_cleaned.csv\")\n",
"df_raw = pd.read_csv(\"/content/output_text_prime6_en.csv\")\n",
"df[\"page\"] = df_raw[\"page\"]\n",
"\n",
"rows = []\n",
"\n",
"# ---------------- split text ----------------\n",
"for unit, udata in structure.items():\n",
" # Get Started\n",
" if \"Get Started\" in udata:\n",
" start, end = udata[\"Get Started\"]\n",
" text = \" \".join(df[(df[\"page\"] >= start) & (df[\"page\"] <= end)][\"text\"])\n",
" rows.append({\n",
" \"Unit\": unit,\n",
" \"Concept\": \"Get Started\",\n",
" \"Lesson\": \"Get Started\",\n",
" \"From page\": start,\n",
" \"To page\": end,\n",
" \"Lesson text\": text\n",
" })\n",
"\n",
" # Concepts & Lessons\n",
" if \"Concepts\" in udata:\n",
" for concept, lessons in udata[\"Concepts\"].items():\n",
" for lesson, (ls, le) in lessons.items():\n",
" lesson_text = \" \".join(df[(df[\"page\"] >= ls) & (df[\"page\"] <= le)][\"text\"])\n",
" rows.append({\n",
" \"Unit\": unit,\n",
" \"Concept\": concept,\n",
" \"Lesson\": lesson,\n",
" \"From page\": ls,\n",
" \"To page\": le,\n",
" \"Lesson text\": lesson_text\n",
" })\n",
"\n",
" # Other sections\n",
" for section in [\"Unit Project\", \"Unit assessment\", \"Interdisciplinary project\"]:\n",
" if section in udata:\n",
" start, end = udata[section]\n",
" text = \" \".join(df[(df[\"page\"] >= start) & (df[\"page\"] <= end)][\"text\"])\n",
" rows.append({\n",
" \"Unit\": unit,\n",
" \"Concept\": section,\n",
" \"Lesson\": section,\n",
" \"From page\": start,\n",
" \"To page\": end,\n",
" \"Lesson text\": text\n",
" })\n",
"\n",
"# ---------------- save output ----------------\n",
"df_out = pd.DataFrame(rows)\n",
"df_out.to_csv(\"output_units_lessons_prime6_EN.csv\", index=False, encoding=\"utf-8-sig\")\n",
"\n",
"print(\"✅ Done! Lessons & units split and saved to output_units_lessons_prime6_EN.csv\")\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "pICh8K7rnZyd",
"outputId": "22525901-3cc3-4103-b889-e9c4410fd46d"
},
"execution_count": 15,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"✅ Done! Lessons & units split and saved to output_units_lessons_prime6_EN.csv\n"
]
}
]
}
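,
{
"cell_type": "markdown",
"source": [
"A readback check on the split file (a small sketch): lesson counts per unit should match the `structure` map, and no lesson text should come back empty.\n"
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"check = pd.read_csv(\"output_units_lessons_prime6_EN.csv\")\n",
"print(check.groupby(\"Unit\")[\"Lesson\"].count())\n",
"print(\"empty lesson texts:\", (check[\"Lesson text\"].fillna(\"\").str.len() == 0).sum())\n"
],
"metadata": {},
"execution_count": null,
"outputs": []
}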
]
}
\ No newline at end of file
metric,value
total_pages,138.0
total_words,28455.0
avg_words_per_page,206.19565217391303