Commit 7ecadde7 authored by arwa mohamed

Added extracted PDF data for Arabic and English curricula

parent ade545c7
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"gpuType": "T4"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"source": [
"\n",
"\n",
">prime4\n",
"\n"
],
"metadata": {
"id": "gJWbo3uD8u_4"
}
},
{
"cell_type": "code",
"source": [
"!apt-get install tesseract-ocr\n",
"!pip install pdfplumber pdf2image pytesseract\n",
"!apt-get install -y poppler-utils\n",
"!sudo apt install tesseract-ocr -y\n",
"!sudo apt install tesseract-ocr-ara -y\n",
"!sudo apt install libtesseract-dev -y\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "mXFZpTwLOjTZ",
"outputId": "a07e982d-7589-473a-c07e-baef61a227ed"
},
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Reading package lists... Done\n",
"Building dependency tree... Done\n",
"Reading state information... Done\n",
"tesseract-ocr is already the newest version (4.1.1-2.1build1).\n",
"0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.\n",
"Collecting pdfplumber\n",
" Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m42.8/42.8 kB\u001b[0m \u001b[31m1.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting pdf2image\n",
" Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)\n",
"Collecting pytesseract\n",
" Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)\n",
"Collecting pdfminer.six==20250506 (from pdfplumber)\n",
" Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)\n",
"Requirement already satisfied: Pillow>=9.1 in /usr/local/lib/python3.12/dist-packages (from pdfplumber) (11.3.0)\n",
"Collecting pypdfium2>=4.18.0 (from pdfplumber)\n",
" Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m48.5/48.5 kB\u001b[0m \u001b[31m2.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: charset-normalizer>=2.0.0 in /usr/local/lib/python3.12/dist-packages (from pdfminer.six==20250506->pdfplumber) (3.4.3)\n",
"Requirement already satisfied: cryptography>=36.0.0 in /usr/local/lib/python3.12/dist-packages (from pdfminer.six==20250506->pdfplumber) (43.0.3)\n",
"Requirement already satisfied: packaging>=21.3 in /usr/local/lib/python3.12/dist-packages (from pytesseract) (25.0)\n",
"Requirement already satisfied: cffi>=1.12 in /usr/local/lib/python3.12/dist-packages (from cryptography>=36.0.0->pdfminer.six==20250506->pdfplumber) (1.17.1)\n",
"Requirement already satisfied: pycparser in /usr/local/lib/python3.12/dist-packages (from cffi>=1.12->cryptography>=36.0.0->pdfminer.six==20250506->pdfplumber) (2.22)\n",
"Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.0/60.0 kB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m37.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading pdf2image-1.17.0-py3-none-any.whl (11 kB)\n",
"Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)\n",
"Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.8 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.8/2.8 MB\u001b[0m \u001b[31m77.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hInstalling collected packages: pytesseract, pypdfium2, pdf2image, pdfminer.six, pdfplumber\n",
"Successfully installed pdf2image-1.17.0 pdfminer.six-20250506 pdfplumber-0.11.7 pypdfium2-4.30.0 pytesseract-0.3.13\n",
"Reading package lists... Done\n",
"Building dependency tree... Done\n",
"Reading state information... Done\n",
"The following NEW packages will be installed:\n",
" poppler-utils\n",
"0 upgraded, 1 newly installed, 0 to remove and 35 not upgraded.\n",
"Need to get 186 kB of archives.\n",
"After this operation, 697 kB of additional disk space will be used.\n",
"Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.10 [186 kB]\n",
"Fetched 186 kB in 1s (127 kB/s)\n",
"Selecting previously unselected package poppler-utils.\n",
"(Reading database ... 126374 files and directories currently installed.)\n",
"Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.10_amd64.deb ...\n",
"Unpacking poppler-utils (22.02.0-2ubuntu0.10) ...\n",
"Setting up poppler-utils (22.02.0-2ubuntu0.10) ...\n",
"Processing triggers for man-db (2.10.2-1) ...\n",
"Reading package lists... Done\n",
"Building dependency tree... Done\n",
"Reading state information... Done\n",
"tesseract-ocr is already the newest version (4.1.1-2.1build1).\n",
"0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.\n",
"Reading package lists... Done\n",
"Building dependency tree... Done\n",
"Reading state information... Done\n",
"The following NEW packages will be installed:\n",
" tesseract-ocr-ara\n",
"0 upgraded, 1 newly installed, 0 to remove and 35 not upgraded.\n",
"Need to get 645 kB of archives.\n",
"After this operation, 1,447 kB of additional disk space will be used.\n",
"Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-ara all 1:4.00~git30-7274cfa-1.1 [645 kB]\n",
"Fetched 645 kB in 2s (410 kB/s)\n",
"debconf: unable to initialize frontend: Dialog\n",
"debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 1.)\n",
"debconf: falling back to frontend: Readline\n",
"debconf: unable to initialize frontend: Readline\n",
"debconf: (This frontend requires a controlling tty.)\n",
"debconf: falling back to frontend: Teletype\n",
"dpkg-preconfigure: unable to re-open stdin: \n",
"Selecting previously unselected package tesseract-ocr-ara.\n",
"(Reading database ... 126404 files and directories currently installed.)\n",
"Preparing to unpack .../tesseract-ocr-ara_1%3a4.00~git30-7274cfa-1.1_all.deb ...\n",
"Unpacking tesseract-ocr-ara (1:4.00~git30-7274cfa-1.1) ...\n",
"Setting up tesseract-ocr-ara (1:4.00~git30-7274cfa-1.1) ...\n",
"Reading package lists... Done\n",
"Building dependency tree... Done\n",
"Reading state information... Done\n",
"The following additional packages will be installed:\n",
" libarchive-dev libleptonica-dev\n",
"The following NEW packages will be installed:\n",
" libarchive-dev libleptonica-dev libtesseract-dev\n",
"0 upgraded, 3 newly installed, 0 to remove and 35 not upgraded.\n",
"Need to get 3,743 kB of archives.\n",
"After this operation, 16.0 MB of additional disk space will be used.\n",
"Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 libarchive-dev amd64 3.6.0-1ubuntu1.5 [581 kB]\n",
"Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libleptonica-dev amd64 1.82.0-3build1 [1,562 kB]\n",
"Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libtesseract-dev amd64 4.1.1-2.1build1 [1,600 kB]\n",
"Fetched 3,743 kB in 2s (1,635 kB/s)\n",
"debconf: unable to initialize frontend: Dialog\n",
"debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 3.)\n",
"debconf: falling back to frontend: Readline\n",
"debconf: unable to initialize frontend: Readline\n",
"debconf: (This frontend requires a controlling tty.)\n",
"debconf: falling back to frontend: Teletype\n",
"dpkg-preconfigure: unable to re-open stdin: \n",
"Selecting previously unselected package libarchive-dev:amd64.\n",
"(Reading database ... 126408 files and directories currently installed.)\n",
"Preparing to unpack .../libarchive-dev_3.6.0-1ubuntu1.5_amd64.deb ...\n",
"Unpacking libarchive-dev:amd64 (3.6.0-1ubuntu1.5) ...\n",
"Selecting previously unselected package libleptonica-dev.\n",
"Preparing to unpack .../libleptonica-dev_1.82.0-3build1_amd64.deb ...\n",
"Unpacking libleptonica-dev (1.82.0-3build1) ...\n",
"Selecting previously unselected package libtesseract-dev:amd64.\n",
"Preparing to unpack .../libtesseract-dev_4.1.1-2.1build1_amd64.deb ...\n",
"Unpacking libtesseract-dev:amd64 (4.1.1-2.1build1) ...\n",
"Setting up libleptonica-dev (1.82.0-3build1) ...\n",
"Setting up libarchive-dev:amd64 (3.6.0-1ubuntu1.5) ...\n",
"Setting up libtesseract-dev:amd64 (4.1.1-2.1build1) ...\n",
"Processing triggers for man-db (2.10.2-1) ...\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import os\n",
"import cv2\n",
"import pytesseract\n",
"import pandas as pd\n",
"from pdf2image import convert_from_path\n",
"from tqdm import tqdm\n",
"import re\n"
],
"metadata": {
"id": "OlXc31-OPKrF"
},
"execution_count": 2,
"outputs": []
},
{
"cell_type": "code",
"source": [
"pdf_path = \"/content/Science_AR_prim4_TR1.pdf\"\n",
"output_csv = \"output_text_prime4_ar.csv\"\n",
"output_stats = \"output_stats_prime4_ar.csv\"\n",
"output_txt = \"output_text_prime4_ar.txt\"\n",
"images_dir = \"pdf_images_prime4_ar\"\n",
"batch_size = 50\n",
"low_word_threshold = 10\n",
"\n",
"os.makedirs(images_dir, exist_ok=True)"
],
"metadata": {
"id": "TO9y9WyOPN5l"
},
"execution_count": 3,
"outputs": []
},
{
"cell_type": "code",
"source": [
"print(\"📄 Converting PDF to images...\")\n",
"pages = convert_from_path(pdf_path, dpi=300)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "FERBX8NtPUbM",
"outputId": "a1798c91-64d3-407e-9b6c-6d48de46b050"
},
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"📄 Converting PDF to images...\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"image_paths = []\n",
"for i, page in enumerate(pages, start=1):\n",
" img_path = os.path.join(images_dir, f\"page_{i}.png\")\n",
" page.save(img_path, \"PNG\")\n",
" image_paths.append(img_path)\n"
],
"metadata": {
"id": "y0UD8m4oPXDz"
},
"execution_count": 5,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def clean_text(text):\n",
" text = str(text)\n",
"\n",
" text = re.sub(r\"[^ \\u0600-\\u06FFa-zA-Z0-9\\.\\,\\?\\!\\:\\;\\-\\(\\)\\/%]\", \" \", text)\n",
"\n",
" text = re.sub(r\"[\\.]{2,}\", \".\", text)\n",
" text = re.sub(r\"[\\-]{2,}\", \"-\", text)\n",
" text = re.sub(r\"[_]{2,}\", \" \", text)\n",
"\n",
" text = re.sub(r\"\\s+\", \" \", text)\n",
"\n",
" words = text.strip().split()\n",
" if len(words) <= 2:\n",
" return \"\"\n",
"\n",
" return text.strip()\n"
],
"metadata": {
"id": "o7u4lhRN1OjG"
},
"execution_count": 6,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def deep_clean_text(text):\n",
"\n",
" text = re.sub(r\"\\b\\d{4,}\\b\", \" \", text)\n",
"\n",
"\n",
" text = re.sub(r\"(.)\\1{2,}\", r\"\\1\\1\", text)\n",
"\n",
" lines = text.split()\n",
" lines = [line for line in lines if not re.fullmatch(r\"[\\d\\W]+\", line)]\n",
"\n",
" return \" \".join(lines).strip()\n"
],
"metadata": {
"id": "ZNg2aeZ032qR"
},
"execution_count": 7,
"outputs": []
},
{
"cell_type": "code",
"source": [
"print(\"🔎 Running OCR on pages... (Batch mode)\")\n",
"results = []\n",
"\n",
"for batch_start in range(0, len(image_paths), batch_size):\n",
" batch_paths = image_paths[batch_start:batch_start+batch_size]\n",
" for i, img_path in enumerate(tqdm(batch_paths, desc=f\"Batch {batch_start//batch_size+1}\")):\n",
" page_number = batch_start + i + 1\n",
"\n",
" img = cv2.imread(img_path)\n",
"\n",
" gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)\n",
"\n",
" _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)\n",
"\n",
" text = pytesseract.image_to_string(thresh, lang=\"ara\")\n",
"\n",
" cleaned_text = clean_text(text)\n",
" cleaned_text = deep_clean_text(cleaned_text)\n",
"\n",
"\n",
" if cleaned_text:\n",
" results.append({\"page\": page_number, \"text\": cleaned_text})"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "_51dWZ7c1Pyv",
"outputId": "221396c0-6b91-43bb-8051-4941f78d6416"
},
"execution_count": 8,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"🔎 Running OCR on pages... (Batch mode)\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"Batch 1: 100%|██████████| 50/50 [04:00<00:00, 4.82s/it]\n",
"Batch 2: 100%|██████████| 50/50 [03:32<00:00, 4.24s/it]\n",
"Batch 3: 100%|██████████| 28/28 [02:02<00:00, 4.36s/it]\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"\n",
"print(\"💾 Saving results...\")\n",
"\n",
"df = pd.DataFrame(results)\n",
"df.to_csv(output_csv, index=False, encoding=\"utf-8-sig\")\n",
"\n",
"with open(output_txt, \"w\", encoding=\"utf-8\") as f:\n",
" for _, row in df.iterrows():\n",
" f.write(f\"\\n--- الصفحة {row['page']} ---\\n{row['text']}\\n\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "jD3jAZIG1far",
"outputId": "ab2c84ae-a077-4e9d-95be-5077e4552df8"
},
"execution_count": 9,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"💾 Saving results...\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"df[\"word_count\"] = df[\"text\"].apply(lambda x: len(x.split()))\n",
"stats = {\n",
" \"total_pages\": len(df),\n",
" \"total_words\": df[\"word_count\"].sum(),\n",
" \"avg_words_per_page\": round(df[\"word_count\"].mean(), 2)\n",
"}\n",
"stats_df = pd.DataFrame(list(stats.items()), columns=[\"metric\", \"value\"])\n",
"stats_df.to_csv(output_stats, index=False, encoding=\"utf-8-sig\")\n",
"\n",
"print(\"✅ انتهى OCR للعربي!\")\n",
"print(f\"📂 ملف CSV: {output_csv}\")\n",
"print(f\"📂 ملف TXT: {output_txt}\")\n",
"print(f\"📊 ملف الإحصائيات: {output_stats}\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Uq0XYAw11hZY",
"outputId": "67d8ff69-e9ff-4d93-ccf9-ba122f8ec7de"
},
"execution_count": 10,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"✅ انتهى OCR للعربي!\n",
"📂 ملف CSV: output_text_prime4_ar.csv\n",
"📂 ملف TXT: output_text_prime4_ar.txt\n",
"📊 ملف الإحصائيات: output_stats_prime4_ar.csv\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import pandas as pd\n",
"import re\n",
"\n",
"def super_clean_text(text):\n",
" text = str(text)\n",
"\n",
" text = re.sub(r\"[^ \\u0600-\\u06FFa-zA-Z0-9\\.\\,\\?\\!\\:\\;\\-\\(\\)\\/%]\", \" \", text)\n",
"\n",
" text = re.sub(r\"\\b\\d{3,}\\b\", \" \", text)\n",
"\n",
" text = re.sub(r\"[\\.]{2,}\", \".\", text)\n",
" text = re.sub(r\"[\\-]{2,}\", \"-\", text)\n",
" text = re.sub(r\"[_]{2,}\", \" \", text)\n",
" text = re.sub(r\"(.)\\1{2,}\", r\"\\1\\1\", text)\n",
"\n",
" words = text.split()\n",
" words = [w for w in words if not re.fullmatch(r\"[\\d\\W]+\", w)]\n",
"\n",
" text = \" \".join(words)\n",
" text = re.sub(r\"\\s+\", \" \", text).strip()\n",
"\n",
" return text\n",
"\n",
"df = pd.read_csv(\"/content/output_text_prime4_ar.csv\")\n",
"\n",
"df_cleaned = pd.DataFrame()\n",
"df_cleaned[\"text\"] = df[\"text\"].apply(super_clean_text)\n",
"\n",
"df_cleaned.to_csv(\"output_text_ar_cleaned_prime4.csv\", index=False, encoding=\"utf-8-sig\")\n",
"\n",
"print(\"✅ النصوص اتنضفت وحُفظت في output_text_ar_cleaned.csv (عمود واحد بس)\")\n"
],
"metadata": {
"id": "tNcXdjtL6PWe",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "f29c6c84-710a-47d2-99b4-9ff310b5f302"
},
"execution_count": 11,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"✅ النصوص اتنضفت وحُفظت في output_text_ar_cleaned.csv (عمود واحد بس)\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import pandas as pd\n",
"\n",
"structure = {\n",
" \"الوحدة الأولى: الأنظمة الحية\": {\n",
" \"مقدمة\": (9, 12),\n",
" \"المفاهيم\": {\n",
" \"المفهوم 1.1: التكيف والبقاء\": {\n",
" \"الدرس الأول\": (13, 16),\n",
" \"الدرس الثاني\": (17, 22),\n",
" \"الدرس الثالث\": (23, 29),\n",
" \"الدرس الرابع\": (30, 31),\n",
" \"الدرس الخامس\": (32, 34),\n",
" },\n",
" \"المفهوم 2.1: كيف تعمل الحواس؟\": {\n",
" \"الدرس الأول\": (35, 38),\n",
" \"الدرس الثاني\": (39, 41),\n",
" \"الدرس الثالث\": (42, 44),\n",
" \"الدرس الرابع\": (45, 48),\n",
" },\n",
" \"المفهوم 3.1: الضوء وحاسة البصر\": {\n",
" \"الدرس الأول\": (49, 54),\n",
" \"الدرس الثاني\": (55, 58),\n",
" \"الدرس الثالث\": (59, 60),\n",
" \"الدرس الرابع\": (61, 62),\n",
" }\n",
" },\n",
" \"مشروع الوحدة\": (63, 64),\n",
" \"المشروع بيني التخصصات\": (65, 72),\n",
" \"قيم تعلمك\": (73, 74)\n",
" },\n",
" \"الوحدة الثانية: الحركة والطاقة\": {\n",
" \"مقدمة\": (75, 78),\n",
" \"المفاهيم\": {\n",
" \"المفهوم 1.2: الحركة والتوقف\": {\n",
" \"الدرس الأول\": (79, 82),\n",
" \"الدرس الثاني\": (83, 85),\n",
" \"الدرس الثالث\": (86, 89),\n",
" \"الدرس الرابع\": (90, 92),\n",
" },\n",
" \"المفهوم 2.2: الطاقة والحركة\": {\n",
" \"الدرس الأول\": (93, 96),\n",
" \"الدرس الثاني\": (97, 98),\n",
" \"الدرس الثالث\": (99, 102),\n",
" \"الدرس الرابع\": (103, 104),\n",
" },\n",
" \"المفهوم 3.2: الطاقة والتصادم\": {\n",
" \"الدرس الأول\": (105, 109),\n",
" \"الدرس الثاني\": (110, 113),\n",
" \"الدرس الثالث\": (114, 115),\n",
" \"الدرس الرابع\": (116, 120),\n",
" }\n",
"\n",
" },\n",
" \"مشروع الوحدة\": (121, 122),\n",
" \"قيم تعلمك\": (123, 124),\n",
" \"السلامة في فصول العلوم\": (125, 126)\n",
" }\n",
"}\n",
"\n",
"df = pd.read_csv(\"/content/output_text_ar_cleaned_prime4.csv\")\n",
"df_raw = pd.read_csv(\"/content/output_text_prime4_ar.csv\")\n",
"df[\"page\"] = df_raw[\"page\"]\n",
"\n",
"\n",
"rows = []\n",
"\n",
"\n",
"df = pd.read_csv(\"/content/output_text_ar_cleaned_prime4.csv\")\n",
"df_raw = pd.read_csv(\"/content/output_text_prime4_ar.csv\")\n",
"df[\"page\"] = df_raw[\"page\"]\n",
"\n",
"rows = []\n",
"\n",
"for unit, udata in structure.items():\n",
" if \"مقدمة\" in udata:\n",
" start, end = udata[\"مقدمة\"]\n",
" text = \" \".join(df[(df[\"page\"] >= start) & (df[\"page\"] <= end)][\"text\"])\n",
" rows.append({\n",
" \"الوحدة\": unit,\n",
" \"المفهوم\": \"مقدمة\",\n",
" \"الدرس\": \"مقدمة\",\n",
" \"من صفحة\": start,\n",
" \"إلى صفحة\": end,\n",
" \"النص\": text\n",
" })\n",
"\n",
" if \"المفاهيم\" in udata:\n",
" for concept, lessons in udata[\"المفاهيم\"].items():\n",
" for lesson, (ls, le) in lessons.items():\n",
" lesson_text = \" \".join(df[(df[\"page\"] >= ls) & (df[\"page\"] <= le)][\"text\"])\n",
" rows.append({\n",
" \"الوحدة\": unit,\n",
" \"المفهوم\": concept,\n",
" \"الدرس\": lesson,\n",
" \"من صفحة\": ls,\n",
" \"إلى صفحة\": le,\n",
" \"النص\": lesson_text\n",
" })\n",
"\n",
" for section in [\"مشروع الوحدة\", \"المشروع بيني التخصصات\", \"قيم تعلمك\", \"السلامة في فصول العلوم\"]:\n",
" if section in udata:\n",
" start, end = udata[section]\n",
" text = \" \".join(df[(df[\"page\"] >= start) & (df[\"page\"] <= end)][\"text\"])\n",
" rows.append({\n",
" \"الوحدة\": unit,\n",
" \"المفهوم\": section,\n",
" \"الدرس\": section,\n",
" \"من صفحة\": start,\n",
" \"إلى صفحة\": end,\n",
" \"النص\": text\n",
" })\n",
"\n",
"\n",
"df_out = pd.DataFrame(rows)\n",
"df_out.to_csv(\"output_units_lessons_prime4.csv\", index=False, encoding=\"utf-8-sig\")\n",
"\n",
"print(\"✅ تم تقسيم النصوص للوحدات والدروس وحفظها في output_units_lessons_prime4.csv\")\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "7tm5OFaHFL_c",
"outputId": "f788fe87-079d-47b0-f262-f0a94d4fc381"
},
"execution_count": 12,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"✅ تم تقسيم النصوص للوحدات والدروس وحفظها في output_units_lessons_prime4.csv\n"
]
}
]
}
]
}
\ No newline at end of file
output_stats_prime4_ar.csv:
metric,value
total_pages,126.0
total_words,17123.0
avg_words_per_page,135.9
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"source": [
"prime6"
],
"metadata": {
"id": "ksTOxAhBBfel"
}
},
{
"cell_type": "code",
"source": [
"!apt-get install tesseract-ocr\n",
"!pip install pdfplumber pdf2image pytesseract\n",
"!apt-get install -y poppler-utils\n",
"!sudo apt install tesseract-ocr -y\n",
"!sudo apt install tesseract-ocr-ara -y\n",
"!sudo apt install libtesseract-dev -y\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "mXFZpTwLOjTZ",
"outputId": "c5f27dcf-36ab-4d1b-fde5-18a645d0c70e"
},
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Reading package lists... Done\n",
"Building dependency tree... Done\n",
"Reading state information... Done\n",
"tesseract-ocr is already the newest version (4.1.1-2.1build1).\n",
"0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.\n",
"Collecting pdfplumber\n",
" Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m42.8/42.8 kB\u001b[0m \u001b[31m1.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting pdf2image\n",
" Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)\n",
"Collecting pytesseract\n",
" Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)\n",
"Collecting pdfminer.six==20250506 (from pdfplumber)\n",
" Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)\n",
"Requirement already satisfied: Pillow>=9.1 in /usr/local/lib/python3.12/dist-packages (from pdfplumber) (11.3.0)\n",
"Collecting pypdfium2>=4.18.0 (from pdfplumber)\n",
" Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m48.5/48.5 kB\u001b[0m \u001b[31m2.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: charset-normalizer>=2.0.0 in /usr/local/lib/python3.12/dist-packages (from pdfminer.six==20250506->pdfplumber) (3.4.3)\n",
"Requirement already satisfied: cryptography>=36.0.0 in /usr/local/lib/python3.12/dist-packages (from pdfminer.six==20250506->pdfplumber) (43.0.3)\n",
"Requirement already satisfied: packaging>=21.3 in /usr/local/lib/python3.12/dist-packages (from pytesseract) (25.0)\n",
"Requirement already satisfied: cffi>=1.12 in /usr/local/lib/python3.12/dist-packages (from cryptography>=36.0.0->pdfminer.six==20250506->pdfplumber) (1.17.1)\n",
"Requirement already satisfied: pycparser in /usr/local/lib/python3.12/dist-packages (from cffi>=1.12->cryptography>=36.0.0->pdfminer.six==20250506->pdfplumber) (2.22)\n",
"Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.0/60.0 kB\u001b[0m \u001b[31m3.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m38.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading pdf2image-1.17.0-py3-none-any.whl (11 kB)\n",
"Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)\n",
"Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.8 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.8/2.8 MB\u001b[0m \u001b[31m77.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hInstalling collected packages: pytesseract, pypdfium2, pdf2image, pdfminer.six, pdfplumber\n",
"Successfully installed pdf2image-1.17.0 pdfminer.six-20250506 pdfplumber-0.11.7 pypdfium2-4.30.0 pytesseract-0.3.13\n",
"Reading package lists... Done\n",
"Building dependency tree... Done\n",
"Reading state information... Done\n",
"The following NEW packages will be installed:\n",
" poppler-utils\n",
"0 upgraded, 1 newly installed, 0 to remove and 35 not upgraded.\n",
"Need to get 186 kB of archives.\n",
"After this operation, 697 kB of additional disk space will be used.\n",
"Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.10 [186 kB]\n",
"Fetched 186 kB in 1s (137 kB/s)\n",
"Selecting previously unselected package poppler-utils.\n",
"(Reading database ... 126374 files and directories currently installed.)\n",
"Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.10_amd64.deb ...\n",
"Unpacking poppler-utils (22.02.0-2ubuntu0.10) ...\n",
"Setting up poppler-utils (22.02.0-2ubuntu0.10) ...\n",
"Processing triggers for man-db (2.10.2-1) ...\n",
"Reading package lists... Done\n",
"Building dependency tree... Done\n",
"Reading state information... Done\n",
"tesseract-ocr is already the newest version (4.1.1-2.1build1).\n",
"0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.\n",
"Reading package lists... Done\n",
"Building dependency tree... Done\n",
"Reading state information... Done\n",
"The following NEW packages will be installed:\n",
" tesseract-ocr-ara\n",
"0 upgraded, 1 newly installed, 0 to remove and 35 not upgraded.\n",
"Need to get 645 kB of archives.\n",
"After this operation, 1,447 kB of additional disk space will be used.\n",
"Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-ara all 1:4.00~git30-7274cfa-1.1 [645 kB]\n",
"Fetched 645 kB in 2s (324 kB/s)\n",
"debconf: unable to initialize frontend: Dialog\n",
"debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 1.)\n",
"debconf: falling back to frontend: Readline\n",
"debconf: unable to initialize frontend: Readline\n",
"debconf: (This frontend requires a controlling tty.)\n",
"debconf: falling back to frontend: Teletype\n",
"dpkg-preconfigure: unable to re-open stdin: \n",
"Selecting previously unselected package tesseract-ocr-ara.\n",
"(Reading database ... 126404 files and directories currently installed.)\n",
"Preparing to unpack .../tesseract-ocr-ara_1%3a4.00~git30-7274cfa-1.1_all.deb ...\n",
"Unpacking tesseract-ocr-ara (1:4.00~git30-7274cfa-1.1) ...\n",
"Setting up tesseract-ocr-ara (1:4.00~git30-7274cfa-1.1) ...\n",
"Reading package lists... Done\n",
"Building dependency tree... Done\n",
"Reading state information... Done\n",
"The following additional packages will be installed:\n",
" libarchive-dev libleptonica-dev\n",
"The following NEW packages will be installed:\n",
" libarchive-dev libleptonica-dev libtesseract-dev\n",
"0 upgraded, 3 newly installed, 0 to remove and 35 not upgraded.\n",
"Need to get 3,743 kB of archives.\n",
"After this operation, 16.0 MB of additional disk space will be used.\n",
"Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 libarchive-dev amd64 3.6.0-1ubuntu1.5 [581 kB]\n",
"Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libleptonica-dev amd64 1.82.0-3build1 [1,562 kB]\n",
"Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libtesseract-dev amd64 4.1.1-2.1build1 [1,600 kB]\n",
"Fetched 3,743 kB in 3s (1,166 kB/s)\n",
"debconf: unable to initialize frontend: Dialog\n",
"debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 3.)\n",
"debconf: falling back to frontend: Readline\n",
"debconf: unable to initialize frontend: Readline\n",
"debconf: (This frontend requires a controlling tty.)\n",
"debconf: falling back to frontend: Teletype\n",
"dpkg-preconfigure: unable to re-open stdin: \n",
"Selecting previously unselected package libarchive-dev:amd64.\n",
"(Reading database ... 126408 files and directories currently installed.)\n",
"Preparing to unpack .../libarchive-dev_3.6.0-1ubuntu1.5_amd64.deb ...\n",
"Unpacking libarchive-dev:amd64 (3.6.0-1ubuntu1.5) ...\n",
"Selecting previously unselected package libleptonica-dev.\n",
"Preparing to unpack .../libleptonica-dev_1.82.0-3build1_amd64.deb ...\n",
"Unpacking libleptonica-dev (1.82.0-3build1) ...\n",
"Selecting previously unselected package libtesseract-dev:amd64.\n",
"Preparing to unpack .../libtesseract-dev_4.1.1-2.1build1_amd64.deb ...\n",
"Unpacking libtesseract-dev:amd64 (4.1.1-2.1build1) ...\n",
"Setting up libleptonica-dev (1.82.0-3build1) ...\n",
"Setting up libarchive-dev:amd64 (3.6.0-1ubuntu1.5) ...\n",
"Setting up libtesseract-dev:amd64 (4.1.1-2.1build1) ...\n",
"Processing triggers for man-db (2.10.2-1) ...\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import os\n",
"import cv2\n",
"import pytesseract\n",
"import pandas as pd\n",
"from pdf2image import convert_from_path\n",
"from tqdm import tqdm\n",
"import re\n"
],
"metadata": {
"id": "OlXc31-OPKrF"
},
"execution_count": 2,
"outputs": []
},
{
"cell_type": "code",
"source": [
"pdf_path = \"/content/Science_AR_prim6_TR1.pdf\"\n",
"output_csv = \"output_text_prime6_ar.csv\"\n",
"output_stats = \"output_stats_prime6_ar.csv\"\n",
"output_txt = \"output_text_prime6_ar.txt\"\n",
"images_dir = \"pdf_images_prime6_ar\"\n",
"batch_size = 50\n",
"low_word_threshold = 10\n",
"\n",
"os.makedirs(images_dir, exist_ok=True)"
],
"metadata": {
"id": "104x_y2OBp7N"
},
"execution_count": 3,
"outputs": []
},
{
"cell_type": "code",
"source": [
"print(\"📄 Converting PDF to images...\")\n",
"pages = convert_from_path(pdf_path, dpi=300)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "50fd1372-0159-4de7-a7fc-e4b2d73a1965",
"id": "sCdYDu_EB1aV"
},
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"📄 Converting PDF to images...\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"image_paths = []\n",
"for i, page in enumerate(pages, start=1):\n",
" img_path = os.path.join(images_dir, f\"page_{i}.png\")\n",
" page.save(img_path, \"PNG\")\n",
" image_paths.append(img_path)\n"
],
"metadata": {
"id": "MpPApbwEB5FG"
},
"execution_count": 5,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def clean_text(text):\n",
" text = str(text)\n",
"\n",
" text = re.sub(r\"[^ \\u0600-\\u06FFa-zA-Z0-9\\.\\,\\?\\!\\:\\;\\-\\(\\)\\/%]\", \" \", text)\n",
"\n",
" text = re.sub(r\"[\\.]{2,}\", \".\", text)\n",
" text = re.sub(r\"[\\-]{2,}\", \"-\", text)\n",
" text = re.sub(r\"[_]{2,}\", \" \", text)\n",
"\n",
" text = re.sub(r\"\\s+\", \" \", text)\n",
"\n",
" words = text.strip().split()\n",
" if len(words) <= 2:\n",
" return \"\"\n",
"\n",
" return text.strip()\n"
],
"metadata": {
"id": "UEerCWkVB8tF"
},
"execution_count": 6,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def deep_clean_text(text):\n",
" text = re.sub(r\"\\b\\d{4,}\\b\", \" \", text)\n",
"\n",
" text = re.sub(r\"(.)\\1{2,}\", r\"\\1\\1\", text)\n",
"\n",
" lines = text.split()\n",
" lines = [line for line in lines if not re.fullmatch(r\"[\\d\\W]+\", line)]\n",
"\n",
" return \" \".join(lines).strip()\n"
],
"metadata": {
"id": "PjArDwlKCAVu"
},
"execution_count": 7,
"outputs": []
},
{
"cell_type": "code",
"source": [
"print(\"🔎 Running OCR on pages... (Batch mode)\")\n",
"results = []\n",
"\n",
"for batch_start in range(0, len(image_paths), batch_size):\n",
" batch_paths = image_paths[batch_start:batch_start+batch_size]\n",
" for i, img_path in enumerate(tqdm(batch_paths, desc=f\"Batch {batch_start//batch_size+1}\")):\n",
" page_number = batch_start + i + 1\n",
"\n",
" img = cv2.imread(img_path)\n",
"\n",
" gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)\n",
"\n",
" _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)\n",
"\n",
" text = pytesseract.image_to_string(thresh, lang=\"ara\")\n",
"\n",
" cleaned_text = clean_text(text)\n",
" cleaned_text = deep_clean_text(cleaned_text)\n",
"\n",
"\n",
" if cleaned_text:\n",
" results.append({\"page\": page_number, \"text\": cleaned_text})"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "8ef4f4a3-e725-4c2b-8a05-4bde004fc152",
"id": "uHC2MD9XCHq1"
},
"execution_count": 8,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"🔎 Running OCR on pages... (Batch mode)\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"Batch 1: 100%|██████████| 50/50 [04:11<00:00, 5.03s/it]\n",
"Batch 2: 100%|██████████| 50/50 [04:25<00:00, 5.32s/it]\n",
"Batch 3: 100%|██████████| 37/37 [03:40<00:00, 5.96s/it]\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"\n",
"print(\"💾 Saving results...\")\n",
"\n",
"df = pd.DataFrame(results)\n",
"df.to_csv(output_csv, index=False, encoding=\"utf-8-sig\")\n",
"\n",
"with open(output_txt, \"w\", encoding=\"utf-8\") as f:\n",
" for _, row in df.iterrows():\n",
" f.write(f\"\\n--- الصفحة {row['page']} ---\\n{row['text']}\\n\")"
],
"metadata": {
"id": "x1rPk23yCNzN",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "943af302-80c8-4ca7-9a48-7ad802a38cd4"
},
"execution_count": 9,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"💾 Saving results...\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"df[\"word_count\"] = df[\"text\"].apply(lambda x: len(x.split()))\n",
"stats = {\n",
" \"total_pages\": len(df),\n",
" \"total_words\": df[\"word_count\"].sum(),\n",
" \"avg_words_per_page\": round(df[\"word_count\"].mean(), 2)\n",
"}\n",
"stats_df = pd.DataFrame(list(stats.items()), columns=[\"metric\", \"value\"])\n",
"stats_df.to_csv(output_stats, index=False, encoding=\"utf-8-sig\")\n",
"\n",
"print(\"✅ انتهى OCR للعربي!\")\n",
"print(f\"📂 ملف CSV: {output_csv}\")\n",
"print(f\"📂 ملف TXT: {output_txt}\")\n",
"print(f\"📊 ملف الإحصائيات: {output_stats}\")"
],
"metadata": {
"id": "PZ0AB3cGCQ2G",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "a037f0ad-5a1a-4a42-9233-6c2a42673178"
},
"execution_count": 10,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"✅ انتهى OCR للعربي!\n",
"📂 ملف CSV: output_text_prime6_ar.csv\n",
"📂 ملف TXT: output_text_prime6_ar.txt\n",
"📊 ملف الإحصائيات: output_stats_prime6_ar.csv\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import pandas as pd\n",
"import re\n",
"\n",
"def super_clean_text(text):\n",
" text = str(text)\n",
"\n",
" text = re.sub(r\"[^ \\u0600-\\u06FFa-zA-Z0-9\\.\\,\\?\\!\\:\\;\\-\\(\\)\\/%]\", \" \", text)\n",
"\n",
" text = re.sub(r\"\\b\\d{3,}\\b\", \" \", text)\n",
"\n",
" text = re.sub(r\"[\\.]{2,}\", \".\", text)\n",
" text = re.sub(r\"[\\-]{2,}\", \"-\", text)\n",
" text = re.sub(r\"[_]{2,}\", \" \", text)\n",
" text = re.sub(r\"(.)\\1{2,}\", r\"\\1\\1\", text)\n",
"\n",
" words = text.split()\n",
" words = [w for w in words if not re.fullmatch(r\"[\\d\\W]+\", w)]\n",
"\n",
" text = \" \".join(words)\n",
" text = re.sub(r\"\\s+\", \" \", text).strip()\n",
"\n",
" return text\n",
"\n",
"df = pd.read_csv(\"/content/output_text_prime6_ar.csv\")\n",
"df_cleaned = pd.DataFrame()\n",
"df_cleaned[\"text\"] = df[\"text\"].apply(super_clean_text)\n",
"\n",
"df_cleaned.to_csv(\"output_text_ar_cleaned_prime6.csv\", index=False, encoding=\"utf-8-sig\")\n",
"\n",
"print(\"✅ النصوص اتنضفت وحُفظت في output_text_ar_cleaned.csv (عمود واحد بس)\")\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "c4e0009f-ee32-4bde-de03-1c2d3ee1aa4e",
"id": "ZQHx0Z6VCTkP"
},
"execution_count": 11,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"✅ النصوص اتنضفت وحُفظت في output_text_ar_cleaned.csv (عمود واحد بس)\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import pandas as pd\n",
"\n",
"structure = {\n",
" \"الوحدة الأولى: ما النظام؟\": {\n",
" \"مقدمة\": (8, 11),\n",
" \"المفاهيم\": {\n",
" \"المفهوم 1.1: الخليه كنظام\": {\n",
" \"الدرس الأول\": (12, 16),\n",
" \"الدرس الثاني\": (17, 19),\n",
" \"الدرس الثالث\": (20, 22),\n",
" \"الدرس الرابع\": (23, 26),\n",
" \"الدرس الخامس\": (27, 29),\n",
" \"الدرس السادس\": (30, 33)\n",
"\n",
" },\n",
" \"المفهوم 2.1: الجسم كنظام\": {\n",
" \"الدرس الأول\": (34, 37),\n",
" \"الدرس الثاني\": (38, 40),\n",
" \"الدرس الثالث\": (41, 44),\n",
" \"الدرس الرابع\": (45, 47),\n",
" \"الدرس الخامس\": (48, 50),\n",
" \"الدرس السادس\": (51, 53)\n",
"\n",
" },\n",
" \"المفهوم 3.1: الطاقة كنظام\": {\n",
" \"الدرس الأول\": (54, 56),\n",
" \"الدرس الثاني\": (57, 59),\n",
" \"الدرس الثالث\": (60, 64),\n",
" \"الدرس الرابع\": (65, 66),\n",
" \"الدرس الخامس\": (67, 70),\n",
" \"الدرس السادس\": (71, 72)\n",
"\n",
" },\n",
" },\n",
" \"مشروع الوحدة\": (73, 75),\n",
" \"تقييم الوحدة\": (76, 79)\n",
" },\n",
"\n",
" \"الوحدة الثانية: الحصول على الطاقة\": {\n",
" \"مقدمة\": (80, 83),\n",
" \"المفاهيم\": {\n",
" \"المفهوم 1.2: الطاقة الحرارية وحالات المادة\": {\n",
" \"الدرس الأول\": (84, 88),\n",
" \"الدرس الثاني\": (89, 91),\n",
" \"الدرس الثالث\": (92, 93),\n",
" \"الدرس الرابع\": (94, 95),\n",
" \"الدرس الخامس\": (96, 98),\n",
" \"الدرس السادس\": (99, 101),\n",
"\n",
" },\n",
" \"المفهوم 2.2: انتقال الحرارة\": {\n",
" \"الدرس الأول\": (102, 105),\n",
" \"الدرس الثاني\": (106, 109),\n",
" \"الدرس الثالث\": (110, 111),\n",
" \"الدرس الرابع\": (112, 113),\n",
" \"الدرس الخامس\": (114, 116),\n",
" \"الدرس السادس\": (117, 119),\n",
"\n",
" }\n",
" },\n",
" \"مشروع الوحدة\": (120, 121),\n",
" \"المشروع بيني التخصصات\": (122, 130),\n",
" \"تقييم الوحدة\": (131, 134)\n",
" }\n",
"}\n",
"\n",
"df = pd.read_csv(\"/content/output_text_ar_cleaned_prime6.csv\")\n",
"df_raw = pd.read_csv(\"/content/output_text_prime6_ar.csv\")\n",
"df[\"page\"] = df_raw[\"page\"]\n",
"\n",
"rows = []\n",
"\n",
"rows = []\n",
"\n",
"for unit, udata in structure.items():\n",
" if \"مقدمة\" in udata:\n",
" start, end = udata[\"مقدمة\"]\n",
" text = \" \".join(df[(df[\"page\"] >= start) & (df[\"page\"] <= end)][\"text\"])\n",
" rows.append({\n",
" \"الوحدة\": unit,\n",
" \"المفهوم\": \"مقدمة\",\n",
" \"الدرس\": \"مقدمة\",\n",
" \"من صفحة\": start,\n",
" \"إلى صفحة\": end,\n",
" \"النص\": text\n",
" })\n",
"\n",
" if \"المفاهيم\" in udata:\n",
" for concept, lessons in udata[\"المفاهيم\"].items():\n",
" for lesson, (ls, le) in lessons.items():\n",
" lesson_text = \" \".join(df[(df[\"page\"] >= ls) & (df[\"page\"] <= le)][\"text\"])\n",
" rows.append({\n",
" \"الوحدة\": unit,\n",
" \"المفهوم\": concept,\n",
" \"الدرس\": lesson,\n",
" \"من صفحة\": ls,\n",
" \"إلى صفحة\": le,\n",
" \"النص\": lesson_text\n",
" })\n",
"\n",
" for section in [\"مشروع الوحدة\", \"المشروع بيني التخصصات\", \"قيم تعلمك\", \"السلامة في فصول العلوم\"]:\n",
" if section in udata:\n",
" start, end = udata[section]\n",
" text = \" \".join(df[(df[\"page\"] >= start) & (df[\"page\"] <= end)][\"text\"])\n",
" rows.append({\n",
" \"الوحدة\": unit,\n",
" \"المفهوم\": section,\n",
" \"الدرس\": section,\n",
" \"من صفحة\": start,\n",
" \"إلى صفحة\": end,\n",
" \"النص\": text\n",
" })\n",
"\n",
"\n",
"\n",
"df_out = pd.DataFrame(rows)\n",
"df_out.to_csv(\"output_units_lessons_prime6.csv\", index=False, encoding=\"utf-8-sig\")\n",
"\n",
"print(\"✅ تم تقسيم النصوص للوحدات والدروس (الصف السادس) وحفظها في output_units_lessons_prime6.csv\")\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "WWIa8R-aQ_Kd",
"outputId": "5057cd82-79fc-4af6-fe18-5dc6358cc0e5"
},
"execution_count": 13,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"✅ تم تقسيم النصوص للوحدات والدروس (الصف السادس) وحفظها في output_units_lessons_prime6.csv\n"
]
}
]
}
]
}
\ No newline at end of file
output_stats_prime6_ar.csv:
metric,value
total_pages,134.0
total_words,21334.0
avg_words_per_page,159.21
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"gpuType": "T4"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "code",
"source": [
"!apt-get install tesseract-ocr\n",
"!pip install pdfplumber pdf2image pytesseract\n",
"!apt-get install -y poppler-utils\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "mXFZpTwLOjTZ",
"outputId": "181e3c60-149b-4582-e882-17f6fbf16113"
},
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Reading package lists... Done\n",
"Building dependency tree... Done\n",
"Reading state information... Done\n",
"tesseract-ocr is already the newest version (4.1.1-2.1build1).\n",
"0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.\n",
"Collecting pdfplumber\n",
" Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m42.8/42.8 kB\u001b[0m \u001b[31m2.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting pdf2image\n",
" Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)\n",
"Collecting pytesseract\n",
" Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)\n",
"Collecting pdfminer.six==20250506 (from pdfplumber)\n",
" Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)\n",
"Requirement already satisfied: Pillow>=9.1 in /usr/local/lib/python3.12/dist-packages (from pdfplumber) (11.3.0)\n",
"Collecting pypdfium2>=4.18.0 (from pdfplumber)\n",
" Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m48.5/48.5 kB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: charset-normalizer>=2.0.0 in /usr/local/lib/python3.12/dist-packages (from pdfminer.six==20250506->pdfplumber) (3.4.3)\n",
"Requirement already satisfied: cryptography>=36.0.0 in /usr/local/lib/python3.12/dist-packages (from pdfminer.six==20250506->pdfplumber) (43.0.3)\n",
"Requirement already satisfied: packaging>=21.3 in /usr/local/lib/python3.12/dist-packages (from pytesseract) (25.0)\n",
"Requirement already satisfied: cffi>=1.12 in /usr/local/lib/python3.12/dist-packages (from cryptography>=36.0.0->pdfminer.six==20250506->pdfplumber) (1.17.1)\n",
"Requirement already satisfied: pycparser in /usr/local/lib/python3.12/dist-packages (from cffi>=1.12->cryptography>=36.0.0->pdfminer.six==20250506->pdfplumber) (2.22)\n",
"Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.0/60.0 kB\u001b[0m \u001b[31m5.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m79.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading pdf2image-1.17.0-py3-none-any.whl (11 kB)\n",
"Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)\n",
"Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.8 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.8/2.8 MB\u001b[0m \u001b[31m101.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hInstalling collected packages: pytesseract, pypdfium2, pdf2image, pdfminer.six, pdfplumber\n",
"Successfully installed pdf2image-1.17.0 pdfminer.six-20250506 pdfplumber-0.11.7 pypdfium2-4.30.0 pytesseract-0.3.13\n",
"Reading package lists... Done\n",
"Building dependency tree... Done\n",
"Reading state information... Done\n",
"The following NEW packages will be installed:\n",
" poppler-utils\n",
"0 upgraded, 1 newly installed, 0 to remove and 35 not upgraded.\n",
"Need to get 186 kB of archives.\n",
"After this operation, 697 kB of additional disk space will be used.\n",
"Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.10 [186 kB]\n",
"Fetched 186 kB in 1s (196 kB/s)\n",
"Selecting previously unselected package poppler-utils.\n",
"(Reading database ... 126374 files and directories currently installed.)\n",
"Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.10_amd64.deb ...\n",
"Unpacking poppler-utils (22.02.0-2ubuntu0.10) ...\n",
"Setting up poppler-utils (22.02.0-2ubuntu0.10) ...\n",
"Processing triggers for man-db (2.10.2-1) ...\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import os\n",
"import cv2\n",
"import pytesseract\n",
"import pandas as pd\n",
"from pdf2image import convert_from_path\n",
"from tqdm import tqdm\n",
"import re\n"
],
"metadata": {
"id": "OlXc31-OPKrF"
},
"execution_count": 2,
"outputs": []
},
{
"cell_type": "code",
"source": [
"pdf_path = \"/content/Science_EN_prim5_TR1 (1).pdf\"\n",
"output_csv = \"output_text.csv\"\n",
"output_stats = \"output_stats.csv\"\n",
"output_txt = \"output_text.txt\"\n",
"images_dir = \"pdf_images\"\n",
"batch_size = 50\n",
"low_word_threshold = 10\n",
"os.makedirs(images_dir, exist_ok=True)"
],
"metadata": {
"id": "u1W69-mMyhMG"
},
"execution_count": 3,
"outputs": []
},
{
"cell_type": "code",
"source": [
"print(\"📄 Converting PDF to images...\")\n",
"pages = convert_from_path(pdf_path, dpi=300)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "a8e9b878-0c10-4a6b-ea99-92fc7caa1d99",
"id": "C_LneoyryryP"
},
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"📄 Converting PDF to images...\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"image_paths = []\n",
"for i, page in enumerate(pages, start=1):\n",
" img_path = os.path.join(images_dir, f\"page_{i}.png\")\n",
" page.save(img_path, \"PNG\")\n",
" image_paths.append(img_path)\n"
],
"metadata": {
"id": "wgX68ISuyvIn"
},
"execution_count": 5,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def clean_text(text):\n",
" text = str(text)\n",
" text = re.sub(r\"\\s+\", \" \", text)\n",
" text = re.sub(r\"[^\\x00-\\x7F\\u0600-\\u06FF]+\", \" \", text)\n",
" return text.strip()"
],
"metadata": {
"id": "IRBX4I2bhZng"
},
"execution_count": 6,
"outputs": []
},
{
"cell_type": "code",
"source": [
"print(\"🔎 Running OCR on pages... (Batch mode)\")\n",
"results = []\n",
"\n",
"for batch_start in range(0, len(image_paths), batch_size):\n",
" batch_paths = image_paths[batch_start:batch_start+batch_size]\n",
" for i, img_path in enumerate(tqdm(batch_paths, desc=f\"Batch {batch_start//batch_size+1}\")):\n",
" page_number = batch_start + i + 1\n",
"\n",
" img = cv2.imread(img_path)\n",
"\n",
" gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)\n",
"\n",
" _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)\n",
"\n",
" text = pytesseract.image_to_string(thresh, lang=\"eng\")\n",
"\n",
" cleaned_text = clean_text(text)\n",
"\n",
" results.append({\"page\": page_number, \"text\": cleaned_text})\n"
],
"metadata": {
"id": "ZAwUNKFkyz7m",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "80adc825-c300-400f-9a8b-c52dc1f5c21b"
},
"execution_count": 7,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"🔎 Running OCR on pages... (Batch mode)\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"Batch 1: 100%|██████████| 50/50 [02:54<00:00, 3.50s/it]\n",
"Batch 2: 100%|██████████| 50/50 [02:51<00:00, 3.44s/it]\n",
"Batch 3: 100%|██████████| 29/29 [01:52<00:00, 3.89s/it]\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"print(\"💾 Saving results...\")\n",
"\n",
"df = pd.DataFrame(results)\n",
"df.to_csv(output_csv, index=False, encoding=\"utf-8-sig\")\n",
"\n",
"with open(output_txt, \"w\", encoding=\"utf-8\") as f:\n",
" for _, row in df.iterrows():\n",
" f.write(f\"\\n--- Page {row['page']} ---\\n{row['text']}\\n\")\n"
],
"metadata": {
"id": "wA9oIFuMy3gA",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "397832ee-5722-43aa-d66d-b85e5f1768c2"
},
"execution_count": 8,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"💾 Saving results...\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"df[\"word_count\"] = df[\"text\"].apply(lambda x: len(x.split()))\n",
"stats = {\n",
" \"total_pages\": len(df),\n",
" \"total_words\": df[\"word_count\"].sum(),\n",
" \"avg_words_per_page\": df[\"word_count\"].mean()\n",
"}\n",
"stats_df = pd.DataFrame(list(stats.items()), columns=[\"metric\", \"value\"])\n",
"stats_df.to_csv(output_stats, index=False, encoding=\"utf-8-sig\")\n",
"\n",
"print(\"✅ Done!\")\n",
"print(f\"📂 Text file saved: {output_csv}\")\n",
"print(f\"📂 Clean text file saved: {output_txt}\")\n",
"print(f\"📊 Stats file saved: {output_stats}\")"
],
"metadata": {
"id": "CPYgXgLMy6Xh",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "785517e6-145c-4ead-a012-353340643741"
},
"execution_count": 9,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"✅ Done!\n",
"📂 Text file saved: output_text.csv\n",
"📂 Clean text file saved: output_text.txt\n",
"📊 Stats file saved: output_stats.csv\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import pandas as pd\n",
"import re\n",
"\n",
"def super_clean_text_en(text):\n",
" text = str(text)\n",
"\n",
" text = re.sub(r\"[^a-zA-Z0-9\\s\\.\\,\\?\\!\\:\\;\\-\\(\\)\\/%]\", \" \", text)\n",
"\n",
" text = re.sub(r\"\\b(\\w+)( \\1){2,}\\b\", r\"\\1\", text)\n",
"\n",
" text = re.sub(r\"(.)\\1{2,}\", r\"\\1\\1\", text)\n",
"\n",
" text = re.sub(r\"(\\d)\\1{2,}\", r\"\\1\\1\\1\", text)\n",
"\n",
" words = text.split()\n",
" cleaned_words = []\n",
" for w in words:\n",
" if len(w) >= 15:\n",
" continue\n",
" cleaned_words.append(w)\n",
" text = \" \".join(cleaned_words)\n",
"\n",
" text = re.sub(r\"\\s+\", \" \", text)\n",
"\n",
" lines = text.split(\".\")\n",
" seen = set()\n",
" cleaned_lines = []\n",
" for line in lines:\n",
" l = line.strip()\n",
" if l and l not in seen:\n",
" cleaned_lines.append(l)\n",
" seen.add(l)\n",
" text = \". \".join(cleaned_lines)\n",
"\n",
" return text.strip()\n",
"\n",
"df = pd.read_csv(\"/content/output_text_prime5_en.csv\")\n",
"\n",
"df[\"text\"] = df[\"text\"].apply(super_clean_text_en)\n",
"\n",
"\n",
"df.to_csv(\"output_text_en_cleaned.csv\", index=False, encoding=\"utf-8-sig\")\n",
"\n",
"print(\"✅ النصوص اتنضفت وحُفظت في output_text_en_cleaned_prime5.csv\")\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "9f2f030a-f8ab-448f-fa95-b4b18a017fa1",
"id": "kko7q5b7VHP5"
},
"execution_count": 11,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"✅ النصوص اتنضفت وحُفظت في output_text_en_cleaned_prime5.csv\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import pandas as pd\n",
"\n",
"# ---------------- structure ----------------\n",
"structure = {\n",
" \"Unit 1: Interactions of Organisms\": {\n",
" \"Get Started\": (10, 13),\n",
" \"Concepts\": {\n",
" \"Concept 1.1 Plant Needs\": {\n",
" \"Lesson 1\": (14, 17),\n",
" \"Lesson 2\": (18, 21),\n",
" \"Lesson 3\": (22, 25),\n",
" \"Lesson 4\": (26, 29),\n",
" \"Lesson 5\": (30, 33),\n",
" },\n",
" \"Concept 1.2 Energy Flow in Ecosystems\": {\n",
" \"Lesson 1\": (34, 37),\n",
" \"Lesson 2\": (38, 40),\n",
" \"Lesson 3\": (41, 43),\n",
" \"Lesson 4\": (44, 45),\n",
" },\n",
" \"Concept 1.3 Changes in Food Webs\": {\n",
" \"Lesson 1\": (46, 50),\n",
" \"Lesson 2\": (51, 53),\n",
" \"Lesson 3\": (54, 55),\n",
" \"Lesson 4\": (56, 58),\n",
" }\n",
" },\n",
" \"Unit Project\": (59, 59),\n",
" \"Interdisciplinary project\": (60, 67),\n",
" \"Assess your learning\": (68, 69)\n",
" },\n",
" \"Unit 2: Particles in Motion\": {\n",
" \"Get Started\": (70, 73),\n",
" \"Concepts\": {\n",
" \"Concept 2.1 Matter in the World around Us\": {\n",
" \"Lesson 1\": (74, 76),\n",
" \"Lesson 2\": (77, 80),\n",
" \"Lesson 3\": (81, 83),\n",
" \"Lesson 4\": (84, 86),\n",
" \"Lesson 5\": (87, 89),\n",
" },\n",
" \"Concept 2.2 Describing and Measuring Matter\": {\n",
" \"Lesson 1\": (90, 93),\n",
" \"Lesson 2\": (94, 95),\n",
" \"Lesson 3\": (96, 100),\n",
" \"Lesson 4\": (101, 103),\n",
" },\n",
" \"Concept 2.3 Comparing Changes in Matter\": {\n",
" \"Lesson 1\": (104, 107),\n",
" \"Lesson 2\": (108, 111),\n",
" \"Lesson 3\": (112, 116),\n",
" \"Lesson 4\": (117, 120),\n",
" \"Lesson 5\": (121, 123),\n",
" }\n",
" },\n",
" \"Unit Project\": (124, 125),\n",
" \"Assess your learning\": (126, 127)\n",
" },\n",
"}\n",
"\n",
"# ---------------- load OCR text ----------------\n",
"df = pd.read_csv(\"/content/output_text_en_cleaned.csv\")\n",
"df_raw = pd.read_csv(\"/content/output_text_prime5_en.csv\")\n",
"df[\"page\"] = df_raw[\"page\"]\n",
"\n",
"rows = []\n",
"\n",
"# ---------------- split text ----------------\n",
"for unit, udata in structure.items():\n",
" # Get Started\n",
" if \"Get Started\" in udata:\n",
" start, end = udata[\"Get Started\"]\n",
" text = \" \".join(df[(df[\"page\"] >= start) & (df[\"page\"] <= end)][\"text\"])\n",
" rows.append({\n",
" \"Unit\": unit,\n",
" \"Concept\": \"Get Started\",\n",
" \"Lesson\": \"Get Started\",\n",
" \"From page\": start,\n",
" \"To page\": end,\n",
" \"Lesson text\": text\n",
" })\n",
"\n",
" # Concepts & Lessons\n",
" if \"Concepts\" in udata:\n",
" for concept, lessons in udata[\"Concepts\"].items():\n",
" for lesson, (ls, le) in lessons.items():\n",
" lesson_text = \" \".join(df[(df[\"page\"] >= ls) & (df[\"page\"] <= le)][\"text\"])\n",
" rows.append({\n",
" \"Unit\": unit,\n",
" \"Concept\": concept,\n",
" \"Lesson\": lesson,\n",
" \"From page\": ls,\n",
" \"To page\": le,\n",
" \"Lesson text\": lesson_text\n",
" })\n",
"\n",
" # Other sections\n",
" for section in [\"Unit Project\", \"Assess your learning\", \"Interdisciplinary project\"]:\n",
" if section in udata:\n",
" start, end = udata[section]\n",
" text = \" \".join(df[(df[\"page\"] >= start) & (df[\"page\"] <= end)][\"text\"])\n",
" rows.append({\n",
" \"Unit\": unit,\n",
" \"Concept\": section,\n",
" \"Lesson\": section,\n",
" \"From page\": start,\n",
" \"To page\": end,\n",
" \"Lesson text\": text\n",
" })\n",
"\n",
"# ---------------- save output ----------------\n",
"df_out = pd.DataFrame(rows)\n",
"df_out.to_csv(\"output_units_lessons_prime5_EN.csv\", index=False, encoding=\"utf-8-sig\")\n",
"\n",
"print(\"✅ Done! Lessons & units split and saved to output_units_lessons_prime5_EN.csv\")\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "pICh8K7rnZyd",
"outputId": "cc0d291d-e292-494c-fc15-310f5bb047e7"
},
"execution_count": 16,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"✅ Done! Lessons & units split and saved to output_units_lessons_prime5_EN.csv\n"
]
}
]
}
]
}
\ No newline at end of file
output_stats_prime5_en.csv:
metric,value
total_pages,129.0
total_words,24317.0
avg_words_per_page,188.50387596899225
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"gpuType": "T4"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "code",
"source": [
"!apt-get install tesseract-ocr\n",
"!pip install pdfplumber pdf2image pytesseract\n",
"!apt-get install -y poppler-utils\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "mXFZpTwLOjTZ",
"outputId": "7bea32f6-f53f-4946-c7ba-31b3b03059b7"
},
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Reading package lists... Done\n",
"Building dependency tree... Done\n",
"Reading state information... Done\n",
"tesseract-ocr is already the newest version (4.1.1-2.1build1).\n",
"0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.\n",
"Requirement already satisfied: pdfplumber in /usr/local/lib/python3.12/dist-packages (0.11.7)\n",
"Requirement already satisfied: pdf2image in /usr/local/lib/python3.12/dist-packages (1.17.0)\n",
"Requirement already satisfied: pytesseract in /usr/local/lib/python3.12/dist-packages (0.3.13)\n",
"Requirement already satisfied: pdfminer.six==20250506 in /usr/local/lib/python3.12/dist-packages (from pdfplumber) (20250506)\n",
"Requirement already satisfied: Pillow>=9.1 in /usr/local/lib/python3.12/dist-packages (from pdfplumber) (11.3.0)\n",
"Requirement already satisfied: pypdfium2>=4.18.0 in /usr/local/lib/python3.12/dist-packages (from pdfplumber) (4.30.0)\n",
"Requirement already satisfied: charset-normalizer>=2.0.0 in /usr/local/lib/python3.12/dist-packages (from pdfminer.six==20250506->pdfplumber) (3.4.3)\n",
"Requirement already satisfied: cryptography>=36.0.0 in /usr/local/lib/python3.12/dist-packages (from pdfminer.six==20250506->pdfplumber) (43.0.3)\n",
"Requirement already satisfied: packaging>=21.3 in /usr/local/lib/python3.12/dist-packages (from pytesseract) (25.0)\n",
"Requirement already satisfied: cffi>=1.12 in /usr/local/lib/python3.12/dist-packages (from cryptography>=36.0.0->pdfminer.six==20250506->pdfplumber) (1.17.1)\n",
"Requirement already satisfied: pycparser in /usr/local/lib/python3.12/dist-packages (from cffi>=1.12->cryptography>=36.0.0->pdfminer.six==20250506->pdfplumber) (2.22)\n",
"Reading package lists... Done\n",
"Building dependency tree... Done\n",
"Reading state information... Done\n",
"poppler-utils is already the newest version (22.02.0-2ubuntu0.10).\n",
"0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.\n"
]
}
]
},
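{
"cell_type": "markdown",
"source": [
"Quick environment check (a minimal sketch, assuming pytesseract >= 0.3): confirm the Tesseract binary and its language packs are visible before any OCR runs.\n"
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"import pytesseract\n",
"\n",
"# both calls shell out to the tesseract binary installed above\n",
"print(pytesseract.get_tesseract_version())\n",
"print(pytesseract.get_languages(config=\"\"))  # should include 'eng'\n"
],
"metadata": {},
"execution_count": null,
"outputs": []
},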
{
"cell_type": "code",
"source": [
"import os\n",
"import cv2\n",
"import pytesseract\n",
"import pandas as pd\n",
"from pdf2image import convert_from_path\n",
"from tqdm import tqdm\n",
"import re\n"
],
"metadata": {
"id": "OlXc31-OPKrF"
},
"execution_count": 5,
"outputs": []
},
{
"cell_type": "code",
"source": [
"pdf_path = \"/content/Science_E_prim6_TR1.pdf\"\n",
"output_csv = \"output_text_prime6_en.csv\"\n",
"output_stats = \"output_stats_prime6_en.csv\"\n",
"output_txt = \"output_text_prime6_en.txt\"\n",
"images_dir = \"pdf_images_prime6_en\"\n",
"batch_size = 50\n",
"low_word_threshold = 10\n",
"\n",
"os.makedirs(images_dir, exist_ok=True)"
],
"metadata": {
"id": "u1W69-mMyhMG"
},
"execution_count": 6,
"outputs": []
},
{
"cell_type": "code",
"source": [
"print(\"📄 Converting PDF to images...\")\n",
"pages = convert_from_path(pdf_path, dpi=300)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "2b682e60-e7e8-42de-f4c8-89bbb113cc45",
"id": "C_LneoyryryP"
},
"execution_count": 7,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"📄 Converting PDF to images...\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"image_paths = []\n",
"for i, page in enumerate(pages, start=1):\n",
" img_path = os.path.join(images_dir, f\"page_{i}.png\")\n",
" page.save(img_path, \"PNG\")\n",
" image_paths.append(img_path)\n"
],
"metadata": {
"id": "wgX68ISuyvIn"
},
"execution_count": 8,
"outputs": []
},
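{
"cell_type": "markdown",
"source": [
"Holding every 300-dpi page as a PIL image can strain Colab RAM on long PDFs. A hedged alternative using pdf2image's `output_folder` / `paths_only` options (available in 1.17): poppler writes the PNGs straight to disk and only the file paths come back. `output_file` fixes the filename prefix so pages sort predictably.\n"
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# memory-friendly variant of the two cells above; drop-in replacement for image_paths\n",
"image_paths_alt = convert_from_path(\n",
"    pdf_path, dpi=300, fmt=\"png\",\n",
"    output_folder=images_dir, output_file=\"page\", paths_only=True,\n",
")\n",
"print(len(image_paths_alt), \"pages written to\", images_dir)\n"
],
"metadata": {},
"execution_count": null,
"outputs": []
},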
{
"cell_type": "code",
"source": [
"def clean_text(text):\n",
" text = str(text)\n",
" text = re.sub(r\"\\s+\", \" \", text)\n",
" text = re.sub(r\"[^\\x00-\\x7F\\u0600-\\u06FF]+\", \" \", text)\n",
" return text.strip()"
],
"metadata": {
"id": "IRBX4I2bhZng"
},
"execution_count": 9,
"outputs": []
},
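{
"cell_type": "markdown",
"source": [
"A tiny check of what the filter keeps: ASCII and the Arabic block (U+0600–U+06FF) survive, while other symbols become spaces (sample string invented for illustration).\n"
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"sample = \"Energy  ★  طاقة\\n\\tflows →  through   systems\"\n",
"print(clean_text(sample))  # -> Energy طاقة flows through systems\n"
],
"metadata": {},
"execution_count": null,
"outputs": []
},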
{
"cell_type": "code",
"source": [
"print(\"🔎 Running OCR on pages... (Batch mode)\")\n",
"results = []\n",
"\n",
"for batch_start in range(0, len(image_paths), batch_size):\n",
" batch_paths = image_paths[batch_start:batch_start+batch_size]\n",
" for i, img_path in enumerate(tqdm(batch_paths, desc=f\"Batch {batch_start//batch_size+1}\")):\n",
" page_number = batch_start + i + 1\n",
"\n",
" img = cv2.imread(img_path)\n",
"\n",
" gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)\n",
"\n",
" _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)\n",
"\n",
" text = pytesseract.image_to_string(thresh, lang=\"eng\")\n",
"\n",
" cleaned_text = clean_text(text)\n",
"\n",
" results.append({\"page\": page_number, \"text\": cleaned_text})\n"
],
"metadata": {
"id": "ZAwUNKFkyz7m",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "3625b751-cd0e-426a-cb12-f9495058d18c"
},
"execution_count": 10,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"🔎 Running OCR on pages... (Batch mode)\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"Batch 1: 100%|██████████| 50/50 [03:36<00:00, 4.34s/it]\n",
"Batch 2: 100%|██████████| 50/50 [03:43<00:00, 4.47s/it]\n",
"Batch 3: 100%|██████████| 38/38 [02:54<00:00, 4.60s/it]\n"
]
}
]
},
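{
"cell_type": "markdown",
"source": [
"OCR quality varies a lot between scanned pages. One way to spot weak pages, sketched here with pytesseract's `image_to_data` (the 60-point cut-off is an arbitrary assumption), is to average Tesseract's per-word confidence:\n"
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"from pytesseract import Output\n",
"\n",
"def page_confidence(img):\n",
"    # word-level boxes; conf == -1 marks non-text boxes and is skipped\n",
"    data = pytesseract.image_to_data(img, lang=\"eng\", output_type=Output.DICT)\n",
"    confs = [float(c) for c in data[\"conf\"] if float(c) >= 0]\n",
"    return sum(confs) / len(confs) if confs else 0.0\n",
"\n",
"# example: check the first page of the book\n",
"if page_confidence(cv2.imread(image_paths[0])) < 60:\n",
"    print(\"⚠️ low-confidence OCR on page 1\")\n"
],
"metadata": {},
"execution_count": null,
"outputs": []
},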
{
"cell_type": "code",
"source": [
"print(\"💾 Saving results...\")\n",
"\n",
"df = pd.DataFrame(results)\n",
"df.to_csv(output_csv, index=False, encoding=\"utf-8-sig\")\n",
"\n",
"with open(output_txt, \"w\", encoding=\"utf-8\") as f:\n",
" for _, row in df.iterrows():\n",
" f.write(f\"\\n--- Page {row['page']} ---\\n{row['text']}\\n\")\n"
],
"metadata": {
"id": "wA9oIFuMy3gA",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "9a3bbab0-f1a0-4b82-b8f8-c6d3ce765eac"
},
"execution_count": 11,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"💾 Saving results...\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"df[\"word_count\"] = df[\"text\"].apply(lambda x: len(x.split()))\n",
"stats = {\n",
" \"total_pages\": len(df),\n",
" \"total_words\": df[\"word_count\"].sum(),\n",
" \"avg_words_per_page\": df[\"word_count\"].mean()\n",
"}\n",
"stats_df = pd.DataFrame(list(stats.items()), columns=[\"metric\", \"value\"])\n",
"stats_df.to_csv(output_stats, index=False, encoding=\"utf-8-sig\")\n",
"\n",
"print(\"✅ Done!\")\n",
"print(f\"📂 Text file saved: {output_csv}\")\n",
"print(f\"📂 Clean text file saved: {output_txt}\")\n",
"print(f\"📊 Stats file saved: {output_stats}\")"
],
"metadata": {
"id": "CPYgXgLMy6Xh",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "ddd6ce3b-e481-4c9d-f33a-f485ebbd7d27"
},
"execution_count": 12,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"✅ Done!\n",
"📂 Text file saved: output_text_prime6_en.csv\n",
"📂 Clean text file saved: output_text_prime6_en.txt\n",
"📊 Stats file saved: output_stats_prime6_en.csv\n"
]
}
]
},
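{
"cell_type": "markdown",
"source": [
"`low_word_threshold` was configured earlier but never used; a plausible completion of that step is to list pages whose OCR produced suspiciously few words:\n"
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# pages under the threshold are usually image-heavy or failed OCR\n",
"low_pages = df[df[\"word_count\"] < low_word_threshold][\"page\"].tolist()\n",
"print(f\"{len(low_pages)} pages below {low_word_threshold} words: {low_pages}\")\n"
],
"metadata": {},
"execution_count": null,
"outputs": []
},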
{
"cell_type": "code",
"source": [
"import pandas as pd\n",
"import re\n",
"\n",
"def super_clean_text_en(text):\n",
" text = str(text)\n",
"\n",
" text = re.sub(r\"[^a-zA-Z0-9\\s\\.\\,\\?\\!\\:\\;\\-\\(\\)\\/%]\", \" \", text)\n",
"\n",
" text = re.sub(r\"\\b(\\w+)( \\1){2,}\\b\", r\"\\1\", text)\n",
"\n",
" text = re.sub(r\"(.)\\1{2,}\", r\"\\1\\1\", text)\n",
"\n",
" text = re.sub(r\"(\\d)\\1{2,}\", r\"\\1\\1\\1\", text)\n",
"\n",
" words = text.split()\n",
" cleaned_words = []\n",
" for w in words:\n",
" if len(w) >= 15:\n",
" continue\n",
" cleaned_words.append(w)\n",
" text = \" \".join(cleaned_words)\n",
"\n",
" text = re.sub(r\"\\s+\", \" \", text)\n",
"\n",
" lines = text.split(\".\")\n",
" seen = set()\n",
" cleaned_lines = []\n",
" for line in lines:\n",
" l = line.strip()\n",
" if l and l not in seen:\n",
" cleaned_lines.append(l)\n",
" seen.add(l)\n",
" text = \". \".join(cleaned_lines)\n",
"\n",
" return text.strip()\n",
"\n",
"df = pd.read_csv(\"/content/output_text_prime6_en.csv\")\n",
"\n",
"df[\"text\"] = df[\"text\"].apply(super_clean_text_en)\n",
"\n",
"df.to_csv(\"output_text_en_cleaned.csv\", index=False, encoding=\"utf-8-sig\")\n",
"\n",
"print(\"✅ النصوص اتنضفت وحُفظت في output_text_en_cleaned_prime5.csv\")\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "37b3fbf7-f5dc-4d71-b09a-c4c5326a4321",
"id": "kko7q5b7VHP5"
},
"execution_count": 14,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"✅ النصوص اتنضفت وحُفظت في output_text_en_cleaned_prime5.csv\n"
]
}
]
},
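{
"cell_type": "markdown",
"source": [
"A quick check of the sentence de-duplication on a typical OCR artefact, a sentence repeated across a page (sample invented for illustration; note the splitter drops the final period):\n"
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"sample = \"Heat flows from hot to cold. Heat flows from hot to cold. Unit 2 Getting energy.\"\n",
"print(super_clean_text_en(sample))\n",
"# -> Heat flows from hot to cold. Unit 2 Getting energy\n"
],
"metadata": {},
"execution_count": null,
"outputs": []
},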
{
"cell_type": "code",
"source": [
"import pandas as pd\n",
"\n",
"# ---------------- structure ----------------\n",
"structure = {\n",
" \"Unit 1: What is system\": {\n",
" \"Get Started\": (9, 12),\n",
" \"Concepts\": {\n",
" \"Concept 1.1 The cell as a system\": {\n",
" \"Lesson 1\": (13, 17),\n",
" \"Lesson 2\": (18, 20),\n",
" \"Lesson 3\": (21, 23),\n",
" \"Lesson 4\": (24, 27),\n",
" \"Lesson 5\": (28, 30),\n",
" \"Lesson 6\": (31, 34),\n",
" },\n",
" \"Concept 1.2 The body as a system\": {\n",
" \"Lesson 1\": (35, 38),\n",
" \"Lesson 2\": (39, 41),\n",
" \"Lesson 3\": (42, 45),\n",
" \"Lesson 4\": (46, 48),\n",
" \"Lesson 5\": (49, 51),\n",
" \"Lesson 6\": (52, 54),\n",
" },\n",
" \"Concept 1.3 Energy as a system\": {\n",
" \"Lesson 1\": (55, 57),\n",
" \"Lesson 2\": (58, 60),\n",
" \"Lesson 3\": (61, 65),\n",
" \"Lesson 4\": (66, 67),\n",
" \"Lesson 5\": (68, 71),\n",
" \"Lesson 6\": (72, 73),\n",
" }\n",
" },\n",
" \"Unit Project\": (74, 76),\n",
" \"Unit assessment\": (77, 80),\n",
" },\n",
" \"Unit 2: Getting energy\": {\n",
" \"Get Started\": (81, 84),\n",
" \"Concepts\": {\n",
" \"Concept 2.1 Thermal energy and states of matter\": {\n",
" \"Lesson 1\": (85, 88),\n",
" \"Lesson 2\": (89, 91),\n",
" \"Lesson 3\": (92, 93),\n",
" \"Lesson 4\": (94, 95),\n",
" \"Lesson 5\": (96, 98),\n",
" \"Lesson 6\": (99, 100),\n",
" },\n",
" \"Concept 2.2 Heat transfer\": {\n",
" \"Lesson 1\": (101, 104),\n",
" \"Lesson 2\": (105, 108),\n",
" \"Lesson 3\": (109, 110),\n",
" \"Lesson 4\": (111, 113),\n",
" \"Lesson 5\": (114, 116),\n",
" \"Lesson 6\": (117, 120),\n",
" },\n",
" },\n",
" \"Unit Project\": (121, 122),\n",
" \"Interdisciplinary project\": (123, 131),\n",
" \"Unit assessment\": (132, 135) # توحيد الاسم\n",
" },\n",
"}\n",
"\n",
"# ---------------- load OCR text ----------------\n",
"df = pd.read_csv(\"/content/output_text_en_cleaned.csv\")\n",
"df_raw = pd.read_csv(\"/content/output_text_prime6_en.csv\")\n",
"df[\"page\"] = df_raw[\"page\"]\n",
"\n",
"rows = []\n",
"\n",
"# ---------------- split text ----------------\n",
"for unit, udata in structure.items():\n",
" # Get Started\n",
" if \"Get Started\" in udata:\n",
" start, end = udata[\"Get Started\"]\n",
" text = \" \".join(df[(df[\"page\"] >= start) & (df[\"page\"] <= end)][\"text\"])\n",
" rows.append({\n",
" \"Unit\": unit,\n",
" \"Concept\": \"Get Started\",\n",
" \"Lesson\": \"Get Started\",\n",
" \"From page\": start,\n",
" \"To page\": end,\n",
" \"Lesson text\": text\n",
" })\n",
"\n",
" # Concepts & Lessons\n",
" if \"Concepts\" in udata:\n",
" for concept, lessons in udata[\"Concepts\"].items():\n",
" for lesson, (ls, le) in lessons.items():\n",
" lesson_text = \" \".join(df[(df[\"page\"] >= ls) & (df[\"page\"] <= le)][\"text\"])\n",
" rows.append({\n",
" \"Unit\": unit,\n",
" \"Concept\": concept,\n",
" \"Lesson\": lesson,\n",
" \"From page\": ls,\n",
" \"To page\": le,\n",
" \"Lesson text\": lesson_text\n",
" })\n",
"\n",
" # Other sections\n",
" for section in [\"Unit Project\", \"Unit assessment\", \"Interdisciplinary project\"]:\n",
" if section in udata:\n",
" start, end = udata[section]\n",
" text = \" \".join(df[(df[\"page\"] >= start) & (df[\"page\"] <= end)][\"text\"])\n",
" rows.append({\n",
" \"Unit\": unit,\n",
" \"Concept\": section,\n",
" \"Lesson\": section,\n",
" \"From page\": start,\n",
" \"To page\": end,\n",
" \"Lesson text\": text\n",
" })\n",
"\n",
"# ---------------- save output ----------------\n",
"df_out = pd.DataFrame(rows)\n",
"df_out.to_csv(\"output_units_lessons_prime6_EN.csv\", index=False, encoding=\"utf-8-sig\")\n",
"\n",
"print(\"✅ Done! Lessons & units split and saved to output_units_lessons_prime6_EN.csv\")\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "pICh8K7rnZyd",
"outputId": "22525901-3cc3-4103-b889-e9c4410fd46d"
},
"execution_count": 15,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"✅ Done! Lessons & units split and saved to output_units_lessons_prime6_EN.csv\n"
]
}
]
}
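,
{
"cell_type": "markdown",
"source": [
"A readback check on the split file (a small sketch): lesson counts per unit should match the `structure` map, and no lesson text should come back empty.\n"
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"check = pd.read_csv(\"output_units_lessons_prime6_EN.csv\")\n",
"print(check.groupby(\"Unit\")[\"Lesson\"].count())\n",
"print(\"empty lesson texts:\", (check[\"Lesson text\"].fillna(\"\").str.len() == 0).sum())\n"
],
"metadata": {},
"execution_count": null,
"outputs": []
}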
]
}
\ No newline at end of file
metric,value
total_pages,138.0
total_words,28455.0
avg_words_per_page,206.19565217391303