From 25a72ce18b41e2d98b820c473514226c1cd90a99 Mon Sep 17 00:00:00 2001 From: Hasan_alhomsi Date: Wed, 4 Mar 2026 12:44:55 +0000 Subject: [PATCH] first commit --- clean_arabic_spacy git.ipynb | 808 +++++++++++++++++++++++++++++++++++ 1 file changed, 808 insertions(+) create mode 100644 clean_arabic_spacy git.ipynb diff --git a/clean_arabic_spacy git.ipynb b/clean_arabic_spacy git.ipynb new file mode 100644 index 0000000..725870d --- /dev/null +++ b/clean_arabic_spacy git.ipynb @@ -0,0 +1,808 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a1b2c3d4", + "metadata": { + "id": "a1b2c3d4" + }, + "source": [ + "# Arabic Text Cleaner — spaCy Edition\n", + "\n", + "Cleans raw HTML scraped from **moad.gov.sy** into clean Arabic text using a custom spaCy pipeline.\n", + "\n", + "### Pipeline steps\n", + "1. **HTML stripping** — BeautifulSoup removes scripts, styles, and all tags\n", + "2. **`arabic_normaliser`** — removes diacritics, normalises Alef variants, strips tatweel\n", + "3. **`arabic_cleaner`** — removes URLs, e-mail addresses, punctuation-only tokens, lone symbols, whitespace tokens\n", + "4. **Sentence segmenter** (`senter`, from `xx_sent_ud_sm`) — marks sentence boundaries; `ar_core_news_sm` is not available for spaCy 3.8, so no lemmas or morphological features are produced\n", + "5. 
**Post-processing** — collapses symbol noise and extra blank lines\n", + "\n", + "### Requirements\n", + "```bash\n", + "pip install spacy beautifulsoup4 lxml\n", + "python -m spacy download ar_core_news_sm\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "b2c3d4e5", + "metadata": { + "id": "b2c3d4e5" + }, + "source": [ + "## 1 · Install dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c3d4e5f6", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "c3d4e5f6", + "outputId": "d90f84f7-0f44-44b0-8103-ddfcfc249240" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: spacy in /usr/local/lib/python3.12/dist-packages (3.8.11)\n", + "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.12/dist-packages (4.13.5)\n", + "Requirement already satisfied: lxml in /usr/local/lib/python3.12/dist-packages (6.0.2)\n", + "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /usr/local/lib/python3.12/dist-packages (from spacy) (3.0.12)\n", + "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /usr/local/lib/python3.12/dist-packages (from spacy) (1.0.5)\n", + "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.12/dist-packages (from spacy) (1.0.15)\n", + "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.12/dist-packages (from spacy) (2.0.13)\n", + "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.12/dist-packages (from spacy) (3.0.12)\n", + "Requirement already satisfied: thinc<8.4.0,>=8.3.4 in /usr/local/lib/python3.12/dist-packages (from spacy) (8.3.10)\n", + "Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /usr/local/lib/python3.12/dist-packages (from spacy) (1.1.3)\n", + "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /usr/local/lib/python3.12/dist-packages (from spacy) (2.5.2)\n", + "Requirement 
already satisfied: catalogue<2.1.0,>=2.0.6 in /usr/local/lib/python3.12/dist-packages (from spacy) (2.0.10)\n", + "Requirement already satisfied: weasel<0.5.0,>=0.4.2 in /usr/local/lib/python3.12/dist-packages (from spacy) (0.4.3)\n", + "Requirement already satisfied: typer-slim<1.0.0,>=0.3.0 in /usr/local/lib/python3.12/dist-packages (from spacy) (0.24.0)\n", + "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.12/dist-packages (from spacy) (4.67.3)\n", + "Requirement already satisfied: numpy>=1.19.0 in /usr/local/lib/python3.12/dist-packages (from spacy) (2.0.2)\n", + "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.12/dist-packages (from spacy) (2.32.4)\n", + "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in /usr/local/lib/python3.12/dist-packages (from spacy) (2.12.3)\n", + "Requirement already satisfied: jinja2 in /usr/local/lib/python3.12/dist-packages (from spacy) (3.1.6)\n", + "Requirement already satisfied: setuptools in /usr/local/lib/python3.12/dist-packages (from spacy) (75.2.0)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.12/dist-packages (from spacy) (26.0)\n", + "Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.12/dist-packages (from beautifulsoup4) (2.8.3)\n", + "Requirement already satisfied: typing-extensions>=4.0.0 in /usr/local/lib/python3.12/dist-packages (from beautifulsoup4) (4.15.0)\n", + "Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.12/dist-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy) (0.7.0)\n", + "Requirement already satisfied: pydantic-core==2.41.4 in /usr/local/lib/python3.12/dist-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy) (2.41.4)\n", + "Requirement already satisfied: typing-inspection>=0.4.2 in /usr/local/lib/python3.12/dist-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy) (0.4.2)\n", + "Requirement already satisfied: 
charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (3.4.4)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.12/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (3.11)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (2.5.0)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.12/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (2026.2.25)\n", + "Requirement already satisfied: blis<1.4.0,>=1.3.0 in /usr/local/lib/python3.12/dist-packages (from thinc<8.4.0,>=8.3.4->spacy) (1.3.3)\n", + "Requirement already satisfied: confection<1.0.0,>=0.0.1 in /usr/local/lib/python3.12/dist-packages (from thinc<8.4.0,>=8.3.4->spacy) (0.1.5)\n", + "Requirement already satisfied: typer>=0.24.0 in /usr/local/lib/python3.12/dist-packages (from typer-slim<1.0.0,>=0.3.0->spacy) (0.24.1)\n", + "Requirement already satisfied: cloudpathlib<1.0.0,>=0.7.0 in /usr/local/lib/python3.12/dist-packages (from weasel<0.5.0,>=0.4.2->spacy) (0.23.0)\n", + "Requirement already satisfied: smart-open<8.0.0,>=5.2.1 in /usr/local/lib/python3.12/dist-packages (from weasel<0.5.0,>=0.4.2->spacy) (7.5.1)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.12/dist-packages (from jinja2->spacy) (3.0.3)\n", + "Requirement already satisfied: wrapt in /usr/local/lib/python3.12/dist-packages (from smart-open<8.0.0,>=5.2.1->weasel<0.5.0,>=0.4.2->spacy) (2.1.1)\n", + "Requirement already satisfied: click>=8.2.1 in /usr/local/lib/python3.12/dist-packages (from typer>=0.24.0->typer-slim<1.0.0,>=0.3.0->spacy) (8.3.1)\n", + "Requirement already satisfied: shellingham>=1.3.0 in /usr/local/lib/python3.12/dist-packages (from typer>=0.24.0->typer-slim<1.0.0,>=0.3.0->spacy) (1.5.4)\n", + "Requirement already satisfied: rich>=12.3.0 in /usr/local/lib/python3.12/dist-packages (from 
typer>=0.24.0->typer-slim<1.0.0,>=0.3.0->spacy) (13.9.4)\n", + "Requirement already satisfied: annotated-doc>=0.0.2 in /usr/local/lib/python3.12/dist-packages (from typer>=0.24.0->typer-slim<1.0.0,>=0.3.0->spacy) (0.0.4)\n", + "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.12/dist-packages (from rich>=12.3.0->typer>=0.24.0->typer-slim<1.0.0,>=0.3.0->spacy) (4.0.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.12/dist-packages (from rich>=12.3.0->typer>=0.24.0->typer-slim<1.0.0,>=0.3.0->spacy) (2.19.2)\n", + "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.12/dist-packages (from markdown-it-py>=2.2.0->rich>=12.3.0->typer>=0.24.0->typer-slim<1.0.0,>=0.3.0->spacy) (0.1.2)\n", + "\n", + "\u001b[38;5;1m✘ No compatible package found for 'ar_core_news_sm' (spaCy v3.8.11)\u001b[0m\n", + "\n" + ] + } + ], + "source": [ + "\n", + "!pip install spacy beautifulsoup4 lxml\n", + "!python -m spacy download ar_core_news_sm" + ] + }, + { + "cell_type": "markdown", + "id": "d4e5f6a7", + "metadata": { + "id": "d4e5f6a7" + }, + "source": [ + "## 2 · Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "e5f6a7b8", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "e5f6a7b8", + "outputId": "8c862123-5a73-4cc5-ba21-f6321694d6b5" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "spaCy version: 3.8.11\n" + ] + } + ], + "source": [ + "import json\n", + "import re\n", + "from pathlib import Path\n", + "\n", + "from bs4 import BeautifulSoup\n", + "import spacy\n", + "from spacy.language import Language\n", + "from spacy.tokens import Doc\n", + "\n", + "print(f\"spaCy version: {spacy.__version__}\")" + ] + }, + { + "cell_type": "markdown", + "id": "f6a7b8c9", + "metadata": { + "id": "f6a7b8c9" + }, + "source": [ + "## 3 · Config — set your file paths here" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "a7b8c9d0", + "metadata": { + "id": "a7b8c9d0" + }, + "outputs": [], + "source": [ + "INPUT_PATH = \"/content/moad_full_scrape.json\" \n", + "OUTPUT_PATH = \"/content/moad_cleaned_spacy.json\"" + ] + }, + { + "cell_type": "markdown", + "id": "b8c9d0e1", + "metadata": { + "id": "b8c9d0e1" + }, + "source": [ + "## 4 · Arabic Unicode patterns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c9d0e1f2", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "c9d0e1f2", + "outputId": "a85c6ef2-de7c-408b-e3d8-cb61c63f32d8" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Patterns compiled ✓\n" + ] + } + ], + "source": [ + "\n", + "DIACRITICS = re.compile(r'[\\u0610-\\u061A\\u064B-\\u065F\\u0670\\u06D6-\\u06DC\\u06DF-\\u06E4\\u06E7\\u06E8\\u06EA-\\u06ED]')\n", + "\n", + "\n", + "ALEF_VARIANTS = re.compile(r'[أإآٱ]')\n", + "\n", + "\n", + "TATWEEL = re.compile(r'\\u0640')\n", + "\n", + "\n", + "SYMBOL_NOISE = re.compile(r'([^\\w\\u0600-\\u06FF])\\1{2,}')\n", + "\n", + "\n", + "MULTI_BLANK = re.compile(r'\\n{3,}')\n", + "\n", + "print(\"Patterns compiled ✓\")" + ] + }, + { + "cell_type": "markdown", + "id": "d0e1f2a3", + "metadata": { + "id": "d0e1f2a3" + }, + "source": [ + "## 5 · Custom spaCy pipeline components" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e1f2a3b4", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "e1f2a3b4", + "outputId": "5ad2c0d6-d97b-4672-89b0-e6ddf2014b03" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Custom components registered ✓\n" + ] + } + ], + "source": [ + "\n", + "if not spacy.tokens.Token.has_extension(\"keep\"):\n", + " spacy.tokens.Token.set_extension(\"keep\", default=True)\n", + "\n", + "\n", + "@Language.component(\"arabic_normaliser\")\n", + "def arabic_normaliser(doc: Doc) -> Doc:\n", + " \"\"\"\n", + " Normalises 
each token after morphological analysis:\n", + " - Removes diacritics (tashkeel / harakat)\n", + " - Normalises Alef variants (أإآٱ → ا)\n", + " - Removes tatweel (kashida ـ)\n", + " Result stored in token.norm_ (token.text is untouched).\n", + " \"\"\"\n", + " for token in doc:\n", + " norm = token.text\n", + " norm = DIACRITICS.sub('', norm)\n", + " norm = ALEF_VARIANTS.sub('ا', norm)\n", + " norm = TATWEEL.sub('', norm)\n", + " token.norm_ = norm\n", + " return doc\n", + "\n", + "\n", + "@Language.component(\"arabic_cleaner\")\n", + "def arabic_cleaner(doc: Doc) -> Doc:\n", + " \"\"\"\n", + " Flags tokens for removal via token._.keep:\n", + " - URLs and email addresses\n", + " - Pure whitespace tokens\n", + " - Punctuation-only tokens (non-Arabic)\n", + " - Single non-alphanumeric, non-Arabic characters\n", + " \"\"\"\n", + " url_re = re.compile(r'https?://\\S+|www\\.\\S+|\\S+@\\S+\\.\\S+')\n", + " arabic_re = re.compile(r'[\\u0600-\\u06FF]')\n", + "\n", + " for token in doc:\n", + " text = token.norm_ or token.text\n", + " if url_re.match(text):\n", + " token._.keep = False\n", + " elif token.is_space:\n", + " token._.keep = False\n", + " elif token.is_punct and not arabic_re.search(text):\n", + " token._.keep = False\n", + " elif len(text) == 1 and not text.isalnum() and not arabic_re.match(text):\n", + " token._.keep = False\n", + " else:\n", + " token._.keep = True\n", + " return doc\n", + "\n", + "\n", + "print(\"Custom components registered ✓\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "rMv5CyFrulah", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "rMv5CyFrulah", + "outputId": "15749c95-0894-43ed-eeb9-60d3c7b9d2e0" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting xx-sent-ud-sm==3.8.0\n", + " Using cached https://github.com/explosion/spacy-models/releases/download/xx_sent_ud_sm-3.8.0/xx_sent_ud_sm-3.8.0-py3-none-any.whl (4.3 MB)\n", + 
"\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n", + "You can now load the package via spacy.load('xx_sent_ud_sm')\n", + "\u001b[38;5;3m⚠ Restart to reload dependencies\u001b[0m\n", + "If you are in a Jupyter or Colab notebook, you may need to restart Python in\n", + "order to load all the package's dependencies. You can do this by selecting the\n", + "'Restart kernel' or 'Restart runtime' option.\n" + ] + } + ], + "source": [ + "!python -m spacy download xx_sent_ud_sm\n" + ] + }, + { + "cell_type": "markdown", + "id": "f2a3b4c5", + "metadata": { + "id": "f2a3b4c5" + }, + "source": [ + "## 6 · Build the spaCy pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6YUkvUHUv97k", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "6YUkvUHUv97k", + "outputId": "9b210d2b-50ad-4574-d121-7750fd382686" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Pipeline: ['arabic_normaliser', 'senter', 'arabic_cleaner']\n" + ] + } + ], + "source": [ + "import spacy\n", + "from spacy.language import Language\n", + "\n", + "\n", + "\n", + "def build_nlp() -> Language:\n", + " \n", + " try:\n", + " nlp = spacy.load(\"xx_sent_ud_sm\", disable=[\"parser\", \"ner\"])\n", + " except OSError:\n", + " \n", + " nlp = spacy.blank(\"ar\")\n", + "\n", + " \n", + " if \"morphologizer\" in nlp.pipe_names:\n", + " nlp.add_pipe(\"arabic_normaliser\", after=\"morphologizer\")\n", + " else:\n", + " \n", + " nlp.add_pipe(\"arabic_normaliser\", first=True)\n", + "\n", + " \n", + " nlp.add_pipe(\"arabic_cleaner\", last=True)\n", + "\n", + " return nlp\n", + "\n", + "\n", + "nlp = build_nlp()\n", + "print(\"Pipeline:\", nlp.pipe_names)" + ] + }, + { + "cell_type": "markdown", + "id": "b4c5d6e7", + "metadata": { + "id": "b4c5d6e7" + }, + "source": [ + "## 7 · Helper functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c5d6e7f8", + "metadata": { + "colab": { + 
"base_uri": "https://localhost:8080/" + }, + "id": "c5d6e7f8", + "outputId": "323dff30-aff5-4a42-cdd6-581df0144724" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Helper functions defined ✓\n" + ] + } + ], + "source": [ + "def extract_text_from_html(html: str) -> str:\n", + " \"\"\"Strip all HTML tags and return visible text.\"\"\"\n", + " soup = BeautifulSoup(html, \"lxml\")\n", + " for tag in soup.find_all([\"script\", \"style\", \"noscript\", \"head\",\n", + " \"meta\", \"link\", \"iframe\", \"svg\", \"img\"]):\n", + " tag.decompose()\n", + " return soup.get_text(separator=\"\\n\")\n", + "\n", + "\n", + "def post_clean(text: str) -> str:\n", + " \n", + " text = SYMBOL_NOISE.sub(r'\\1', text)\n", + "\n", + " \n", + " lines = [line.strip() for line in text.splitlines()]\n", + "\n", + " \n", + " \n", + " text = ' '.join(lines)\n", + "\n", + " \n", + " \n", + " \n", + " text = re.sub(r'\\s+', ' ', text)\n", + "\n", + " return text.strip()\n", + "\n", + "def process_record(record: dict, nlp: Language) -> dict:\n", + " \n", + " raw_text = extract_text_from_html(record.get(\"Text\", \" \"))\n", + "\n", + " \n", + " doc = nlp(raw_text)\n", + "\n", + " \n", + " parts = []\n", + " for token in doc:\n", + " if token._.keep:\n", + " \n", + " text_to_append = token.norm_ or token.text\n", + " parts.append(text_to_append)\n", + "\n", + " \n", + " cleaned = \" \".join(parts)\n", + "\n", + " \n", + " \n", + " cleaned = post_clean(cleaned)\n", + "\n", + " return {\n", + " \"Url\": record.get(\"Url\", \"\"),\n", + " \"Timestamp\": record.get(\"Timestamp\", \"\"),\n", + " \"Text\": cleaned,\n", + " \"token_count\": sum(1 for t in doc if t._.keep),\n", + " }\n", + "\n", + "\n", + "print(\"Helper functions defined ✓\")" + ] + }, + { + "cell_type": "markdown", + "id": "d6e7f8a9", + "metadata": { + "id": "d6e7f8a9" + }, + "source": [ + "## 8 · Load the JSON file" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "e7f8a9b0", + 
"metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "e7f8a9b0", + "outputId": "1207bc14-3a07-43bb-c664-8222befc9761" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded 26 records from /content/moad_full_scrape.json\n", + "Keys per record: ['Url', 'Text', 'Timestamp']\n" + ] + } + ], + "source": [ + "input_path = Path(INPUT_PATH)\n", + "\n", + "with open(input_path, encoding=\"utf-8\") as f:\n", + " data = json.load(f)\n", + "\n", + "print(f\"Loaded {len(data)} records from {input_path}\")\n", + "print(f\"Keys per record: {list(data[0].keys())}\")" + ] + }, + { + "cell_type": "markdown", + "id": "f8a9b0c1", + "metadata": { + "id": "f8a9b0c1" + }, + "source": [ + "## 9 · Preview one raw record" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "a9b0c1d2", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "a9b0c1d2", + "outputId": "28cefa91-bccb-48bb-dc7b-12f3cc22da97" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "URL : https://moad.gov.sy/ar/contact/important-links\n", + "Timestamp: 2026-03-04T11:45:50.322876\n", + "\n", + "Raw HTML (first 500 chars):\n", + "