Merge pull request 'رفع الملفات إلى "arabic (simple scraper) "' (#1) from asaad/ArabicNLPResources:main into main
Reviewed-on: #1
هذا الالتزام موجود في:
68
arabic (simple scraper) /arabic.py
Normal file
68
arabic (simple scraper) /arabic.py
Normal file
@@ -0,0 +1,68 @@
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from urllib.parse import urljoin, urlparse
|
||||
import time
|
||||
import json
|
||||
|
||||
# start --------------------------------------------------------------------
# Crawl configuration and shared crawl state.

# Entry page of the crawl; only links on this same domain are followed.
start_url = "https://dgam.gov.sy/?page_id=9606"

# Minimal browser-like User-Agent so the server is less likely to block us.
headers = {"User-Agent": "Mozilla/5.0"}

results = []             # one {"url": ..., "content": ...} dict per page with text
visited = set()          # URLs already processed
to_visit = [start_url]   # FIFO queue of URLs still to fetch
# cleaning -----------------------------------------------------------------
def clean_text(text):
    """Collapse every run of whitespace in *text* into a single space.

    Splitting on whitespace also discards leading/trailing whitespace,
    so the result is fully normalized.
    """
    parts = text.split()
    return " ".join(parts)
# crawler ------------------------------------------------------------------
# Breadth-first crawl starting at start_url: fetch each page, extract its
# paragraph text, queue same-domain links, then dump everything to JSON.
while to_visit:
    url = to_visit.pop(0)  # FIFO pop -> breadth-first visiting order

    if url in visited:
        continue
    # Mark the URL as visited *before* fetching. Previously it was marked
    # only after a successful fetch, so a URL whose request raised was
    # never recorded and could be re-queued by later pages and retried
    # forever.
    visited.add(url)

    print("VISITING:", url)

    try:
        response = requests.get(url, headers=headers, timeout=10)
    except Exception as e:
        # Best-effort crawl: report the failure and move on.
        print("Error:", e)
        continue

    soup = BeautifulSoup(response.text, "html.parser")

    # text -------------------------------------
    # Extract the page's paragraphs, normalize whitespace once per
    # paragraph, and keep only paragraphs with a meaningful amount of
    # text (> 10 characters after cleaning).
    chunks = []
    for p in soup.find_all("p"):
        cleaned = clean_text(p.get_text())
        if len(cleaned) > 10:
            chunks.append(cleaned)
    text = "\n".join(chunks)

    if text:
        results.append({
            "url": url,
            "content": text,
        })

    # links -----------------------------------------------------
    for link in soup.find_all("a", href=True):
        new_url = urljoin(url, link["href"])
        # Only follow links on the same domain as the start URL.
        if urlparse(new_url).netloc == urlparse(start_url).netloc:
            if new_url not in visited and new_url not in to_visit:
                to_visit.append(new_url)

    # Politeness delay between requests.
    time.sleep(1)

# json ---------------------------------------------------------------
# Persist all collected pages as pretty-printed UTF-8 JSON.
with open("site_full_data.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=4)

print("SCRAPING FINISHED. Total pages:", len(results))
||||
1855
arabic (simple scraper) /site_full_data.json
Normal file
1855
arabic (simple scraper) /site_full_data.json
Normal file
تم حذف اختلاف الملف لأن أحد الأسطر أو أكثر طويلة جداً
المرجع في مشكلة جديدة
حظر مستخدم