Compare commits
2 الالتزامات
f0a58c084e
...
main
| المؤلف | SHA1 | التاريخ | |
|---|---|---|---|
| 44ade8ccb4 | |||
| 5363fcb3e8 |
68
arabic (simple scraper) /arabic.py
Normal file
68
arabic (simple scraper) /arabic.py
Normal file
@@ -0,0 +1,68 @@
|
|||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from urllib.parse import urljoin, urlparse
|
||||||
|
import time
|
||||||
|
import json
|
||||||
|
|
||||||
|
# start ----------------------------------------------------------------------
# Crawl configuration and the shared state containers used by the crawler.

# Entry point of the crawl; only links on this same domain are followed.
start_url = "https://dgam.gov.sy/?page_id=9606"

# Minimal browser-like header so the server does not reject the client.
headers = {"User-Agent": "Mozilla/5.0"}

visited = set()          # URLs that have already been fetched
to_visit = [start_url]   # frontier of URLs still waiting to be fetched
results = []             # one {"url", "content"} record per page with text
|
# cleaning -------------------------------------------------------------------
def clean_text(text):
    """Collapse every run of whitespace in *text* into a single space.

    Leading and trailing whitespace is dropped as well, because
    str.split() with no arguments splits on arbitrary whitespace and
    discards empty pieces.
    """
    pieces = text.split()
    return " ".join(pieces)
|
# crawler --------------------------------------------------------------------
# Breadth-first crawl of the site, collecting paragraph text from each page.
# Script-level code with side effects: HTTP requests, a JSON file write, and
# progress output on stdout.
from collections import deque

# deque gives O(1) pops from the left; list.pop(0) is O(n) per pop.
to_visit = deque(to_visit)
# Mirror of the frontier for O(1) "already queued?" membership tests
# (checking `new_url not in to_visit` on a list is O(n) per link).
queued = set(to_visit)

while to_visit:
    url = to_visit.popleft()
    queued.discard(url)

    if url in visited:
        continue

    print("VISITING:", url)

    try:
        response = requests.get(url, headers=headers, timeout=10)
    except Exception as e:
        # Best-effort crawl: report the failure and mark the URL as handled
        # so pages linking to it cannot re-queue it for endless retries.
        print("Error:", e)
        visited.add(url)
        continue

    soup = BeautifulSoup(response.text, "html.parser")

    # text -------------------------------------
    # Keep only paragraphs with more than 10 characters of stripped text,
    # which skips empty/boilerplate <p> tags.
    paragraphs = soup.find_all("p")
    text = "\n".join(
        clean_text(p.get_text())
        for p in paragraphs
        if len(p.get_text(strip=True)) > 10
    )

    if text:
        results.append({
            "url": url,
            "content": text
        })

    # links -----------------------------------------------------
    for link in soup.find_all("a", href=True):
        new_url = urljoin(url, link["href"])

        # Only follow links within the same domain as the start URL.
        if urlparse(new_url).netloc == urlparse(start_url).netloc:
            if new_url not in visited and new_url not in queued:
                to_visit.append(new_url)
                queued.add(new_url)

    visited.add(url)

    # Be polite to the server: at most one request per second.
    time.sleep(1)

# json ---------------------------------------------------------------
# Persist everything scraped; ensure_ascii=False keeps the Arabic text
# human-readable in the output file.
with open("site_full_data.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=4)

print("SCRAPING FINISHED. Total pages:", len(results))
|
||||||
1855
arabic (simple scraper) /site_full_data.json
Normal file
1855
arabic (simple scraper) /site_full_data.json
Normal file
تم حذف اختلاف الملف لأن أحد الأسطر أو أكثر طويلة جداً
المرجع في مشكلة جديدة
حظر مستخدم