رفع الملفات إلى "arabic (simple scraper) "

هذا الالتزام موجود في:
2026-03-09 11:45:17 +00:00
التزام 0f9dfbf1e1
2 ملفات معدلة مع 1923 إضافات و0 حذوفات

عرض الملف

@@ -0,0 +1,68 @@
import json
import time
from urllib.parse import urldefrag, urljoin, urlparse

import requests
from bs4 import BeautifulSoup
# start --------------------------------------------------------------------
# Crawl configuration and the shared mutable state used by the main loop.
start_url = "https://dgam.gov.sy/?page_id=9606"
headers = {"User-Agent": "Mozilla/5.0"}

visited = set()          # URLs already fetched (or attempted)
to_visit = [start_url]   # FIFO frontier of URLs still to be crawled
results = []             # one {"url", "content"} record per page with text
# cleaning -----------------------------------------------------------------
def clean_text(text):
    """Collapse every run of whitespace in *text* into a single space."""
    pieces = text.split()
    return " ".join(pieces)
# crawler ------------------------------------------------------------------
# BFS over same-domain pages: fetch each URL once, collect paragraph text,
# and enqueue newly discovered in-domain links.
while to_visit:
    url = to_visit.pop(0)
    if url in visited:
        continue
    # Mark the URL visited BEFORE fetching: in the original, a URL whose
    # request failed was never added to `visited`, so it could be
    # re-enqueued by a later page and retried forever.
    visited.add(url)
    print("VISITING:", url)

    try:
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as e:  # narrowed from bare Exception
        print("Error:", e)
        continue
    if response.status_code != 200:
        # Don't scrape 404/500 error pages as if they were content.
        print("Skipping (HTTP", response.status_code, "):", url)
        continue

    soup = BeautifulSoup(response.text, "html.parser")

    # text -------------------------------------
    # Keep only paragraphs with more than 10 non-whitespace-padded chars,
    # matching the original filter.
    paragraphs = soup.find_all("p")
    text = "\n".join(
        clean_text(p.get_text())
        for p in paragraphs
        if len(p.get_text(strip=True)) > 10
    )
    if text:
        results.append({
            "url": url,
            "content": text
        })

    # links -----------------------------------------------------
    for link in soup.find_all("a", href=True):
        # Strip the #fragment: it points into the same document, and
        # keeping it made the crawler fetch the same page once per anchor.
        new_url, _ = urldefrag(urljoin(url, link["href"]))
        # Only follow links on the same domain as the start page.
        if urlparse(new_url).netloc == urlparse(start_url).netloc:
            if new_url not in visited and new_url not in to_visit:
                to_visit.append(new_url)

    time.sleep(1)  # be polite: at most one request per second
# json ---------------------------------------------------------------------
# Persist every scraped page as pretty-printed, human-readable UTF-8 JSON.
output_file = open("site_full_data.json", "w", encoding="utf-8")
with output_file as fh:
    json.dump(results, fh, ensure_ascii=False, indent=4)
print("SCRAPING FINISHED. Total pages:", len(results))

تم حذف اختلاف الملف لأن أحد الأسطر أو أكثر طويلة جداً