"""Breadth-first crawler for a single website.

Starting from ``start_url``, visits every reachable page on the same
domain, extracts each page's paragraph text, and writes the collected
data to ``site_full_data.json``.
"""

import json
import time
from collections import deque
from urllib.parse import urldefrag, urljoin, urlparse

import requests
from bs4 import BeautifulSoup

# start ------------------------------------------------------------------
start_url = "https://dgam.gov.sy/?page_id=9606"
headers = {"User-Agent": "Mozilla/5.0"}


# cleaning ---------------------------------------------------------------
def clean_text(text):
    """Collapse every run of whitespace in *text* into a single space."""
    return " ".join(text.split())


# crawler ----------------------------------------------------------------
def crawl(start=start_url):
    """Breadth-first crawl of every same-domain page reachable from *start*.

    Returns a list of ``{"url": ..., "content": ...}`` dicts, one per page
    that contained at least one paragraph longer than 10 characters.
    """
    domain = urlparse(start).netloc  # hoisted: constant for the whole crawl
    visited = set()
    to_visit = deque([start])  # deque: O(1) popleft vs O(n) list.pop(0)
    results = []

    while to_visit:
        url = to_visit.popleft()
        if url in visited:
            continue
        # Mark visited *before* fetching so a URL whose request fails is
        # not re-queued endlessly by pages that keep linking back to it.
        visited.add(url)
        print("VISITING:", url)

        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()  # skip 4xx/5xx pages
        except requests.RequestException as e:
            print("Error:", e)
            continue

        soup = BeautifulSoup(response.text, "html.parser")

        # text ------------------------------------------------------------
        paragraphs = soup.find_all("p")
        text = "\n".join(
            clean_text(p.get_text())
            for p in paragraphs
            if len(p.get_text(strip=True)) > 10
        )
        if text:
            results.append({"url": url, "content": text})

        # links -----------------------------------------------------------
        for link in soup.find_all("a", href=True):
            # Drop the #fragment so the same page isn't crawled once per
            # anchor; check that the link stays on the same domain.
            new_url, _fragment = urldefrag(urljoin(url, link["href"]))
            if urlparse(new_url).netloc == domain:
                if new_url not in visited and new_url not in to_visit:
                    to_visit.append(new_url)

        time.sleep(1)  # be polite: one request per second

    return results


# json -------------------------------------------------------------------
def main():
    """Run the crawl and dump the results to ``site_full_data.json``."""
    results = crawl()
    with open("site_full_data.json", "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=4)
    print("SCRAPING FINISHED. Total pages:", len(results))


if __name__ == "__main__":
    main()