import json
import re
import time
from collections import deque
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

START_URL = "https://sana.sy"
MAX_PAGES = 10000
REQUEST_DELAY_SECONDS = 0.4
TIMEOUT_SECONDS = 15
OUTPUT_FILE = "site_data2.json"
VERIFY_SSL = False


def is_same_domain(url: str, base_netloc: str) -> bool:
    """Return True if the URL belongs to the same host as the start page."""
    return urlparse(url).netloc == base_netloc


def normalize_url(url: str) -> str:
    """Drop the fragment, query string, and trailing slash so duplicate pages dedupe."""
    parsed = urlparse(url)
    clean = parsed._replace(fragment="", query="")
    return clean.geturl().rstrip("/")


def extract_text(soup: BeautifulSoup) -> str:
    """Strip script/style blocks and collapse whitespace into a single-line text body."""
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()
    text = soup.get_text(separator=" ", strip=True)
    return re.sub(r"\s+", " ", text).strip()


def crawl_site(start_url: str, max_pages: int = 100):
    """Breadth-first crawl of a single domain, returning a list of page records."""
    session = requests.Session()
    session.headers.update(
        {
            "User-Agent": "Mozilla/5.0 (compatible; simple-crawler/1.0; +https://example.com/bot)"
        }
    )
    if not VERIFY_SSL:
        # Silence InsecureRequestWarning when certificate verification is disabled.
        requests.packages.urllib3.disable_warnings()

    base_netloc = urlparse(start_url).netloc
    queue = deque([normalize_url(start_url)])
    visited = set()
    results = []

    while queue and len(visited) < max_pages:
        current_url = queue.popleft()
        if current_url in visited:
            continue

        try:
            response = session.get(
                current_url,
                timeout=TIMEOUT_SECONDS,
                verify=VERIFY_SSL,
            )
            response.raise_for_status()
        except requests.RequestException as exc:
            print(f"[SKIP] {current_url} -> {exc}")
            visited.add(current_url)
            continue

        # Only parse HTML responses; mark anything else (PDFs, images, ...) as visited and move on.
        content_type = response.headers.get("Content-Type", "")
        if "text/html" not in content_type:
            visited.add(current_url)
            continue

        soup = BeautifulSoup(response.text, "html.parser")
        page_title = soup.title.get_text(strip=True) if soup.title else ""
        page_text = extract_text(soup)

        results.append(
            {
                "site_url": start_url,
                "url": current_url,
                "title": page_title,
                "text": page_text,
            }
        )
        visited.add(current_url)
        print(f"[OK] ({len(visited)}/{max_pages}) {current_url}")

        # Enqueue in-domain links discovered on this page.
        for a_tag in soup.find_all("a", href=True):
            href = a_tag["href"].strip()
            absolute_url = normalize_url(urljoin(current_url, href))
            parsed = urlparse(absolute_url)
            if parsed.scheme not in {"http", "https"}:
                continue
            if not is_same_domain(absolute_url, base_netloc):
                continue
            if absolute_url not in visited:
                queue.append(absolute_url)

        # Be polite: wait between requests.
        time.sleep(REQUEST_DELAY_SECONDS)

    return results


if __name__ == "__main__":
    data = crawl_site(START_URL, MAX_PAGES)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    print(f"\nSaved {len(data)} pages to {OUTPUT_FILE}")