# Copied from boutmoun/ArabicNLPResources
# Commit: "Add boutmoun12.py and clean_data_spacy.py" — this file is
# boutmoun12.py (new file, 109 lines).
import json
import re
import time
from collections import deque
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
||||
# --- Crawl configuration -------------------------------------------------
START_URL = "https://sana.sy"  # seed URL; the crawl stays on this domain
MAX_PAGES = 10000  # hard cap on pages visited (failed fetches count too)
REQUEST_DELAY_SECONDS = 0.4  # politeness delay after each processed page
TIMEOUT_SECONDS = 15  # per-request timeout passed to requests
OUTPUT_FILE = "site_data2.json"  # crawl results are dumped here as JSON
VERIFY_SSL = False  # TLS verification disabled — presumably the site's
                    # certificate is broken; NOTE(review): confirm this is intended
def is_same_domain(url: str, base_netloc: str) -> bool:
    """Return True when *url* lives on the host named by *base_netloc*."""
    candidate_netloc = urlparse(url).netloc
    return candidate_netloc == base_netloc
def normalize_url(url: str) -> str:
    """Canonicalize *url*: drop the query string and fragment, then trim
    any trailing slash so equivalent pages dedupe to one key."""
    parts = urlparse(url)
    stripped = parts._replace(query="", fragment="")
    return stripped.geturl().rstrip("/")
def extract_text(soup: BeautifulSoup) -> str:
    """Return the visible text of *soup* as a single whitespace-normalized
    string. Script, style and noscript subtrees are removed first so only
    rendered text survives."""
    for unwanted in soup.find_all(["script", "style", "noscript"]):
        unwanted.decompose()
    raw_text = soup.get_text(separator=" ", strip=True)
    return re.sub(r"\s+", " ", raw_text).strip()
def crawl_site(start_url: str, max_pages: int = 100):
    """Breadth-first crawl of the domain that *start_url* belongs to.

    Fetches up to *max_pages* pages (failed and non-HTML fetches count
    toward the cap), extracts each page's visible text, and returns a
    list of dicts with keys ``site_url``, ``url``, ``title``, ``text``.

    Only http(s) links on the same netloc as *start_url* are followed;
    queries and fragments are stripped before queueing. Uses the
    module-level TIMEOUT_SECONDS, VERIFY_SSL and REQUEST_DELAY_SECONDS
    settings.
    """
    session = requests.Session()
    session.headers.update(
        {
            "User-Agent": "Mozilla/5.0 (compatible; simple-crawler/1.0; +https://example.com/bot)"
        }
    )
    if not VERIFY_SSL:
        # TLS verification is off; silence the per-request InsecureRequestWarning.
        requests.packages.urllib3.disable_warnings()

    base_netloc = urlparse(start_url).netloc
    root = normalize_url(start_url)
    queue = deque([root])
    # BUG FIX: track everything ever enqueued, not just visited pages —
    # otherwise a heavily-linked URL is appended to the queue once per
    # page that links to it, bloating the deque with duplicates.
    enqueued = {root}
    visited = set()
    results = []

    while queue and len(visited) < max_pages:
        current_url = queue.popleft()
        if current_url in visited:
            continue

        try:
            response = session.get(
                current_url,
                timeout=TIMEOUT_SECONDS,
                verify=VERIFY_SSL,
            )
            response.raise_for_status()
        except requests.RequestException as exc:
            # Best-effort crawl: log and move on, but count the page so a
            # wall of dead links can't loop forever.
            print(f"[SKIP] {current_url} -> {exc}")
            visited.add(current_url)
            continue

        content_type = response.headers.get("Content-Type", "")
        if "text/html" not in content_type:
            # Binary/other payloads (PDFs, images, ...) are skipped.
            visited.add(current_url)
            continue

        soup = BeautifulSoup(response.text, "html.parser")
        page_title = soup.title.get_text(strip=True) if soup.title else ""
        page_text = extract_text(soup)

        results.append(
            {
                # BUG FIX: record the crawl's actual seed (the parameter),
                # not the module-level START_URL constant — the original
                # wrote wrong data whenever called with another seed.
                "site_url": start_url,
                "url": current_url,
                "title": page_title,
                "text": page_text,
            }
        )

        visited.add(current_url)
        print(f"[OK] ({len(visited)}/{max_pages}) {current_url}")

        for a_tag in soup.find_all("a", href=True):
            href = a_tag["href"].strip()
            absolute_url = normalize_url(urljoin(current_url, href))
            parsed = urlparse(absolute_url)
            if parsed.scheme not in {"http", "https"}:
                continue  # mailto:, javascript:, tel:, etc.
            if not is_same_domain(absolute_url, base_netloc):
                continue  # stay on the seed's domain
            if absolute_url not in visited and absolute_url not in enqueued:
                enqueued.add(absolute_url)
                queue.append(absolute_url)

        # Politeness delay between successfully processed pages.
        time.sleep(REQUEST_DELAY_SECONDS)

    return results
if __name__ == "__main__":
    # Run the crawl, then persist the results as pretty-printed UTF-8 JSON.
    pages = crawl_site(START_URL, MAX_PAGES)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as out_file:
        json.dump(pages, out_file, ensure_ascii=False, indent=2)
    print(f"\nSaved {len(pages)} pages to {OUTPUT_FILE}")
# End of file (GitHub UI residue removed).