# Copied from boutmoun/ArabicNLPResources
# Commit: "Add boutmoun12.py and clean_data_spacy.py" — this file is
# boutmoun12.py (new file, 109 lines).
import json
import re
import time
from collections import deque
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
||||
# --- Crawl configuration -------------------------------------------------
START_URL = "https://sana.sy"  # seed URL; the crawl stays on this domain
MAX_PAGES = 10000  # hard cap on pages visited (failed fetches count too)
REQUEST_DELAY_SECONDS = 0.4  # politeness delay after each processed page
TIMEOUT_SECONDS = 15  # per-request timeout passed to requests
OUTPUT_FILE = "site_data2.json"  # crawl results are dumped here as JSON
VERIFY_SSL = False  # TLS verification disabled — presumably the site's
                    # certificate is broken; NOTE(review): confirm this is intended
def is_same_domain(url: str, base_netloc: str) -> bool:
    """Return True when *url* lives on the host named by *base_netloc*."""
    candidate_netloc = urlparse(url).netloc
    return candidate_netloc == base_netloc
def normalize_url(url: str) -> str:
    """Canonicalize *url*: drop the query string and fragment, then trim
    any trailing slash so equivalent pages dedupe to one key."""
    parts = urlparse(url)
    stripped = parts._replace(query="", fragment="")
    return stripped.geturl().rstrip("/")
def extract_text(soup: BeautifulSoup) -> str:
    """Return the visible text of *soup* as a single whitespace-normalized
    string. Script, style and noscript subtrees are removed first so only
    rendered text survives."""
    for unwanted in soup.find_all(["script", "style", "noscript"]):
        unwanted.decompose()
    raw_text = soup.get_text(separator=" ", strip=True)
    return re.sub(r"\s+", " ", raw_text).strip()
def crawl_site(start_url: str, max_pages: int = 100):
    """Breadth-first crawl of the domain that *start_url* belongs to.

    Fetches up to *max_pages* pages (failed and non-HTML fetches count
    toward the cap), extracts each page's visible text, and returns a
    list of dicts with keys ``site_url``, ``url``, ``title``, ``text``.

    Only http(s) links on the same netloc as *start_url* are followed;
    queries and fragments are stripped before queueing. Uses the
    module-level TIMEOUT_SECONDS, VERIFY_SSL and REQUEST_DELAY_SECONDS
    settings.
    """
    session = requests.Session()
    session.headers.update(
        {
            "User-Agent": "Mozilla/5.0 (compatible; simple-crawler/1.0; +https://example.com/bot)"
        }
    )
    if not VERIFY_SSL:
        # TLS verification is off; silence the per-request InsecureRequestWarning.
        requests.packages.urllib3.disable_warnings()

    base_netloc = urlparse(start_url).netloc
    root = normalize_url(start_url)
    queue = deque([root])
    # BUG FIX: track everything ever enqueued, not just visited pages —
    # otherwise a heavily-linked URL is appended to the queue once per
    # page that links to it, bloating the deque with duplicates.
    enqueued = {root}
    visited = set()
    results = []

    while queue and len(visited) < max_pages:
        current_url = queue.popleft()
        if current_url in visited:
            continue

        try:
            response = session.get(
                current_url,
                timeout=TIMEOUT_SECONDS,
                verify=VERIFY_SSL,
            )
            response.raise_for_status()
        except requests.RequestException as exc:
            # Best-effort crawl: log and move on, but count the page so a
            # wall of dead links can't loop forever.
            print(f"[SKIP] {current_url} -> {exc}")
            visited.add(current_url)
            continue

        content_type = response.headers.get("Content-Type", "")
        if "text/html" not in content_type:
            # Binary/other payloads (PDFs, images, ...) are skipped.
            visited.add(current_url)
            continue

        soup = BeautifulSoup(response.text, "html.parser")
        page_title = soup.title.get_text(strip=True) if soup.title else ""
        page_text = extract_text(soup)

        results.append(
            {
                # BUG FIX: record the crawl's actual seed (the parameter),
                # not the module-level START_URL constant — the original
                # wrote wrong data whenever called with another seed.
                "site_url": start_url,
                "url": current_url,
                "title": page_title,
                "text": page_text,
            }
        )

        visited.add(current_url)
        print(f"[OK] ({len(visited)}/{max_pages}) {current_url}")

        for a_tag in soup.find_all("a", href=True):
            href = a_tag["href"].strip()
            absolute_url = normalize_url(urljoin(current_url, href))
            parsed = urlparse(absolute_url)
            if parsed.scheme not in {"http", "https"}:
                continue  # mailto:, javascript:, tel:, etc.
            if not is_same_domain(absolute_url, base_netloc):
                continue  # stay on the seed's domain
            if absolute_url not in visited and absolute_url not in enqueued:
                enqueued.add(absolute_url)
                queue.append(absolute_url)

        # Politeness delay between successfully processed pages.
        time.sleep(REQUEST_DELAY_SECONDS)

    return results
if __name__ == "__main__":
    # Run the crawl, then persist the results as pretty-printed UTF-8 JSON.
    pages = crawl_site(START_URL, MAX_PAGES)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as out_file:
        json.dump(pages, out_file, ensure_ascii=False, indent=2)
    print(f"\nSaved {len(pages)} pages to {OUTPUT_FILE}")
# End of file (GitHub UI residue removed).