Add boutmoun12.py and clean_data_spacy.py

2026-03-09 15:25:02 +03:00
--- a//arabic.py
+++ b//arabic.py
@@ -1,68 +0,0 @@
 import requests
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin, urlparse
 import time
 import json
 # --- crawl configuration and shared state ---
 # Entry point of the crawl; only links on this URL's host are followed.
 start_url = "https://dgam.gov.sy/?page_id=9606"
 # Headers sent with every request.
 headers = {"User-Agent": "Mozilla/5.0"}
 # Crawl frontier (consumed front to back), seeded with the entry point.
 to_visit = [start_url]
 # URLs that have already been processed.
 visited = set()
 # One {"url": ..., "content": ...} dict per page that yielded text.
 results = []
 #cleaning-------------------------------------------------------------------------------
 def clean_text(text):
    """Collapse every run of whitespace in *text* into a single space.

    Leading and trailing whitespace disappears because the text is split
    on whitespace and re-joined.
    """
    words = text.split()
    return " ".join(words)
 # --- crawler ---
 # Host of the start URL; computed once because it never changes in the loop.
 site_netloc = urlparse(start_url).netloc
 while to_visit:
    url = to_visit.pop(0)
    if url in visited:
        continue
    # Mark the URL as handled before fetching it. Otherwise a URL whose
    # request fails is never recorded and is queued (and retried) again every
    # time another page links to it.
    visited.add(url)
    print("VISITING:", url)
    try:
        response = requests.get(url, headers=headers, timeout=10)
    except Exception as e:
        # Best effort: report the failure and move on to the next URL.
        print("Error:", e)
        continue
    soup = BeautifulSoup(response.text, "html.parser")
    # --- text: keep <p> elements longer than 10 characters once stripped ---
    paragraphs = soup.find_all("p")
    text = "\n".join(
        clean_text(p.get_text())
        for p in paragraphs
        if len(p.get_text(strip=True)) > 10
    )
    if text:
        results.append({
            "url": url,
            "content": text
        })
    # --- links: queue every link on the same host that is not known yet ---
    for link in soup.find_all("a", href=True):
        new_url = urljoin(url, link["href"])
        # Check that the link is within the same domain.
        if urlparse(new_url).netloc != site_netloc:
            continue
        if new_url not in visited and new_url not in to_visit:
            to_visit.append(new_url)
    # Pause between pages to keep the request rate low.
    time.sleep(1)
 # --- JSON output ---
 # Store every collected page as UTF-8 JSON, keeping non-ASCII text readable.
 with open("site_full_data.json", "w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(results, ensure_ascii=False, indent=4))
 print("SCRAPING FINISHED. Total pages:", len(results))
--- a//site_full_data.json
+++ b//site_full_data.json
--- a/boutmoun12.py
+++ b/boutmoun12.py
@@ -0,0 +1,109 @@
 import json
 import re
 import time
 from collections import deque
 from urllib.parse import urljoin, urlparse
 import requests
 from bs4 import BeautifulSoup
 # --- crawl target and output ---
 # Page the crawl starts from; its host bounds the crawl.
 START_URL = "https://sana.sy"
 # Upper bound on the number of URLs processed in one run.
 MAX_PAGES = 10_000
 # File the crawl results are written to as JSON.
 OUTPUT_FILE = "site_data2.json"
 # --- network behaviour ---
 # Pause after each successfully processed page, in seconds.
 REQUEST_DELAY_SECONDS = 0.4
 # Per-request timeout passed to the HTTP session, in seconds.
 TIMEOUT_SECONDS = 15
 # Passed as ``verify=`` to every request; False turns off TLS certificate
 # verification. NOTE(review): confirm this is intentional for the target site.
 VERIFY_SSL = False
 def is_same_domain(url: str, base_netloc: str) -> bool:
    """Return True when the network location of *url* equals *base_netloc*.

    The comparison is an exact string match on ``urlparse(url).netloc``, so a
    ``www.`` prefix, a port or a difference in letter case counts as a
    different domain.
    """
    netloc = urlparse(url).netloc
    return netloc == base_netloc
 def normalize_url(url: str) -> str:
    """Return *url* without its query string, fragment and trailing slashes.

    Because the query is dropped, URLs that differ only in their query string
    normalise to the same value.
    """
    stripped = urlparse(url)._replace(query="", fragment="")
    rebuilt = stripped.geturl()
    return rebuilt.rstrip("/")
 def extract_text(soup: BeautifulSoup) -> str:
    """Return the text of *soup* as a single whitespace-normalised line.

    ``script``, ``style`` and ``noscript`` elements are removed from *soup*
    in place before the text is read, so the caller's tree is modified.
    """
    for element in soup(["script", "style", "noscript"]):
        element.decompose()
    raw = soup.get_text(separator=" ", strip=True)
    collapsed = re.sub(r"\s+", " ", raw)
    return collapsed.strip()
 def crawl_site(start_url: str, max_pages: int = 100):
    """Breadth-first crawl of the pages on the same host as *start_url*.

    Args:
        start_url: Page the crawl starts from. Only links whose netloc equals
            this URL's netloc are followed.
        max_pages: Upper bound on the number of URLs processed. URLs that
            fail or are not HTML count towards it as well.

    Returns:
        A list of dicts with the keys ``site_url``, ``url``, ``title`` and
        ``text``, one per HTML page that was fetched successfully.
    """
    if not VERIFY_SSL:
        # Certificate checks are off, so silence urllib3's per-request warning.
        requests.packages.urllib3.disable_warnings()
    base_netloc = urlparse(start_url).netloc
    first_url = normalize_url(start_url)
    queue = deque([first_url])
    # Every URL ever placed on the queue. Checking this set (instead of only
    # the visited set) keeps the same link from being enqueued once per page
    # that references it; the visiting order is unchanged.
    enqueued = {first_url}
    visited = set()
    results = []
    # The context manager closes the session's pooled connections on exit.
    with requests.Session() as session:
        session.headers.update(
            {
                "User-Agent": "Mozilla/5.0 (compatible; simple-crawler/1.0; +https://example.com/bot)"
            }
        )
        while queue and len(visited) < max_pages:
            current_url = queue.popleft()
            if current_url in visited:
                continue
            # Recorded up front: failed and non-HTML URLs are not retried.
            visited.add(current_url)
            try:
                response = session.get(
                    current_url,
                    timeout=TIMEOUT_SECONDS,
                    verify=VERIFY_SSL,
                )
                response.raise_for_status()
            except requests.RequestException as exc:
                print(f"[SKIP] {current_url} -> {exc}")
                continue
            content_type = response.headers.get("Content-Type", "")
            if "text/html" not in content_type:
                continue
            soup = BeautifulSoup(response.text, "html.parser")
            page_title = soup.title.get_text(strip=True) if soup.title else ""
            page_text = extract_text(soup)
            results.append(
                {
                    # Use the argument, not the module constant, so records
                    # stay correct when another site is crawled.
                    "site_url": start_url,
                    "url": current_url,
                    "title": page_title,
                    "text": page_text,
                }
            )
            print(f"[OK] ({len(visited)}/{max_pages}) {current_url}")
            for a_tag in soup.find_all("a", href=True):
                href = a_tag["href"].strip()
                absolute_url = normalize_url(urljoin(current_url, href))
                if urlparse(absolute_url).scheme not in {"http", "https"}:
                    continue
                if not is_same_domain(absolute_url, base_netloc):
                    continue
                if absolute_url not in enqueued:
                    enqueued.add(absolute_url)
                    queue.append(absolute_url)
            # Pause only after pages that were actually processed.
            time.sleep(REQUEST_DELAY_SECONDS)
    return results
 if __name__ == "__main__":
    # Crawl the configured site and store every page record as UTF-8 JSON.
    data = crawl_site(START_URL, MAX_PAGES)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as out_file:
        out_file.write(json.dumps(data, ensure_ascii=False, indent=2))
    print(f"\nSaved {len(data)} pages to {OUTPUT_FILE}")
--- a/clean_data_spacy.py
+++ b/clean_data_spacy.py
@@ -0,0 +1,258 @@
 import argparse
 import json
 import re
 from pathlib import Path
 from urllib.parse import urlparse
 import spacy
 # Literal substrings that strip_noise() deletes from titles and page text.
 NOISE_SNIPPETS = [
    "القائمة الرئيسية",  # "Main menu"
    "جميع الحقوق محفوظة",  # "All rights reserved"
    "حقوق النشر",  # "Copyright"
    "تواصل معنا",  # "Contact us"
    "AR EN",
    "KO KO",
 ]
 def normalize_url(u: str) -> str:
    """Return *u* trimmed, without query string, fragment or trailing slashes.

    A falsy *u* (``None`` or ``""``) is treated as the empty string.
    """
    raw = u.strip() if u else ""
    parts = urlparse(raw)._replace(query="", fragment="")
    return parts.geturl().rstrip("/")
 def fix_mojibake(text: str) -> str:
    """Undo UTF-8 text that was mis-decoded as Latin-1 or Windows-1252.

    The input is re-encoded with each legacy codec and decoded as UTF-8. The
    original and every non-empty re-decoding are scored, and the best-scoring
    candidate is returned; the original wins ties because it is listed first.

    Args:
        text: Possibly garbled text. A falsy value yields ``""``.

    Returns:
        The candidate with the highest score.
    """
    if not text:
        return ""
    candidates = [text]
    for src_enc in ("latin1", "cp1252"):
        try:
            # errors="ignore" silently drops whatever cannot be converted, so
            # a candidate may be a lossy version of the input.
            decoded = text.encode(src_enc, errors="ignore").decode("utf-8", errors="ignore")
        except UnicodeError:
            continue
        if decoded:
            candidates.append(decoded)
    def score(s: str) -> int:
        # Reward Arabic letters, penalise typical Latin mojibake artefacts.
        arabic = len(re.findall(r"[\u0600-\u06FF]", s))
        # Only Latin artefacts count as damage. The Arabic letters "ط" and "ظ"
        # used to be penalised too, but no candidate here is produced by
        # reversing a cp1256 mis-decoding, so that penalty could only demote
        # correct Arabic text below a candidate with its Arabic dropped.
        broken = sum(s.count(ch) for ch in ("Ø", "Ù", "Ã", "Â"))
        return arabic - (2 * broken)
    return max(candidates, key=score)
 def strip_noise(text: str) -> str:
    """Replace every NOISE_SNIPPETS entry in *text* with a space.

    Each snippet is matched as a plain substring, both as written and in its
    lower-cased form; whitespace is collapsed afterwards.
    """
    cleaned = text
    for phrase in NOISE_SNIPPETS:
        for variant in (phrase, phrase.lower()):
            cleaned = cleaned.replace(variant, " ")
    collapsed = re.sub(r"\s+", " ", cleaned)
    return collapsed.strip()
 def remove_terms(text: str, terms):
    """Replace each non-empty entry of *terms* in *text* with a space.

    Terms are matched as plain substrings; falsy entries are skipped and
    whitespace is collapsed afterwards.
    """
    result = text
    for needle in filter(None, terms):
        result = result.replace(needle, " ")
    collapsed = re.sub(r"\s+", " ", result)
    return collapsed.strip()
 def strip_signs(text: str) -> str:
    """Replace punctuation and symbols with spaces, then collapse whitespace.

    Word characters (letters, digits and ``_``), whitespace and the Arabic
    block U+0600-U+06FF are kept, which preserves Arabic diacritics. The
    punctuation marks inside that block (comma, date separator, semicolon,
    triple dot, question mark, percent sign, decimal and thousands
    separators, star, full stop) are removed explicitly; previously they
    survived because they fall inside the kept range.
    """
    t = re.sub(
        r"[^\w\s\u0600-\u06FF]|[\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4]",
        " ",
        text,
    )
    return re.sub(r"\s+", " ", t).strip()
 def dedupe_sentences(text: str) -> str:
    """Drop repeated sentence-like chunks from *text*, keeping first ones.

    The text is split after sentence-ending punctuation (Latin or Arabic)
    followed by whitespace, and on newlines. Chunks are compared
    case-insensitively after whitespace normalisation; the survivors are
    joined with single spaces in their original order.
    """
    first_seen = {}
    for piece in re.split(r"(?<=[\.\!\?؟؛])\s+|\n+", text):
        normalized = re.sub(r"\s+", " ", piece).strip()
        if normalized:
            # setdefault keeps the spelling of the first occurrence.
            first_seen.setdefault(normalized.lower(), normalized)
    return " ".join(first_seen.values()).strip()
 def split_chunks(text: str):
    """Split *text* into non-empty, whitespace-normalised sentence chunks.

    Splitting happens after sentence-ending punctuation (Latin or Arabic)
    followed by whitespace, and on newlines.
    """
    parts = re.split(r"(?<=[\.\!\?؟؛])\s+|\n+", text)
    normalized = (re.sub(r"\s+", " ", part).strip() for part in parts)
    return [chunk for chunk in normalized if chunk]
 def remove_global_repeated_chunks(text: str, repeated_chunks):
    """Return *text* without the chunks contained in *repeated_chunks*.

    The text is split with split_chunks(); the remaining chunks are joined
    with single spaces.
    """
    survivors = (
        chunk for chunk in split_chunks(text) if chunk not in repeated_chunks
    )
    return " ".join(survivors).strip()
 def detokenize(tokens):
    """Join *tokens* into text with conventional spacing around punctuation.

    Closing punctuation (Latin and Arabic) attaches to the preceding token,
    and the token following an opening bracket attaches to that bracket.
    Every other token is preceded by a single space.

    Args:
        tokens: Iterable of token strings.

    Returns:
        The joined text, stripped of leading and trailing whitespace.
    """
    no_space_before = {
        ".",
        ",",
        ":",
        ";",
        "!",
        "?",
        ")",
        "]",
        "}",
        "،",
        "؛",
        "؟",
    }
    no_space_after = {"(", "[", "{"}
    pieces = []
    prev_tok = None
    for tok in tokens:
        # Compare against the previous *token*, not the previous output piece.
        # A piece may carry a leading space or already-merged text, in which
        # case it never equals a bare opening bracket and the bracket rule
        # would only work for a bracket at the very start.
        glue = (
            prev_tok is None
            or tok in no_space_before
            or prev_tok in no_space_after
        )
        pieces.append(tok if glue else " " + tok)
        prev_tok = tok
    return "".join(pieces).strip()
 def clean_text_with_spacy(nlp, text: str) -> str:
    """Tokenise *text* with *nlp* and rejoin the non-space tokens.

    Tokens flagged ``is_space`` are dropped; the rest are passed to
    detokenize().
    """
    words = []
    for token in nlp(text):
        if token.is_space:
            continue
        words.append(token.text)
    return detokenize(words)
 def _as_text(value) -> str:
    """Return *value* as a string, mapping ``None`` to the empty string."""
    return "" if value is None else str(value)
 def _clean_field(nlp, raw, terms_to_remove, strip_signs_flag: bool) -> str:
    """Run one title/text value through the per-record cleaning steps."""
    value = clean_text_with_spacy(nlp, strip_noise(fix_mojibake(_as_text(raw))))
    if terms_to_remove:
        value = remove_terms(value, terms_to_remove)
    if strip_signs_flag:
        value = strip_signs(value)
    return dedupe_sentences(value)
 def clean_records(
    records,
    terms_to_remove,
    strip_signs_flag: bool,
    cross_record_repeat_min: int,
 ):
    """Clean crawled records and drop text repeated across many of them.

    Args:
        records: Iterable of raw records. Entries that are not dicts, or
            whose ``url`` is missing or empty, are skipped.
        terms_to_remove: Substrings deleted from every title and text.
        strip_signs_flag: When true, punctuation and symbols are removed.
        cross_record_repeat_min: A chunk of 8 or more characters that occurs
            in at least this many records is removed from all of them. A
            value of 1 or less therefore removes every such chunk.

    Returns:
        A list of dicts with the keys ``site_url``, ``url``, ``title`` and
        ``text``.
    """
    nlp = spacy.blank("xx")
    cleaned = []
    for row in records:
        if not isinstance(row, dict):
            continue
        # A JSON null must become "" rather than the literal text "None";
        # str(None) would also yield a truthy "None" URL.
        url = normalize_url(_as_text(row.get("url")))
        if not url:
            # Dropped anyway, so skip the costly cleaning of title and text.
            continue
        cleaned.append(
            {
                "site_url": normalize_url(_as_text(row.get("site_url"))),
                "url": url,
                "title": _clean_field(nlp, row.get("title"), terms_to_remove, strip_signs_flag),
                "text": _clean_field(nlp, row.get("text"), terms_to_remove, strip_signs_flag),
            }
        )
    # Remove chunks repeated across many records without deleting records.
    chunk_doc_count = {}
    for rec in cleaned:
        # The set counts each chunk at most once per record.
        for ch in set(split_chunks(rec["title"]) + split_chunks(rec["text"])):
            if len(ch) < 8:
                continue
            chunk_doc_count[ch] = chunk_doc_count.get(ch, 0) + 1
    repeated_chunks = {
        ch
        for ch, cnt in chunk_doc_count.items()
        if cnt >= cross_record_repeat_min
    }
    for rec in cleaned:
        for field in ("title", "text"):
            kept = remove_global_repeated_chunks(rec[field], repeated_chunks)
            rec[field] = re.sub(r"\s+", " ", kept).strip()
    return cleaned
 def main():
    """Command-line entry point: read raw JSON, clean it, write the result.

    Exits with a message when the input file is missing or when its JSON
    content is not a list.
    """
    cli = argparse.ArgumentParser(description="Clean crawled JSON data using spaCy tokenization.")
    cli.add_argument("--input", default="site_data_raw.json")
    cli.add_argument("--output", default="site_data_clean.json")
    cli.add_argument("--compact", action="store_true", help="Write compact JSON without indentation")
    cli.add_argument(
        "--remove-terms",
        default="",
        help="Comma-separated terms to remove from title/text, e.g. \"حمزة المصطفى,محمد\"",
    )
    cli.add_argument(
        "--strip-signs",
        action="store_true",
        help="Remove punctuation/symbol signs from title/text.",
    )
    cli.add_argument(
        "--cross-record-repeat-min",
        type=int,
        default=3,
        help="Remove chunks repeated in this many records or more (without deleting records).",
    )
    args = cli.parse_args()
    source = Path(args.input)
    if not source.exists():
        raise SystemExit(f"Input file not found: {args.input}")
    # utf-8-sig also accepts files that start with a byte-order mark.
    data = json.loads(source.read_text(encoding="utf-8-sig"))
    if not isinstance(data, list):
        raise SystemExit("Input JSON must be a list of objects.")
    stripped_terms = (part.strip() for part in args.remove_terms.split(","))
    terms_to_remove = [term for term in stripped_terms if term]
    cleaned = clean_records(
        data,
        terms_to_remove=terms_to_remove,
        strip_signs_flag=args.strip_signs,
        cross_record_repeat_min=args.cross_record_repeat_min,
    )
    # Compact output drops indentation and the spaces after separators.
    layout = {"separators": (",", ":")} if args.compact else {"indent": 2}
    payload = json.dumps(cleaned, ensure_ascii=False, **layout)
    Path(args.output).write_text(payload, encoding="utf-8")
    print(f"Input records: {len(data)}")
    print(f"Cleaned records: {len(cleaned)}")
    print(f"Saved to: {args.output}")
 if __name__ == "__main__":
    main()