#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Automate 100 Arabic search queries on https://mithal.space/ for research.
Uses Playwright (sync API) to capture screenshots and structured result data.

File: mithal_search_automation.py
Dependency (requirements.txt): playwright>=1.40.0
"""

from __future__ import annotations

import csv
import json
import logging
import os
import re
import sys
import time
import traceback
from datetime import datetime
from pathlib import Path
from urllib.parse import quote

# =============================================================================
# STEP 1 – Import Playwright without killing the interpreter at import time.
# The actual "package missing" report (and exit) happens at the start of
# run_automation(), so the pure helpers below stay importable and testable.
# =============================================================================
try:
    from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
    from playwright.sync_api import sync_playwright
except ImportError:
    sync_playwright = None

    class PlaywrightTimeoutError(Exception):
        """Stand-in so ``except PlaywrightTimeoutError`` stays valid without Playwright."""


# Base URL and timing constants (milliseconds unless noted)
BASE_URL = "https://mithal.space/search"
NAVIGATION_TIMEOUT_MS = 15_000
RESULTS_WAIT_TIMEOUT_MS = 10_000
RESULTS_POLL_INTERVAL_MS = 200  # pause between visibility polls of the selectors
DELAY_BETWEEN_QUERIES_SEC = 2
MAX_RETRIES = 2  # up to 2 retries after the first attempt (3 tries total)
BROWSER_LAUNCH_TIMEOUT_MS = 10_000
STARTUP_TEST_TIMEOUT_MS = 15_000
TOTAL_TOPICS = 100  # number of queries generate_arabic_topics() returns

# CSS selectors for mithal.space (with fallbacks)
RESULTS_CONTAINER_SELECTORS = [
    "#results-container",
    ".results-container",
    "#visible-results",
    "a.result-link",
    ".result",
    "div[class*='result']",
]

# Characters Windows forbids in file names, and its reserved device names.
_WINDOWS_INVALID_CHARS = frozenset('<>:"/\\|?*')
_WINDOWS_RESERVED_NAMES = frozenset(
    {"CON", "PRN", "AUX", "NUL"}
    | {f"COM{i}" for i in range(1, 10)}
    | {f"LPT{i}" for i in range(1, 10)}
)

# JavaScript run in the page to extract results (Arabic-safe via browser DOM)
EXTRACT_RESULTS_JS = r"""() => {
  const norm = (s) => (s || '').replace(/\s+/g, ' ').trim();

  const links = Array.from(document.querySelectorAll('a.result-link'));
  const results = links.slice(0, 10).map((anchor) => {
    const box = anchor.querySelector('.result') || anchor;
    const titleEl = box.querySelector('h3');
    const snippetEl = box.querySelector('p');
    const urlEl = box.querySelector('small.arabic-url');
    const href = anchor.getAttribute('href') || '';
    const dataUrl = urlEl ? urlEl.getAttribute('data-url') : '';
    return {
      title: norm(titleEl ? titleEl.innerText : anchor.innerText.split('\n')[0]),
      url: norm(dataUrl || href),
      snippet: norm(snippetEl ? snippetEl.innerText : ''),
    };
  });

  let resultCount = null;
  const bodyText = document.body ? document.body.innerText : '';

  const countPatterns = [
    /(\d[\d,\.]*)\s*نتيجة/iu,
    /(\d[\d,\.]*)\s*نتائج/iu,
    /نتائج\s*[::]?\s*(\d[\d,\.]*)/iu,
    /(\d[\d,\.]*)\s*results?/i,
    /about\s+(\d[\d,\.]*)\s+results?/i,
  ];
  for (const re of countPatterns) {
    const m = bodyText.match(re);
    if (m) {
      resultCount = parseInt(m[1].replace(/[,\.]/g, ''), 10);
      if (!Number.isNaN(resultCount)) break;
    }
  }

  if (resultCount === null) {
    const stats = document.querySelector('.search-header-stats');
    if (stats) {
      const digits = (stats.innerText || '').match(/\d+/);
      if (digits) resultCount = parseInt(digits[0], 10);
    }
  }

  if (resultCount === null) {
    const itemCount = document.querySelectorAll('a.result-link, .result').length;
    if (itemCount > 0) resultCount = itemCount;
  }

  const special = [];
  const spellPatterns = [/هل تقصد/iu, /did you mean/i, /ربما تقصد/iu];
  for (const el of document.querySelectorAll('a, p, div, span, h2, h3, h4')) {
    const t = norm(el.innerText);
    if (!t || t.length > 300) continue;
    if (spellPatterns.some((re) => re.test(t))) {
      special.push({ type: 'spell_suggestion', text: t });
      break;
    }
  }

  for (const sel of ['.related-searches', '[class*="related"]', '[id*="related"]']) {
    const block = document.querySelector(sel);
    if (block && norm(block.innerText)) {
      special.push({ type: 'related_searches', text: norm(block.innerText).slice(0, 500) });
      break;
    }
  }

  for (const sel of ['.knowledge-panel', '[class*="knowledge"]', '[id*="knowledge"]']) {
    const block = document.querySelector(sel);
    if (block && norm(block.innerText)) {
      special.push({ type: 'knowledge_panel', text: norm(block.innerText).slice(0, 500) });
      break;
    }
  }

  const aiPanel = document.querySelector('.ai-mode-tab.active, [class*="ai-response"], #aiResponse');
  if (aiPanel && norm(aiPanel.innerText)) {
    special.push({ type: 'ai_mode', text: norm(aiPanel.innerText).slice(0, 300) });
  }

  return { result_count: resultCount, results, special_features: special };
}"""


def _exit_if_playwright_missing() -> None:
    """Print install instructions and exit(1) when Playwright could not be imported."""
    if sync_playwright is not None:
        return
    print("=" * 70)
    print("ERROR: Playwright Python package is NOT installed for this Python.")
    print("=" * 70)
    print()
    print("You are running:")
    print(f" {sys.executable}")
    print()
    print("Install Playwright with the SAME interpreter you use to run this script:")
    print()
    print(f' "{sys.executable}" -m pip install playwright')
    print(f' "{sys.executable}" -m playwright install')
    print()
    print("If `playwright` is not found as a command, always use -m playwright instead.")
    print("=" * 70)
    sys.exit(1)


def _is_browser_missing_error(exc: BaseException) -> bool:
    """
    Detect Playwright errors that mean browser binaries were not installed.

    The message is lower-cased before matching, so every needle must be
    lower-case. The API prefix "BrowserType.launch" is deliberately not a
    needle: it appears in every launch failure (timeouts included) and would
    classify all of them as "browser missing".
    """
    msg = str(exc).lower()
    needles = (
        "executable doesn't exist",
        "executable does not exist",
        "browser not found",
        "failed to launch",
        "please run the following command to download",
        "playwright install",
    )
    return any(needle in msg for needle in needles)


def _get_playwright_package_version() -> str:
    """Read installed playwright version from package metadata."""
    try:
        from importlib.metadata import version

        return version("playwright")
    except Exception:
        # Best effort: this value is only printed in the debug banner.
        return "unknown (could not read package metadata)"


def print_playwright_debug_info(playwright_instance) -> None:
    """Print version, Python path, and browser install locations for debugging."""
    print("\n" + "=" * 70)
    print("PLAYWRIGHT DEBUG INFO")
    print("=" * 70)
    print(f"Playwright package version: {_get_playwright_package_version()}")
    print(f"Python executable: {sys.executable}")
    print(f"Python version: {sys.version.split()[0]}")

    env_browsers = os.environ.get("PLAYWRIGHT_BROWSERS_PATH")
    if env_browsers:
        print(f"PLAYWRIGHT_BROWSERS_PATH: {env_browsers}")
    else:
        # Per-platform default download folder used when no override is set.
        if sys.platform == "win32":
            default_path = Path.home() / "AppData" / "Local" / "ms-playwright"
        elif sys.platform == "darwin":
            default_path = Path.home() / "Library" / "Caches" / "ms-playwright"
        else:
            default_path = Path.home() / ".cache" / "ms-playwright"
        print(f"Default browsers folder: {default_path}")
        print(" (override with PLAYWRIGHT_BROWSERS_PATH environment variable)")

    try:
        chromium_path = playwright_instance.chromium.executable_path
        print(f"Chromium executable path: {chromium_path}")
        print(f" exists on disk: {Path(chromium_path).exists()}")
    except Exception as exc:
        print(f"Chromium executable path: (could not resolve: {exc})")

    print("=" * 70 + "\n")


def verify_browsers_installed(playwright_instance) -> None:
    """
    STEP 2 – Try launching Chromium with a short timeout.
    Exit with install instructions if browser binaries are missing.
    """
    print("Checking Chromium browser installation...")
    try:
        browser = playwright_instance.chromium.launch(
            headless=True,
            timeout=BROWSER_LAUNCH_TIMEOUT_MS,
        )
        browser.close()
        print("Chromium launched successfully.\n")
    except Exception as exc:
        print("\n" + "=" * 70)
        print("ERROR: Could not launch Chromium browser.")
        print("=" * 70)
        print(f"\nExact error:\n {type(exc).__name__}: {exc}\n")

        if _is_browser_missing_error(exc):
            print("Browser binaries appear to be missing or incomplete.")
            print("\nRun these commands with the SAME Python you use for this script:\n")
            print(f' "{sys.executable}" -m playwright install')
            print(f' "{sys.executable}" -m playwright install chromium')
            print("\nOptional – install all browsers:")
            print(f' "{sys.executable}" -m playwright install --with-deps')
        else:
            print("If this is unexpected, try reinstalling browsers:")
            print(f' "{sys.executable}" -m playwright install chromium')
            traceback.print_exc()

        print("=" * 70)
        sys.exit(1)


def run_startup_browser_test(playwright_instance, test_dir: Path) -> None:
    """
    STEP 3 – Quick smoke test: open example.com, screenshot, close.
    Exits with the exact error if anything fails.
    """
    print("Running startup browser test (https://example.com)...")
    test_dir.mkdir(parents=True, exist_ok=True)
    test_screenshot = test_dir / "_playwright_startup_test.png"

    try:
        with playwright_instance.chromium.launch(
            headless=True,
            timeout=BROWSER_LAUNCH_TIMEOUT_MS,
        ) as browser:
            with browser.new_context() as context:
                page = context.new_page()
                page.goto(
                    "https://example.com",
                    wait_until="domcontentloaded",
                    timeout=STARTUP_TEST_TIMEOUT_MS,
                )
                page.screenshot(path=str(test_screenshot))
                title = page.title()
        print(f"Startup test PASSED (page title: {title!r})")
        print(f"Test screenshot saved: {test_screenshot.resolve()}\n")
    except Exception as exc:
        print("\n" + "=" * 70)
        print("ERROR: Startup browser test FAILED.")
        print("=" * 70)
        print(f"\nExact error:\n {type(exc).__name__}: {exc}\n")
        traceback.print_exc()
        print("\nFix suggestions:")
        print(f' 1. "{sys.executable}" -m playwright install chromium')
        print(" 2. Check firewall / proxy if navigation to example.com fails")
        print(" 3. Confirm you run the script with the same Python that has playwright:")
        print(f' "{sys.executable}" mithal_search_automation.py')
        print("=" * 70)
        sys.exit(1)


def generate_arabic_topics() -> list[str]:
    """
    Build exactly 100 diverse Arabic search queries in-code (no external file).

    Twelve thematic buckets of nine phrases each (108 candidates). Queries are
    taken round-robin across the buckets, so cutting the list at 100 trims the
    surplus evenly instead of dropping almost all of the last bucket.
    """
    category_phrases: dict[str, list[str]] = {
        "science": [
            "الفيزياء الكمية",
            "تلسكوب جيمس ويب",
            "نظرية النسبية",
            "الخلايا الجذعية",
            "تغير المناخ والعلوم",
            "الحمض النووي DNA",
            "الطاقة الشمسية",
            "المجرة والثقوب السوداء",
            "التطعيمات والمناعة",
        ],
        "technology": [
            "الذكاء الاصطناعي",
            "تعلم الآلة",
            "الحوسبة السحابية",
            "الأمن السيبراني",
            "بلوك تشين",
            "إنترنت الأشياء",
            "الواقع الافتراضي",
            "5G والاتصالات",
            "روبوتات الخدمة",
        ],
        "history": [
            "الحضارة الإسلامية",
            "الدولة العباسية",
            "الحرب العالمية الأولى",
            "الحرب العالمية الثانية",
            "الحضارة المصرية القديمة",
            "الأندلس",
            "الدولة العثمانية",
            "ثورة التكنولوجيا الصناعية",
            "تاريخ الخليج العربي",
        ],
        "culture": [
            "الأدب العربي",
            "الشعر الجاهلي",
            "المطبخ العربي",
            "التراث الشعبي",
            "الخط العربي",
            "الموسيقى العربية",
            "الأزياء التقليدية",
            "الاحتفالات الوطنية",
            "اللغة العربية الفصحى",
        ],
        "religion": [
            "القرآن الكريم",
            "الحديث النبوي",
            "الصلاة وأحكامها",
            "الزكاة والصدقة",
            "الحج والعمرة",
            "الأخلاق الإسلامية",
            "الفقه الإسلامي",
            "السيرة النبوية",
            "الإيمان والتوحيد",
        ],
        "sports": [
            "كرة القدم العالمية",
            "دوري روشن السعودي",
            "كأس العالم FIFA",
            "الألعاب الأولمبية",
            "كرة السلة NBA",
            "التنس وجائزة ويمبلدون",
            "السباقات والماراثون",
            "الرياضة النسائية",
            "اللياقة البدنية",
        ],
        "business": [
            "ريادة الأعمال",
            "التسويق الرقمي",
            "الشركات الناشئة",
            "الاستثمار في الأسهم",
            "إدارة المشاريع",
            "التجارة الإلكترونية",
            "ريادة الأعمال الاجتماعية",
            "رؤية السعودية 2030",
            "الابتكار في الأعمال",
        ],
        "health": [
            "التغذية الصحية",
            "السكري وطرق الوقاية",
            "الصحة النفسية",
            "اللقاحات والأوبئة",
            "الرياضة والصحة",
            "النوم وجودته",
            "أمراض القلب",
            "الطب عن بعد",
            "الصحة العامة",
        ],
        "arts": [
            "الرسم والفن التشكيلي",
            "السينما العربية",
            "المسرح والدراما",
            "التصوير الفوتوغرافي",
            "العمارة الإسلامية",
            "النحت والتركيب",
            "الفن الرقمي",
            "الأوبرا والباليه",
            "المتاحف والمعارض",
        ],
        "politics": [
            "الأمم المتحدة",
            "حقوق الإنسان",
            "الدبلوماسية الدولية",
            "الانتخابات والديمقراطية",
            "العلاقات العربية",
            "القانون الدولي",
            "السياسة الخارجية",
            "الحكم الرشيد",
            "السلام والنزاعات",
        ],
        "environment": [
            "إعادة التدوير",
            "التنوع البيولوجي",
            "الطاقة المتجددة",
            "تلوث الهواء",
            "حماية المحيطات",
            "الغابات والتصحر",
            "المياه العذبة",
            "الاستدامة البيئية",
            "انبعاثات الكربون",
        ],
        "daily_life": [
            "وصفات طبخ سريعة",
            "تربية الأطفال",
            "إدارة الوقت",
            "السفر والسياحة",
            "التسوق عبر الإنترنت",
            "العمل من المنزل",
            "الادخار والميزانية",
            "الحيوانات الأليفة",
            "الحدائق المنزلية",
        ],
    }

    modifiers = [
        "{phrase}",
        "ما هو {phrase}",
        "أفضل {phrase}",
        "تاريخ {phrase}",
        "دليل {phrase} للمبتدئين",
    ]

    buckets = list(category_phrases.values())
    longest_bucket = max(len(phrases) for phrases in buckets)

    topics: list[str] = []
    # Outer loop walks phrase positions, inner loop walks categories, so each
    # round contributes one query per category before the next round starts.
    for phrase_index in range(longest_bucket):
        for category_index, phrases in enumerate(buckets):
            if phrase_index >= len(phrases):
                continue
            template = modifiers[(category_index + phrase_index) % len(modifiers)]
            topics.append(template.format(phrase=phrases[phrase_index]))
            if len(topics) == TOTAL_TOPICS:
                return topics

    return topics


def sanitize_filename(topic: str, max_length: int = 80) -> str:
    """
    Make a Windows-safe filename stem from an Arabic topic string.

    Whitespace runs collapse to one space; characters Windows forbids and
    control characters become "_"; leading/trailing spaces and dots are
    removed (also after truncation to ``max_length``); an empty result becomes
    "topic"; reserved device names such as CON or NUL get a "_" prefix (which
    may make the stem one character longer than ``max_length``).
    """
    collapsed = re.sub(r"\s+", " ", topic)
    cleaned = "".join(
        "_" if (ch in _WINDOWS_INVALID_CHARS or ord(ch) < 32) else ch
        for ch in collapsed
    )
    cleaned = cleaned.strip(" .")
    if len(cleaned) > max_length:
        # Truncation can expose a new trailing space/dot, so strip again.
        cleaned = cleaned[:max_length].rstrip(" .")
    if not cleaned:
        cleaned = "topic"
    # Windows reserves these names regardless of extension ("CON.png" is invalid).
    if cleaned.split(".", 1)[0].upper() in _WINDOWS_RESERVED_NAMES:
        cleaned = f"_{cleaned}"
    return cleaned


def _reserve_unique_stem(stem: str, used_filenames: set[str]) -> str:
    """Return ``stem`` (or ``stem_2``, ``stem_3``, ...) not yet in the set, and record it."""
    candidate = stem
    suffix = 2
    while candidate in used_filenames:
        candidate = f"{stem}_{suffix}"
        suffix += 1
    used_filenames.add(candidate)
    return candidate


def build_search_url(topic: str) -> str:
    """URL-encode the Arabic query for the search endpoint."""
    return f"{BASE_URL}?q={quote(topic, safe='')}"


def setup_output_dirs() -> tuple[Path, Path, Path]:
    """
    Create output/YYYY-MM-DD_HH-MM-SS/ with screenshots/ subfolder.

    Returns (run_dir, screenshots_dir, csv_path); the CSV file itself is not
    created here.
    """
    run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    run_dir = Path("output") / run_timestamp
    screenshots_dir = run_dir / "screenshots"
    screenshots_dir.mkdir(parents=True, exist_ok=True)
    csv_path = run_dir / "data.csv"
    return run_dir, screenshots_dir, csv_path


def setup_logging(log_path: Path) -> logging.Logger:
    """Plain-text log file plus console output for actions and errors."""
    logger = logging.getLogger("mithal_search")
    logger.setLevel(logging.DEBUG)

    # Detach AND close earlier handlers so a repeated call does not leak the
    # previous log file handle.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    formatter = logging.Formatter(
        "%(asctime)s | %(levelname)s | %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )

    file_handler = logging.FileHandler(log_path, encoding="utf-8")
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    # Console shows INFO and above; the file receives everything from DEBUG.
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(logging.INFO)
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)

    return logger


def wait_for_results_container(page, logger: logging.Logger) -> str | None:
    """
    Wait until any known results indicator is visible, for at most
    RESULTS_WAIT_TIMEOUT_MS in total.

    All selectors are polled in list order against one shared deadline, so a
    fallback selector that is already visible is detected on the first pass
    instead of after every earlier selector has timed out.

    Returns the selector that matched, or None on timeout.
    """
    deadline = time.monotonic() + RESULTS_WAIT_TIMEOUT_MS / 1000

    while True:
        for selector in RESULTS_CONTAINER_SELECTORS:
            # is_visible() answers immediately; it does not wait.
            if page.locator(selector).first.is_visible():
                logger.debug("Results container found via: %s", selector)
                return selector

        remaining_ms = (deadline - time.monotonic()) * 1000
        if remaining_ms <= 0:
            break
        page.wait_for_timeout(min(RESULTS_POLL_INTERVAL_MS, remaining_ms))

    logger.warning(
        "No results container matched within %sms; continuing anyway.",
        RESULTS_WAIT_TIMEOUT_MS,
    )
    return None


def extract_page_data(page) -> dict:
    """Pull result count, top 10 hits, and special UI blocks from the live DOM."""
    return page.evaluate(EXTRACT_RESULTS_JS)


def process_single_query(
    page,
    topic: str,
    screenshots_dir: Path,
    logger: logging.Logger,
    used_filenames: set[str],
) -> dict:
    """
    Navigate, wait, screenshot, and extract data for one topic.
    Retries up to MAX_RETRIES times on failure.

    Returns a dict with keys success, error_message, search_url, result_count,
    results, special_features and screenshot. ``used_filenames`` is updated
    with the screenshot stem reserved for this topic.
    """
    search_url = build_search_url(topic)
    last_error = ""

    # Reserve the stem once per topic: retries reuse the same file name, and
    # different topics that sanitize to the same stem never overwrite each other.
    stem = _reserve_unique_stem(sanitize_filename(topic), used_filenames)
    screenshot_path = screenshots_dir / f"{stem}.png"

    for attempt in range(1, MAX_RETRIES + 2):
        try:
            logger.info("Attempt %s | %s | %s", attempt, topic[:60], search_url)

            page.goto(
                search_url,
                wait_until="domcontentloaded",
                timeout=NAVIGATION_TIMEOUT_MS,
            )

            matched = wait_for_results_container(page, logger)
            if matched is None:
                logger.warning("Proceeding without confirmed results container for: %s", topic)

            # Short settle time before the screenshot is taken.
            page.wait_for_timeout(500)

            page.screenshot(path=str(screenshot_path), full_page=True)
            logger.info("Screenshot saved: %s", screenshot_path)

            extracted = extract_page_data(page)
            return {
                "success": True,
                "error_message": "",
                "search_url": search_url,
                "result_count": extracted.get("result_count"),
                "results": extracted.get("results", []),
                "special_features": extracted.get("special_features", []),
                "screenshot": str(screenshot_path),
            }

        except PlaywrightTimeoutError as exc:
            last_error = f"Timeout: {exc}"
            logger.warning("Attempt %s timed out for '%s': %s", attempt, topic, exc)
        except Exception as exc:
            # Any other failure is recorded and retried like a timeout.
            last_error = f"{type(exc).__name__}: {exc}"
            logger.warning("Attempt %s failed for '%s': %s", attempt, topic, exc)

        if attempt <= MAX_RETRIES:
            logger.info("Retrying (%s/%s)...", attempt, MAX_RETRIES)
            time.sleep(1)

    return {
        "success": False,
        "error_message": last_error,
        "search_url": search_url,
        "result_count": None,
        "results": [],
        "special_features": [],
        "screenshot": "",
    }


def write_csv_row(
    writer: csv.DictWriter,
    row_timestamp: str,
    topic: str,
    payload: dict,
) -> None:
    """Append one row to data.csv with UTF-8-safe JSON in results_json."""
    special_text = json.dumps(payload.get("special_features", []), ensure_ascii=False)
    results_json = json.dumps(payload.get("results", []), ensure_ascii=False)

    # A missing count is written as an empty cell, while a real 0 is kept.
    result_count = payload.get("result_count")

    writer.writerow(
        {
            "timestamp": row_timestamp,
            "topic": topic,
            "search_url": payload.get("search_url", ""),
            "result_count": result_count if result_count is not None else "",
            "success": payload.get("success", False),
            "error_message": payload.get("error_message", ""),
            "special_features": special_text,
            "results_json": results_json,
        }
    )


def run_automation() -> None:
    """Main entry: preflight checks, then 100 queries with sync Playwright."""
    # STEP 1 report: exits before any output folder is created.
    _exit_if_playwright_missing()

    topics = generate_arabic_topics()
    total = len(topics)
    if total != 100:
        print(f"Warning: expected 100 topics, got {total}.")

    run_dir, screenshots_dir, csv_path = setup_output_dirs()
    log_path = run_dir / "log.txt"
    logger = setup_logging(log_path)

    logger.info("Starting mithal.space search automation")
    logger.info("Output directory: %s", run_dir.resolve())
    logger.info("Total topics: %s", total)

    csv_columns = [
        "timestamp",
        "topic",
        "search_url",
        "result_count",
        "success",
        "error_message",
        "special_features",
        "results_json",
    ]

    success_count = 0
    fail_count = 0
    used_filenames: set[str] = set()

    # sync_playwright() is a context manager – ensures driver cleanup on exit
    with sync_playwright() as playwright_instance:
        print_playwright_debug_info(playwright_instance)
        verify_browsers_installed(playwright_instance)
        run_startup_browser_test(playwright_instance, run_dir)

        logger.info("Preflight checks passed; starting %s search queries", total)

        # Nested context managers: browser and context always close cleanly
        with playwright_instance.chromium.launch(
            headless=True,
            timeout=BROWSER_LAUNCH_TIMEOUT_MS,
        ) as browser:
            with browser.new_context(
                locale="ar-SA",
                viewport={"width": 1280, "height": 900},
            ) as context:
                page = context.new_page()

                # utf-8-sig writes a BOM at the start of the CSV file.
                with csv_path.open("w", newline="", encoding="utf-8-sig") as csv_file:
                    writer = csv.DictWriter(csv_file, fieldnames=csv_columns)
                    writer.writeheader()

                    for index, topic in enumerate(topics, start=1):
                        row_timestamp = datetime.now().isoformat(timespec="seconds")
                        logger.info("--- [%s/%s] %s ---", index, total, topic)

                        payload = process_single_query(
                            page, topic, screenshots_dir, logger, used_filenames
                        )

                        if payload["success"]:
                            success_count += 1
                        else:
                            fail_count += 1
                            try:
                                stem = _reserve_unique_stem(
                                    sanitize_filename(topic) + "_failed",
                                    used_filenames,
                                )
                                fail_path = screenshots_dir / f"{stem}.png"
                                page.screenshot(path=str(fail_path), full_page=True)
                                logger.info("Failure screenshot: %s", fail_path)
                            except Exception as shot_exc:
                                # Best effort only: the CSV row still records the failure.
                                logger.debug("Could not save failure screenshot: %s", shot_exc)

                        write_csv_row(writer, row_timestamp, topic, payload)
                        # Flush per row so partial results survive an interrupted run.
                        csv_file.flush()

                        # Pause between queries; nothing follows the last one.
                        if index < total:
                            time.sleep(DELAY_BETWEEN_QUERIES_SEC)

    print("\n" + "=" * 60)
    print("MITHAL.SPACE SEARCH AUTOMATION – SUMMARY")
    print("=" * 60)
    print(f"Total topics: {total}")
    print(f"Successful queries: {success_count}")
    print(f"Failed queries: {fail_count}")
    print(f"Output directory: {run_dir.resolve()}")
    print(f" - CSV: {csv_path.name}")
    print(f" - Log: {log_path.name}")
    print(f" - Screenshots: {screenshots_dir.name}/")
    print("=" * 60)

    logger.info(
        "Finished. success=%s failed=%s dir=%s",
        success_count,
        fail_count,
        run_dir.resolve(),
    )


def main() -> None:
    """Run preflight + automation (sync – no asyncio event loop)."""
    run_automation()


if __name__ == "__main__":
    main()