Update main.py
هذا الالتزام موجود في:
522
main.py
522
main.py
@@ -1,14 +1,24 @@
|
||||
# main.py
|
||||
import os
|
||||
import json
|
||||
import redis
|
||||
from fastapi import FastAPI, HTTPException, status
|
||||
from pydantic import BaseModel, Field, UrlConstraints
|
||||
import re
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from fastapi import FastAPI, HTTPException, status, BackgroundTasks
|
||||
from pydantic import BaseModel, Field, HttpUrl
|
||||
from urllib.parse import urlparse
|
||||
import hashlib
|
||||
from typing import List, Optional
|
||||
from datetime import datetime
|
||||
import logging
|
||||
|
||||
# --- Configuration ---
# Redis connection settings. In Docker the hostname is usually the service
# name 'redis'; the default falls back to localhost for local development.
REDIS_HOST = os.getenv("REDIS_HOST", "localhost")
REDIS_PORT = int(os.getenv("REDIS_PORT", 6379))
# Name of the Redis list used as the crawling work queue.
QUEUE_NAME = "arabic_crawling_queue"
# PostgreSQL DSN. NOTE(review): defaults to "" which psycopg2 cannot
# connect with — confirm DATABASE_URL is always set in deployment.
DATABASE_URL = os.getenv("DATABASE_URL", "")

# --- Logging Setup ---
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
|
||||
|
||||
# --- Models ---
|
||||
class SearchPayload(BaseModel):
    """
    Input model for the ingestion endpoint: one keyword plus the URLs
    returned for it.

    Fix: the merged diff left two `results` declarations (old `list[str]`
    and new `List[HttpUrl]`); only the validated `List[HttpUrl]` form is
    kept, matching how the endpoints convert each item via `str(url)`.
    """
    keyword: str = Field(..., description="The search keyword used")
    results: List[HttpUrl] = Field(..., description="List of URLs to process")
|
||||
|
||||
# --- App & Redis Setup ---
|
||||
class ArabicContentResponse(BaseModel):
    """Response shape for a single scraped URL's Arabic-content check."""
    # The URL that was scraped.
    url: str
    # True when Arabic characters were found in the title, meta description
    # or visible body text (see scrape_and_check_arabic).
    has_arabic: bool
    # <title> text, when the page provided one.
    title: Optional[str] = None
    # Content of the <meta name="description"> tag, when present.
    meta_description: Optional[str] = None
    # Truncated preview of the Arabic text runs found on the page.
    arabic_content_preview: Optional[str] = None
|
||||
|
||||
# --- App Setup ---
app = FastAPI(title="Arabic Search Ingestion API")

# Initialize the Redis connection used for the crawl queue. On failure the
# client is left as None so queue-dependent endpoints can report 503 instead
# of the whole API crashing at import time.
# Fix: use the module logger instead of print() for consistency with the
# logging configuration above.
try:
    redis_client = redis.Redis(
        host=REDIS_HOST,
        port=REDIS_PORT,
        db=0,
        decode_responses=True  # Automatically decode bytes to str
    )
    # Fail fast: verify the server is actually reachable at startup.
    redis_client.ping()
    logger.info(f"Connected to Redis at {REDIS_HOST}:{REDIS_PORT}")
except Exception as e:
    logger.error(f"Could not connect to Redis: {e}")
    redis_client = None
|
||||
# --- Database Setup ---
|
||||
def get_db_connection():
    """Open and return a new PostgreSQL connection, or None on failure.

    Failures are logged rather than raised so callers can degrade
    gracefully (e.g. return HTTP 503).
    """
    try:
        return psycopg2.connect(DATABASE_URL)
    except Exception as e:
        logger.error(f"Database connection failed: {e}")
        return None
|
||||
|
||||
# --- Endpoints ---
|
||||
def init_database():
    """Create the application's tables and indexes if they do not exist.

    Idempotent: all DDL uses IF NOT EXISTS, so it is safe to run on every
    startup. Failures are logged and rolled back instead of raised so a
    missing database does not prevent the API process from starting.

    Fix: the original `CREATE TABLE arabic_index` contained an inline
    `INDEX(url_hash)` clause, which is MySQL syntax and a syntax error in
    PostgreSQL — the whole initialization failed and rolled back, so
    `arabic_index` and `processing_queue` were never created. The clause is
    removed; the UNIQUE constraint on url_hash already creates an index.
    """
    conn = get_db_connection()
    if not conn:
        logger.error("Cannot initialize database - no connection")
        return

    try:
        with conn.cursor() as cur:
            # Raw search results received from the ingestion endpoint.
            cur.execute("""
                CREATE TABLE IF NOT EXISTS search_ingest (
                    id SERIAL PRIMARY KEY,
                    keyword VARCHAR(500) NOT NULL,
                    url TEXT NOT NULL,
                    url_hash VARCHAR(64) NOT NULL,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    processed BOOLEAN DEFAULT FALSE,
                    UNIQUE(url_hash)
                )
            """)

            # Create index for faster lookups
            cur.execute("""
                CREATE INDEX IF NOT EXISTS idx_search_ingest_url_hash
                ON search_ingest(url_hash)
            """)

            cur.execute("""
                CREATE INDEX IF NOT EXISTS idx_search_ingest_processed
                ON search_ingest(processed)
            """)

            # Index of pages confirmed to contain Arabic content. The UNIQUE
            # constraints on url and url_hash give implicit indexes, so no
            # separate index declaration is needed.
            cur.execute("""
                CREATE TABLE IF NOT EXISTS arabic_index (
                    id SERIAL PRIMARY KEY,
                    url TEXT NOT NULL UNIQUE,
                    url_hash VARCHAR(64) NOT NULL UNIQUE,
                    title TEXT,
                    meta_description TEXT,
                    arabic_content TEXT,
                    detection_score FLOAT,
                    scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    source_keyword VARCHAR(500),
                    http_status INTEGER
                )
            """)

            # Work queue of URLs awaiting scraping/classification.
            cur.execute("""
                CREATE TABLE IF NOT EXISTS processing_queue (
                    id SERIAL PRIMARY KEY,
                    search_ingest_id INTEGER REFERENCES search_ingest(id),
                    url TEXT NOT NULL,
                    url_hash VARCHAR(64) NOT NULL,
                    status VARCHAR(50) DEFAULT 'pending',
                    attempts INTEGER DEFAULT 0,
                    last_attempt TIMESTAMP,
                    error_message TEXT,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    UNIQUE(url_hash)
                )
            """)

        conn.commit()
        logger.info("Database initialized successfully")
    except Exception as e:
        logger.error(f"Database initialization failed: {e}")
        conn.rollback()
    finally:
        conn.close()
|
||||
|
||||
# Initialize database on startup
@app.on_event("startup")
def startup_event():
    """Create tables/indexes when the API process starts (idempotent)."""
    init_database()
|
||||
|
||||
# --- Arabic Language Detection ---
|
||||
def contains_arabic(text: str) -> bool:
    """
    Return True if *text* contains at least one Arabic character.

    Matches the main Arabic Unicode block (U+0600-U+06FF) plus the
    Supplement, Extended-A and Presentation Forms blocks.
    """
    if not text:
        return False
    match = re.search(
        r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]',
        text,
    )
    return match is not None
|
||||
|
||||
def extract_arabic_content(text: str, max_length: int = 500) -> str:
    """
    Return a preview of the Arabic runs found in *text*.

    Consecutive Arabic sequences are joined with single spaces; the result
    is truncated to *max_length* characters with a trailing "..." when it
    exceeds that limit. Returns "" when no Arabic is present.
    """
    if not text:
        return ""

    runs = re.findall(
        r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]+',
        text,
    )
    if not runs:
        return ""

    joined = " ".join(runs)
    if len(joined) > max_length:
        return joined[:max_length] + "..."
    return joined
|
||||
|
||||
def calculate_arabic_score(text: str) -> float:
    """
    Return the percentage (0.0-100.0) of characters in *text* that are
    Arabic. Empty or falsy input scores 0.0.
    """
    if not text:
        return 0.0

    pattern = r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]'
    arabic_count = len(re.findall(pattern, text))
    total = len(text)

    # total is guaranteed > 0 here, but keep the guard shape explicit.
    if total > 0:
        return (arabic_count / total) * 100
    return 0.0
|
||||
|
||||
def get_url_hash(url: str) -> str:
    """Return the hex SHA-256 digest of *url*, used as a stable unique key."""
    digest = hashlib.sha256()
    digest.update(url.encode('utf-8'))
    return digest.hexdigest()
|
||||
|
||||
# --- Scraping Function ---
|
||||
def scrape_and_check_arabic(url: str, keyword: str = None) -> dict:
    """
    Fetch *url* and determine whether the page contains Arabic content.

    On a 200 response the page's title, meta description and visible text
    are checked for Arabic characters; when Arabic is found, a percentage
    score and a text preview are computed and the result is persisted via
    save_to_arabic_index(). Network and parsing failures never raise —
    they are reported through the returned dict's "error" field.

    Args:
        url: Absolute URL to fetch.
        keyword: Optional source keyword stored alongside the result.

    Returns:
        dict with keys: url, has_arabic, title, meta_description,
        arabic_content_preview, arabic_score, http_status, error.
    """
    result = {
        "url": url,
        "has_arabic": False,
        "title": None,
        "meta_description": None,
        "arabic_content_preview": None,
        "arabic_score": 0.0,
        "http_status": None,
        "error": None
    }

    try:
        # Set a user agent to avoid being blocked
        headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; ArabicIndexBot/1.0; +http://example.com/bot)'
        }

        # Make request with timeout
        response = requests.get(url, headers=headers, timeout=10, allow_redirects=True)
        result["http_status"] = response.status_code

        # Non-200 responses fall through with has_arabic=False and no error.
        if response.status_code == 200:
            # Parse HTML
            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract title
            title_tag = soup.find('title')
            if title_tag:
                result["title"] = title_tag.get_text().strip()

            # Extract meta description
            meta_desc = soup.find('meta', attrs={'name': 'description'})
            if meta_desc and meta_desc.get('content'):
                result["meta_description"] = meta_desc['content'].strip()

            # Strip non-content elements so boilerplate doesn't skew detection.
            for script in soup(["script", "style", "nav", "footer", "header"]):
                script.decompose()

            # Get text content
            text_content = soup.get_text(separator=' ', strip=True)

            # contains_arabic handles None inputs (title/meta may be unset).
            has_arabic_title = contains_arabic(result["title"])
            has_arabic_meta = contains_arabic(result["meta_description"])
            has_arabic_content = contains_arabic(text_content)

            result["has_arabic"] = has_arabic_title or has_arabic_meta or has_arabic_content

            if result["has_arabic"]:
                # Score is computed from body text only, not title/meta.
                result["arabic_score"] = calculate_arabic_score(text_content)

                # Extract Arabic content preview
                arabic_preview = extract_arabic_content(text_content)
                result["arabic_content_preview"] = arabic_preview

                # Persist as a side effect; only Arabic pages are stored.
                save_to_arabic_index(result, keyword)

    except requests.Timeout:
        result["error"] = "Request timeout"
    except requests.RequestException as e:
        result["error"] = f"Request failed: {str(e)}"
    except Exception as e:
        result["error"] = f"Scraping failed: {str(e)}"

    return result
|
||||
|
||||
def save_to_arabic_index(data: dict, keyword: str = None):
    """Upsert one scraped page into the arabic_index table.

    *data* is the result dict produced by scrape_and_check_arabic(). On a
    url_hash conflict the existing row is refreshed in place and
    last_updated is bumped. Database errors are logged and rolled back,
    never raised, so scraping continues on persistence failure.
    """
    conn = get_db_connection()
    if not conn:
        logger.error("Cannot save to arabic_index - no database connection")
        return

    try:
        url_hash = get_url_hash(data["url"])

        with conn.cursor() as cur:
            # Use INSERT ... ON CONFLICT to handle duplicates
            cur.execute("""
                INSERT INTO arabic_index
                (url, url_hash, title, meta_description, arabic_content, detection_score, source_keyword, http_status, last_updated)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, CURRENT_TIMESTAMP)
                ON CONFLICT (url_hash)
                DO UPDATE SET
                    last_updated = CURRENT_TIMESTAMP,
                    title = EXCLUDED.title,
                    meta_description = EXCLUDED.meta_description,
                    arabic_content = EXCLUDED.arabic_content,
                    detection_score = EXCLUDED.detection_score,
                    http_status = EXCLUDED.http_status,
                    source_keyword = EXCLUDED.source_keyword
            """, (
                data["url"],
                url_hash,
                data["title"],
                data["meta_description"],
                data["arabic_content_preview"],
                data["arabic_score"],
                keyword,
                data["http_status"]
            ))
        conn.commit()
        logger.info(f"Saved/Updated Arabic content for URL: {data['url']}")
    except Exception as e:
        logger.error(f"Failed to save to arabic_index: {e}")
        conn.rollback()
    finally:
        conn.close()
|
||||
|
||||
# --- Background Processing ---
|
||||
def process_urls_from_queue():
    """Process up to 10 pending URLs from the processing_queue table.

    Selects pending rows (attempts < 3) with FOR UPDATE SKIP LOCKED so
    concurrent workers don't pick the same batch, scrapes each URL, and
    records the outcome ('completed', 'failed', or 'no_arabic').

    NOTE(review): the per-item commit after the attempts update ends the
    transaction that held the SKIP LOCKED row locks, so another worker
    could theoretically pick up the remaining rows of this batch —
    attempts-tracking keeps this mostly harmless, but confirm if strict
    once-only processing is required.
    """
    conn = get_db_connection()
    if not conn:
        logger.error("Cannot process queue - no database connection")
        return

    try:
        # DictCursor allows item['url']-style access on fetched rows.
        with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
            # Get pending URLs (limit 10 per batch)
            cur.execute("""
                SELECT pq.id, pq.url, pq.url_hash, si.keyword
                FROM processing_queue pq
                JOIN search_ingest si ON pq.search_ingest_id = si.id
                WHERE pq.status = 'pending' AND pq.attempts < 3
                ORDER BY pq.created_at ASC
                LIMIT 10
                FOR UPDATE SKIP LOCKED
            """)

            pending_items = cur.fetchall()

            for item in pending_items:
                # Bump the attempt counter first, so a crash mid-scrape still
                # counts against the 3-attempt budget.
                cur.execute("""
                    UPDATE processing_queue
                    SET attempts = attempts + 1, last_attempt = CURRENT_TIMESTAMP
                    WHERE id = %s
                """, (item['id'],))
                conn.commit()

                # Process the URL (network I/O; persists Arabic pages itself).
                result = scrape_and_check_arabic(item['url'], item['keyword'])

                # Update processing status
                if result['has_arabic']:
                    # Successfully processed and saved
                    cur.execute("""
                        UPDATE processing_queue
                        SET status = 'completed'
                        WHERE id = %s
                    """, (item['id'],))

                    # Mark search_ingest as processed
                    cur.execute("""
                        UPDATE search_ingest
                        SET processed = TRUE
                        WHERE url_hash = %s
                    """, (item['url_hash'],))
                else:
                    # Distinguish hard failures from pages with no Arabic.
                    status = 'failed' if result['error'] else 'no_arabic'
                    error_msg = result.get('error', 'No Arabic content found')

                    cur.execute("""
                        UPDATE processing_queue
                        SET status = %s, error_message = %s
                        WHERE id = %s
                    """, (status, error_msg, item['id']))

                conn.commit()

    except Exception as e:
        logger.error(f"Queue processing failed: {e}")
        conn.rollback()
    finally:
        conn.close()
|
||||
|
||||
# --- API Endpoints ---
|
||||
@app.get("/")
def health_check():
    """Report process status plus Redis and database connectivity.

    Fix: the merged diff left dead code after the return statement (the new
    database-checking body was unreachable, and its return dict was
    scattered into a later hunk). Reassembled into one coherent handler
    that reports both backends; the original "redis_connected" key is kept
    for backward compatibility.
    """
    conn = get_db_connection()
    db_status = conn is not None
    if conn:
        conn.close()
    return {
        "status": "running",
        "redis_connected": redis_client is not None,
        "database_connected": db_status,
        "service": "Arabic Search Indexer"
    }
|
||||
|
||||
@app.post("/api/ingest", status_code=status.HTTP_202_ACCEPTED)
def ingest_search_data(payload: SearchPayload, background_tasks: BackgroundTasks):
    """
    Receive a keyword plus URLs, store them in PostgreSQL, and trigger
    background processing of any newly queued URLs.

    Fix: the merged diff left two registrations of this route — an old
    Redis-based handler with an unclosed dict literal (a syntax error) and
    a reference to undefined `job_data` — plus a return dict with duplicate
    "message" keys, a missing comma, and a `raise` before `conn.rollback()`
    that made the rollback unreachable. Reconstructed as the single
    coherent database-backed handler.

    Raises:
        HTTPException 503: database unavailable.
        HTTPException 500: storage failed (after rollback).
    """
    conn = get_db_connection()
    if not conn:
        raise HTTPException(status_code=503, detail="Database service unavailable")

    try:
        inserted_count = 0

        with conn.cursor() as cur:
            for url in payload.results:
                url_str = str(url)
                url_hash = get_url_hash(url_str)

                try:
                    # Insert into search_ingest table (ignore duplicates);
                    # RETURNING id yields a row only for genuinely new URLs.
                    cur.execute("""
                        INSERT INTO search_ingest (keyword, url, url_hash, created_at, processed)
                        VALUES (%s, %s, %s, CURRENT_TIMESTAMP, FALSE)
                        ON CONFLICT (url_hash) DO NOTHING
                        RETURNING id
                    """, (payload.keyword, url_str, url_hash))

                    result = cur.fetchone()

                    if result:
                        # Also add to processing queue
                        cur.execute("""
                            INSERT INTO processing_queue (search_ingest_id, url, url_hash, status)
                            VALUES (%s, %s, %s, 'pending')
                            ON CONFLICT (url_hash) DO NOTHING
                        """, (result[0], url_str, url_hash))

                        inserted_count += 1

                except Exception as e:
                    # Best-effort per URL: log and keep going.
                    # NOTE(review): in PostgreSQL a failed statement aborts the
                    # transaction, so later inserts in this batch would also
                    # fail until rollback — confirm whether per-URL savepoints
                    # are wanted here.
                    logger.error(f"Failed to insert URL {url_str}: {e}")
                    continue

        conn.commit()

        # Trigger background processing only when there is new work.
        if inserted_count > 0:
            background_tasks.add_task(process_urls_from_queue)

        return {
            "message": f"Successfully queued {inserted_count} URLs for processing",
            "keyword": payload.keyword,
            "total_received": len(payload.results),
            "new_urls": inserted_count,
            "duplicates_skipped": len(payload.results) - inserted_count
        }

    except Exception as e:
        conn.rollback()
        logger.error(f"Failed to ingest data: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to store data: {str(e)}")
    finally:
        conn.close()
|
||||
|
||||
@app.post("/api/scrape-and-check")
def scrape_and_check_endpoint(url: HttpUrl):
    """
    Scrape a single URL on demand and report whether it contains Arabic
    content. Returns 400 when the scrape itself failed.
    """
    outcome = scrape_and_check_arabic(str(url))
    if outcome["error"]:
        raise HTTPException(status_code=400, detail=outcome["error"])
    return outcome
|
||||
|
||||
@app.get("/api/arabic-index")
def get_arabic_index(limit: int = 100, offset: int = 0):
    """
    Return a paginated listing of the Arabic index, newest first.

    Args:
        limit: Maximum rows to return per page.
        offset: Rows to skip (pagination offset).

    Returns:
        dict with total row count, the echoed offset/limit, and the page of
        results (arabic_content itself is not included in the listing).

    Raises:
        HTTPException 503: database unavailable.
        HTTPException 500: query failed.
    """
    conn = get_db_connection()
    if not conn:
        raise HTTPException(status_code=503, detail="Database service unavailable")

    try:
        # DictCursor so rows convert cleanly to JSON-serializable dicts.
        with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
            cur.execute("""
                SELECT id, url, title, meta_description,
                       detection_score, scraped_at, source_keyword
                FROM arabic_index
                ORDER BY scraped_at DESC
                LIMIT %s OFFSET %s
            """, (limit, offset))

            results = [dict(row) for row in cur.fetchall()]

            # Get total count
            cur.execute("SELECT COUNT(*) FROM arabic_index")
            total = cur.fetchone()[0]

            return {
                "total": total,
                "offset": offset,
                "limit": limit,
                "results": results
            }
    except Exception as e:
        logger.error(f"Failed to fetch arabic index: {e}")
        raise HTTPException(status_code=500, detail="Failed to fetch data")
    finally:
        conn.close()
|
||||
|
||||
@app.post("/api/process-queue")
def trigger_queue_processing(background_tasks: BackgroundTasks):
    """
    Manually kick off one batch of queue processing in the background.
    """
    background_tasks.add_task(process_urls_from_queue)
    return {"message": "Queue processing triggered"}
|
||||
|
||||
# --- Run with: uvicorn main:app --reload
|
||||
المرجع في مشكلة جديدة
حظر مستخدم