# NOTE: removed page-scrape artifacts that preceded the file
# (a duplicated "583 lines / 20 KiB / Python" listing header).
# main.py

# Standard library
import hashlib
import logging
import os
import re
from datetime import datetime
from typing import List, Optional
from urllib.parse import urlparse

# Third-party
import psycopg2
import psycopg2.extras
import requests
from bs4 import BeautifulSoup
from fastapi import BackgroundTasks, FastAPI, HTTPException, status
from pydantic import BaseModel, Field, HttpUrl
|
|
|
|
# --- Configuration ---
# Postgres DSN injected via the environment (e.g. by the deployment platform).
# May be None when unset; psycopg2.connect() then fails and
# get_db_connection() returns None, which every caller checks for.
DATABASE_URL = os.getenv("DATABASE_URL")

# --- Logging Setup ---
# Root logger at INFO; module-level logger per stdlib convention.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
|
|
|
|
# --- Models ---
class SearchPayload(BaseModel):
    """Request body for POST /api/ingest.

    Carries one search keyword together with the result URLs to be screened
    for Arabic content; each URL is validated by pydantic's HttpUrl type.
    """

    keyword: str = Field(..., description="The search keyword used")
    results: List[HttpUrl] = Field(..., description="List of URLs to process")
|
|
|
|
class ArabicContentResponse(BaseModel):
    """Per-URL analysis result produced by the scraper.

    NOTE(review): declared but not used as a response_model in this file —
    the endpoints currently return plain dicts; confirm before removing.
    """

    # Page address that was scraped.
    url: str
    # True when Arabic text was detected in title, meta description, or body.
    has_arabic: bool
    # <title> text, when present.
    title: Optional[str] = None
    # <meta name="description"> content, when present.
    meta_description: Optional[str] = None
    # Truncated excerpt of the Arabic text found on the page.
    arabic_content_preview: Optional[str] = None
|
|
|
|
# --- App Setup ---
# Single FastAPI application instance; endpoints are registered below via
# decorators, and the title shows up in the generated OpenAPI docs.
app = FastAPI(title="Arabic Search Ingestion API")
|
|
|
|
# --- Database Setup ---
def get_db_connection():
    """Open a fresh psycopg2 connection to DATABASE_URL.

    Returns None on any failure instead of raising; every caller in this
    module checks for None and degrades gracefully.
    """
    try:
        return psycopg2.connect(DATABASE_URL)
    except Exception as e:
        # Covers bad/missing DSN as well as network/auth failures.
        logger.error(f"Database connection failed: {e}")
        return None
|
|
|
|
def init_database():
    """Create all tables and indexes this service needs (idempotent).

    Safe to run on every startup because each statement uses IF NOT EXISTS.
    On failure it logs and rolls back instead of raising, so a schema error
    does not abort application startup (requests will fail later instead).
    """
    # Schema DDL executed in order; kept as data so the happy path is a
    # simple loop and new statements are easy to append.
    ddl_statements = (
        # Raw search results exactly as received from /api/ingest.
        """
        CREATE TABLE IF NOT EXISTS search_ingest (
            id SERIAL PRIMARY KEY,
            keyword VARCHAR(500) NOT NULL,
            url TEXT NOT NULL,
            url_hash VARCHAR(64) NOT NULL UNIQUE,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            processed BOOLEAN DEFAULT FALSE
        )
        """,
        "CREATE INDEX IF NOT EXISTS idx_search_ingest_url_hash ON search_ingest(url_hash)",
        "CREATE INDEX IF NOT EXISTS idx_search_ingest_processed ON search_ingest(processed)",
        # Pages confirmed to contain Arabic content.
        """
        CREATE TABLE IF NOT EXISTS arabic_index (
            id SERIAL PRIMARY KEY,
            url TEXT NOT NULL UNIQUE,
            url_hash VARCHAR(64) NOT NULL UNIQUE,
            title TEXT,
            meta_description TEXT,
            arabic_content TEXT,
            detection_score FLOAT,
            scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            source_keyword VARCHAR(500),
            http_status INTEGER
        )
        """,
        "CREATE INDEX IF NOT EXISTS idx_arabic_index_url_hash ON arabic_index(url_hash)",
        "CREATE INDEX IF NOT EXISTS idx_arabic_index_scraped_at ON arabic_index(scraped_at)",
        # Work queue feeding the background scraper.
        """
        CREATE TABLE IF NOT EXISTS processing_queue (
            id SERIAL PRIMARY KEY,
            search_ingest_id INTEGER REFERENCES search_ingest(id) ON DELETE CASCADE,
            url TEXT NOT NULL,
            url_hash VARCHAR(64) NOT NULL UNIQUE,
            status VARCHAR(50) DEFAULT 'pending',
            attempts INTEGER DEFAULT 0,
            last_attempt TIMESTAMP,
            error_message TEXT,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
        """,
        "CREATE INDEX IF NOT EXISTS idx_processing_queue_status ON processing_queue(status)",
        "CREATE INDEX IF NOT EXISTS idx_processing_queue_url_hash ON processing_queue(url_hash)",
    )

    conn = get_db_connection()
    if not conn:
        logger.error("Cannot initialize database - no connection")
        return

    try:
        with conn.cursor() as cur:
            for ddl in ddl_statements:
                cur.execute(ddl)
        conn.commit()
        logger.info("Database initialized successfully")
    except Exception as e:
        logger.error(f"Database initialization failed: {e}")
        conn.rollback()
    finally:
        conn.close()
|
|
|
|
# Initialize database on startup
# NOTE(review): @app.on_event is deprecated in recent FastAPI versions in
# favour of lifespan handlers — confirm the pinned version before migrating.
@app.on_event("startup")
def startup_event():
    # Runs once when the server boots; creates tables/indexes if missing.
    init_database()
|
|
|
|
# --- Arabic Language Detection ---

# One character in any Arabic Unicode block: core (U+0600-U+06FF),
# Supplement, Extended-A, and the two Presentation Forms blocks.
_ARABIC_CHAR_RE = re.compile(
    r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]'
)


def contains_arabic(text: str) -> bool:
    """Return True when *text* contains at least one Arabic character.

    Non-string or empty input yields False.
    """
    if not isinstance(text, str) or not text:
        return False
    return _ARABIC_CHAR_RE.search(text) is not None
|
|
|
|
def extract_arabic_content(text: str, max_length: int = 500) -> str:
    """Return a preview of the Arabic text found in *text*.

    Consecutive runs of Arabic characters are collected and joined with
    single spaces (any non-Arabic characters between runs, including the
    original whitespace, are discarded).  The joined string is truncated to
    *max_length* characters with a "..." suffix when longer.  Non-string or
    empty input, or text with no Arabic at all, yields "".
    """
    if not isinstance(text, str) or not text:
        return ""

    runs = re.findall(
        r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]+',
        text,
    )
    if not runs:
        return ""

    joined = " ".join(runs)
    # Explicit branch instead of the original inline conditional — same
    # result, but the truncation rule is easier to read.
    if len(joined) > max_length:
        return joined[:max_length] + "..."
    return joined
|
|
|
|
def calculate_arabic_score(text: str) -> float:
    """Return the percentage (0.0-100.0) of Arabic characters in *text*.

    Every character counts toward the denominator, including spaces and
    punctuation.  Non-string or empty input yields 0.0.
    """
    if not isinstance(text, str) or not text:
        return 0.0

    arabic_count = len(
        re.findall(
            r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]',
            text,
        )
    )
    # len(text) > 0 is guaranteed by the guard above.
    return (arabic_count / len(text)) * 100
|
|
|
|
def get_url_hash(url: str) -> str:
    """Return the hex SHA-256 digest of *url*, used as a dedup key in all tables."""
    digest = hashlib.sha256()
    digest.update(url.encode('utf-8'))
    return digest.hexdigest()
|
|
|
|
# --- Scraping Function ---
def scrape_and_check_arabic(url: str, keyword: Optional[str] = None) -> dict:
    """Fetch *url*, detect Arabic content, and persist hits to arabic_index.

    Args:
        url: page to fetch (10s timeout, redirects followed).
        keyword: originating search keyword, stored alongside the hit.

    Returns:
        dict with keys: url, has_arabic, title, meta_description,
        arabic_content_preview, arabic_score, http_status, error.
        "error" is None on success; any failure (network error, timeout,
        parse error, or non-200 status) sets it and leaves has_arabic False.

    Side effects: on an Arabic hit, upserts the page via save_to_arabic_index.
    """
    result = {
        "url": url,
        "has_arabic": False,
        "title": None,
        "meta_description": None,
        "arabic_content_preview": None,
        "arabic_score": 0.0,
        "http_status": None,
        "error": None
    }

    try:
        # Identify ourselves so well-behaved sites don't block the crawl.
        headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; ArabicIndexBot/1.0; +http://example.com/bot)'
        }

        response = requests.get(url, headers=headers, timeout=10, allow_redirects=True)
        result["http_status"] = response.status_code

        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')

            title_tag = soup.find('title')
            if title_tag:
                result["title"] = title_tag.get_text().strip()

            meta_desc = soup.find('meta', attrs={'name': 'description'})
            if meta_desc and meta_desc.get('content'):
                result["meta_description"] = meta_desc['content'].strip()

            # Strip boilerplate so the detection score reflects real content.
            for tag in soup(["script", "style", "nav", "footer", "header"]):
                tag.decompose()

            text_content = soup.get_text(separator=' ', strip=True)

            # contains_arabic() tolerates None title/meta_description.
            result["has_arabic"] = (
                contains_arabic(result["title"])
                or contains_arabic(result["meta_description"])
                or contains_arabic(text_content)
            )

            if result["has_arabic"]:
                result["arabic_score"] = calculate_arabic_score(text_content)
                result["arabic_content_preview"] = extract_arabic_content(text_content)
                save_to_arabic_index(result, keyword)
        else:
            # BUG FIX: non-200 responses previously returned has_arabic=False
            # with error=None, so the queue processor filed them under
            # 'no_arabic' instead of 'failed'. Record the status as an error.
            result["error"] = f"HTTP {response.status_code}"

    except requests.Timeout:
        result["error"] = "Request timeout"
    except requests.RequestException as e:
        result["error"] = f"Request failed: {str(e)}"
    except Exception as e:
        result["error"] = f"Scraping failed: {str(e)}"

    return result
|
|
|
|
def save_to_arabic_index(data: dict, keyword: str = None):
    """Insert or refresh one page's record in arabic_index.

    Upserts on url_hash: a new page is inserted, a known page has its
    content fields and last_updated refreshed.  Failures are logged and
    rolled back; nothing is raised to the caller.
    """
    conn = get_db_connection()
    if not conn:
        logger.error("Cannot save to arabic_index - no database connection")
        return

    upsert_sql = """
        INSERT INTO arabic_index
        (url, url_hash, title, meta_description, arabic_content, detection_score, source_keyword, http_status, last_updated)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, CURRENT_TIMESTAMP)
        ON CONFLICT (url_hash)
        DO UPDATE SET
            last_updated = CURRENT_TIMESTAMP,
            title = EXCLUDED.title,
            meta_description = EXCLUDED.meta_description,
            arabic_content = EXCLUDED.arabic_content,
            detection_score = EXCLUDED.detection_score,
            http_status = EXCLUDED.http_status,
            source_keyword = EXCLUDED.source_keyword
    """

    try:
        params = (
            data["url"],
            get_url_hash(data["url"]),
            data["title"],
            data["meta_description"],
            data["arabic_content_preview"],
            data["arabic_score"],
            keyword,
            data["http_status"],
        )
        with conn.cursor() as cur:
            cur.execute(upsert_sql, params)
        conn.commit()
        logger.info(f"Saved/Updated Arabic content for URL: {data['url']}")
    except Exception as e:
        logger.error(f"Failed to save to arabic_index: {e}")
        conn.rollback()
    finally:
        conn.close()
|
|
|
|
# --- Background Processing ---
def process_urls_from_queue():
    """Scrape up to 10 pending queue entries and record each outcome.

    Batch selection uses FOR UPDATE SKIP LOCKED so concurrent workers do
    not claim the same rows.  NOTE(review): the per-item conn.commit()
    below ends the transaction and releases those row locks, so the lock
    only guards the initial SELECT; the attempts counter makes any
    duplicated work harmless.  Runs as a FastAPI background task; all
    errors are logged, never raised.
    """
    conn = get_db_connection()
    if not conn:
        logger.error("Cannot process queue - no database connection")
        return

    try:
        with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
            # Claim a batch of pending URLs (max 3 attempts each).
            cur.execute("""
                SELECT pq.id, pq.url, pq.url_hash, si.keyword
                FROM processing_queue pq
                JOIN search_ingest si ON pq.search_ingest_id = si.id
                WHERE pq.status = 'pending' AND pq.attempts < 3
                ORDER BY pq.created_at ASC
                LIMIT 10
                FOR UPDATE SKIP LOCKED
            """)

            pending_items = cur.fetchall()

            for item in pending_items:
                # Count the attempt before scraping so a crash mid-scrape
                # still consumes retry budget.
                cur.execute("""
                    UPDATE processing_queue
                    SET attempts = attempts + 1, last_attempt = CURRENT_TIMESTAMP
                    WHERE id = %s
                """, (item['id'],))
                conn.commit()

                result = scrape_and_check_arabic(item['url'], item['keyword'])

                if result['has_arabic']:
                    # Page saved to arabic_index inside the scraper.
                    cur.execute("""
                        UPDATE processing_queue
                        SET status = 'completed'
                        WHERE id = %s
                    """, (item['id'],))

                    cur.execute("""
                        UPDATE search_ingest
                        SET processed = TRUE
                        WHERE url_hash = %s
                    """, (item['url_hash'],))
                else:
                    status = 'failed' if result['error'] else 'no_arabic'
                    # BUG FIX: result.get('error', default) returned None
                    # because the 'error' key always exists (with value None
                    # on the no-Arabic path), so 'no_arabic' rows were stored
                    # with a NULL error_message. `or` applies the fallback.
                    error_msg = result['error'] or 'No Arabic content found'

                    cur.execute("""
                        UPDATE processing_queue
                        SET status = %s, error_message = %s
                        WHERE id = %s
                    """, (status, error_msg, item['id']))

                conn.commit()

    except Exception as e:
        logger.error(f"Queue processing failed: {e}")
        conn.rollback()
    finally:
        conn.close()
|
|
|
|
# --- API Endpoints ---
@app.get("/")
def health_check():
    """Liveness probe: reports whether a DB connection can be opened right now."""
    conn = get_db_connection()
    db_ok = conn is not None
    if conn:
        conn.close()

    return {
        "status": "running",
        "database_connected": db_ok,
        "service": "Arabic Search Indexer",
    }
|
|
|
|
@app.post("/api/ingest", status_code=status.HTTP_202_ACCEPTED)
def ingest_search_data(payload: SearchPayload, background_tasks: BackgroundTasks):
    """Store a keyword's result URLs and queue new ones for scraping.

    Deduplicates on the URL's SHA-256 hash; already-seen URLs are counted as
    duplicates and skipped.  New URLs are inserted into search_ingest and
    processing_queue, then one background queue pass is scheduled.

    Raises:
        HTTPException 503 if no DB connection; 500 on a storage failure.
    """
    conn = get_db_connection()
    if not conn:
        raise HTTPException(status_code=503, detail="Database service unavailable")

    try:
        inserted_count = 0
        duplicate_count = 0

        with conn.cursor() as cur:
            for url in payload.results:
                url_str = str(url)
                url_hash = get_url_hash(url_str)

                # BUG FIX: the old SELECT-then-INSERT raced with concurrent
                # ingests, and its per-URL conn.rollback() on error discarded
                # every earlier uncommitted insert in the batch.  A single
                # atomic upsert removes both problems: RETURNING yields a row
                # only when the insert actually happened.
                cur.execute("""
                    INSERT INTO search_ingest (keyword, url, url_hash, created_at, processed)
                    VALUES (%s, %s, %s, CURRENT_TIMESTAMP, FALSE)
                    ON CONFLICT (url_hash) DO NOTHING
                    RETURNING id
                """, (payload.keyword, url_str, url_hash))

                row = cur.fetchone()

                if row:
                    # Mirror the row into the work queue; DO NOTHING guards
                    # against a stale queue entry with the same hash.
                    cur.execute("""
                        INSERT INTO processing_queue (search_ingest_id, url, url_hash, status)
                        VALUES (%s, %s, %s, 'pending')
                        ON CONFLICT (url_hash) DO NOTHING
                    """, (row[0], url_str, url_hash))
                    inserted_count += 1
                else:
                    duplicate_count += 1

        conn.commit()

        # Only spin up the scraper when there is new work.
        if inserted_count > 0:
            background_tasks.add_task(process_urls_from_queue)

        return {
            "message": f"Successfully queued {inserted_count} URLs for processing",
            "keyword": payload.keyword,
            "total_received": len(payload.results),
            "new_urls": inserted_count,
            "duplicates_skipped": duplicate_count
        }

    except Exception as e:
        conn.rollback()
        logger.error(f"Failed to ingest data: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to store data: {str(e)}")
    finally:
        conn.close()
|
|
|
|
@app.post("/api/scrape-and-check")
def scrape_and_check_endpoint(url: HttpUrl):
    """Synchronously scrape one URL and return its Arabic-content analysis.

    Raises HTTPException 400 when the scrape itself failed.
    """
    outcome = scrape_and_check_arabic(str(url))

    if outcome["error"]:
        raise HTTPException(status_code=400, detail=outcome["error"])

    return outcome
|
|
|
|
@app.get("/api/arabic-index")
def get_arabic_index(limit: int = 100, offset: int = 0):
    """Page through the Arabic index, newest scrape first.

    Returns the page of rows plus the overall table count for pagination.
    Raises HTTPException 503 without a DB connection, 500 on query failure.
    """
    conn = get_db_connection()
    if not conn:
        raise HTTPException(status_code=503, detail="Database service unavailable")

    try:
        with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
            cur.execute("""
                SELECT id, url, title, meta_description,
                       detection_score, scraped_at, source_keyword
                FROM arabic_index
                ORDER BY scraped_at DESC
                LIMIT %s OFFSET %s
            """, (limit, offset))

            results = [
                {
                    'id': row[0],
                    'url': row[1],
                    'title': row[2],
                    'meta_description': row[3],
                    'detection_score': row[4],
                    # scraped_at is a datetime; serialize as ISO-8601.
                    'scraped_at': row[5].isoformat() if row[5] else None,
                    'source_keyword': row[6],
                }
                for row in cur.fetchall()
            ]

            cur.execute("SELECT COUNT(*) FROM arabic_index")
            total = cur.fetchone()[0]

            return {
                "total": total,
                "offset": offset,
                "limit": limit,
                "results": results,
            }
    except Exception as e:
        logger.error(f"Failed to fetch arabic index: {e}")
        raise HTTPException(status_code=500, detail="Failed to fetch data")
    finally:
        conn.close()
|
|
|
|
@app.post("/api/process-queue")
def trigger_queue_processing(background_tasks: BackgroundTasks):
    """Manually schedule one background pass over the pending queue."""
    background_tasks.add_task(process_urls_from_queue)
    return {"message": "Queue processing triggered"}
|
|
|
|
@app.get("/api/stats")
def get_stats():
    """Aggregate counters: ingestion totals, index size, and queue breakdown.

    Raises HTTPException 503 without a DB connection, 500 on query failure.
    """
    conn = get_db_connection()
    if not conn:
        raise HTTPException(status_code=503, detail="Database service unavailable")

    try:
        with conn.cursor() as cur:
            def scalar(query):
                # Helper for single-value aggregate queries.
                cur.execute(query)
                return cur.fetchone()[0]

            total_ingested = scalar("SELECT COUNT(*) FROM search_ingest")
            total_processed = scalar("SELECT COUNT(*) FROM search_ingest WHERE processed = TRUE")
            total_arabic = scalar("SELECT COUNT(*) FROM arabic_index")

            cur.execute("""
                SELECT status, COUNT(*)
                FROM processing_queue
                GROUP BY status
            """)
            # Rows are (status, count) pairs — dict() maps them directly.
            queue_stats = dict(cur.fetchall())

        if total_ingested > 0:
            rate = f"{(total_processed/total_ingested*100):.1f}%"
        else:
            rate = "0%"

        return {
            "total_urls_ingested": total_ingested,
            "total_urls_processed": total_processed,
            "total_arabic_pages": total_arabic,
            "queue_status": queue_stats,
            "processing_rate": rate,
        }
    except Exception as e:
        logger.error(f"Failed to get stats: {e}")
        raise HTTPException(status_code=500, detail="Failed to get statistics")
    finally:
        conn.close()
|
|
|
|
# --- Run with: uvicorn main:app --reload --host 0.0.0.0 --port 8000