From 10e14509082d4eefb084f484c45a1ae45e418881 Mon Sep 17 00:00:00 2001
From: ghaymah_dev
Date: Sun, 15 Mar 2026 18:44:03 +0000
Subject: [PATCH] Update main.py
---
main.py | 145 ++++++++++++++++++++++++++++++++++++++++++++------------
1 file changed, 114 insertions(+), 31 deletions(-)
diff --git a/main.py b/main.py
index 85ce26a..5f58afe 100644
--- a/main.py
+++ b/main.py
@@ -14,7 +14,7 @@ from datetime import datetime
import logging
# --- Configuration ---
-DATABASE_URL = os.getenv("DATABASE_URL", "")
+DATABASE_URL = os.getenv("DATABASE_URL")
# --- Logging Setup ---
logging.basicConfig(level=logging.INFO)
@@ -63,14 +63,13 @@ def init_database():
id SERIAL PRIMARY KEY,
keyword VARCHAR(500) NOT NULL,
url TEXT NOT NULL,
- url_hash VARCHAR(64) NOT NULL,
+ url_hash VARCHAR(64) NOT NULL UNIQUE,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
- processed BOOLEAN DEFAULT FALSE,
- UNIQUE(url_hash)
+ processed BOOLEAN DEFAULT FALSE
)
""")
- # Create index for faster lookups
+ # Create indexes for faster lookups
cur.execute("""
CREATE INDEX IF NOT EXISTS idx_search_ingest_url_hash
ON search_ingest(url_hash)
@@ -94,27 +93,47 @@ def init_database():
scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
source_keyword VARCHAR(500),
- http_status INTEGER,
- INDEX(url_hash)
+ http_status INTEGER
)
""")
+ # Create indexes for arabic_index
+ cur.execute("""
+ CREATE INDEX IF NOT EXISTS idx_arabic_index_url_hash
+ ON arabic_index(url_hash)
+ """)
+
+ cur.execute("""
+ CREATE INDEX IF NOT EXISTS idx_arabic_index_scraped_at
+ ON arabic_index(scraped_at)
+ """)
+
# Create table for processing queue
cur.execute("""
CREATE TABLE IF NOT EXISTS processing_queue (
id SERIAL PRIMARY KEY,
- search_ingest_id INTEGER REFERENCES search_ingest(id),
+ search_ingest_id INTEGER REFERENCES search_ingest(id) ON DELETE CASCADE,
url TEXT NOT NULL,
- url_hash VARCHAR(64) NOT NULL,
+ url_hash VARCHAR(64) NOT NULL UNIQUE,
status VARCHAR(50) DEFAULT 'pending',
attempts INTEGER DEFAULT 0,
last_attempt TIMESTAMP,
error_message TEXT,
- created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
- UNIQUE(url_hash)
+ created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
""")
+ # Create indexes for processing_queue
+ cur.execute("""
+ CREATE INDEX IF NOT EXISTS idx_processing_queue_status
+ ON processing_queue(status)
+ """)
+
+ cur.execute("""
+ CREATE INDEX IF NOT EXISTS idx_processing_queue_url_hash
+ ON processing_queue(url_hash)
+ """)
+
conn.commit()
logger.info("Database initialized successfully")
except Exception as e:
@@ -134,7 +153,7 @@ def contains_arabic(text: str) -> bool:
Check if text contains Arabic characters.
Arabic Unicode block: U+0600 to U+06FF
"""
- if not text:
+ if not text or not isinstance(text, str):
return False
# Arabic Unicode range pattern
@@ -145,7 +164,7 @@ def extract_arabic_content(text: str, max_length: int = 500) -> str:
"""
Extract Arabic content preview from text
"""
- if not text:
+ if not text or not isinstance(text, str):
return ""
arabic_pattern = re.compile(r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]+')
@@ -162,7 +181,7 @@ def calculate_arabic_score(text: str) -> float:
"""
Calculate the percentage of Arabic characters in the text
"""
- if not text or len(text) == 0:
+ if not text or not isinstance(text, str) or len(text) == 0:
return 0.0
arabic_chars = len(re.findall(r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]', text))
@@ -387,6 +406,7 @@ def ingest_search_data(payload: SearchPayload, background_tasks: BackgroundTasks
try:
inserted_count = 0
+ duplicate_count = 0
with conn.cursor() as cur:
for url in payload.results:
@@ -394,28 +414,37 @@ def ingest_search_data(payload: SearchPayload, background_tasks: BackgroundTasks
url_hash = get_url_hash(url_str)
try:
- # Insert into search_ingest table (ignore duplicates)
+ # First check if URL already exists in search_ingest
cur.execute("""
- INSERT INTO search_ingest (keyword, url, url_hash, created_at, processed)
- VALUES (%s, %s, %s, CURRENT_TIMESTAMP, FALSE)
- ON CONFLICT (url_hash) DO NOTHING
- RETURNING id
- """, (payload.keyword, url_str, url_hash))
+ SELECT id FROM search_ingest WHERE url_hash = %s
+ """, (url_hash,))
- result = cur.fetchone()
+ existing = cur.fetchone()
- if result:
- # Also add to processing queue
+ if not existing:
+ # Insert into search_ingest table
cur.execute("""
- INSERT INTO processing_queue (search_ingest_id, url, url_hash, status)
- VALUES (%s, %s, %s, 'pending')
- ON CONFLICT (url_hash) DO NOTHING
- """, (result[0], url_str, url_hash))
+ INSERT INTO search_ingest (keyword, url, url_hash, created_at, processed)
+ VALUES (%s, %s, %s, CURRENT_TIMESTAMP, FALSE)
+ RETURNING id
+ """, (payload.keyword, url_str, url_hash))
- inserted_count += 1
+ result = cur.fetchone()
+
+ if result:
+ # Also add to processing queue
+ cur.execute("""
+ INSERT INTO processing_queue (search_ingest_id, url, url_hash, status)
+ VALUES (%s, %s, %s, 'pending')
+ """, (result[0], url_str, url_hash))
+
+ inserted_count += 1
+ else:
+ duplicate_count += 1
except Exception as e:
logger.error(f"Failed to insert URL {url_str}: {e}")
+ conn.rollback() # Rollback the failed transaction
continue
conn.commit()
@@ -429,7 +458,7 @@ def ingest_search_data(payload: SearchPayload, background_tasks: BackgroundTasks
"keyword": payload.keyword,
"total_received": len(payload.results),
"new_urls": inserted_count,
- "duplicates_skipped": len(payload.results) - inserted_count
+ "duplicates_skipped": duplicate_count
}
except Exception as e:
@@ -471,7 +500,17 @@ def get_arabic_index(limit: int = 100, offset: int = 0):
LIMIT %s OFFSET %s
""", (limit, offset))
- results = [dict(row) for row in cur.fetchall()]
+ results = []
+ for row in cur.fetchall():
+ results.append({
+ 'id': row[0],
+ 'url': row[1],
+ 'title': row[2],
+ 'meta_description': row[3],
+ 'detection_score': row[4],
+ 'scraped_at': row[5].isoformat() if row[5] else None,
+ 'source_keyword': row[6]
+ })
# Get total count
cur.execute("SELECT COUNT(*) FROM arabic_index")
@@ -497,4 +536,48 @@ def trigger_queue_processing(background_tasks: BackgroundTasks):
background_tasks.add_task(process_urls_from_queue)
return {"message": "Queue processing triggered"}
-# --- Run with: uvicorn main:app --reload
\ No newline at end of file
+@app.get("/api/stats")
+def get_stats():
+ """
+ Get statistics about the index
+ """
+ conn = get_db_connection()
+ if not conn:
+ raise HTTPException(status_code=503, detail="Database service unavailable")
+
+ try:
+ with conn.cursor() as cur:
+ # Get total URLs ingested
+ cur.execute("SELECT COUNT(*) FROM search_ingest")
+ total_ingested = cur.fetchone()[0]
+
+ # Get processed URLs
+ cur.execute("SELECT COUNT(*) FROM search_ingest WHERE processed = TRUE")
+ total_processed = cur.fetchone()[0]
+
+ # Get Arabic index count
+ cur.execute("SELECT COUNT(*) FROM arabic_index")
+ total_arabic = cur.fetchone()[0]
+
+ # Get queue stats
+ cur.execute("""
+ SELECT status, COUNT(*)
+ FROM processing_queue
+ GROUP BY status
+ """)
+ queue_stats = {row[0]: row[1] for row in cur.fetchall()}
+
+ return {
+ "total_urls_ingested": total_ingested,
+ "total_urls_processed": total_processed,
+ "total_arabic_pages": total_arabic,
+ "queue_status": queue_stats,
+ "processing_rate": f"{(total_processed/total_ingested*100):.1f}%" if total_ingested > 0 else "0%"
+ }
+ except Exception as e:
+ logger.error(f"Failed to get stats: {e}")
+ raise HTTPException(status_code=500, detail="Failed to get statistics")
+ finally:
+ conn.close()
+
+# --- Run with: uvicorn main:app --reload --host 0.0.0.0 --port 8000
\ No newline at end of file