Update main.py
هذا الالتزام موجود في:
145
main.py
145
main.py
@@ -14,7 +14,7 @@ from datetime import datetime
|
|||||||
import logging
|
import logging
|
||||||
|
|
||||||
# --- Configuration ---
|
# --- Configuration ---
|
||||||
DATABASE_URL = os.getenv("DATABASE_URL", "")
|
DATABASE_URL = os.getenv("DATABASE_URL")
|
||||||
|
|
||||||
# --- Logging Setup ---
|
# --- Logging Setup ---
|
||||||
logging.basicConfig(level=logging.INFO)
|
logging.basicConfig(level=logging.INFO)
|
||||||
@@ -63,14 +63,13 @@ def init_database():
|
|||||||
id SERIAL PRIMARY KEY,
|
id SERIAL PRIMARY KEY,
|
||||||
keyword VARCHAR(500) NOT NULL,
|
keyword VARCHAR(500) NOT NULL,
|
||||||
url TEXT NOT NULL,
|
url TEXT NOT NULL,
|
||||||
url_hash VARCHAR(64) NOT NULL,
|
url_hash VARCHAR(64) NOT NULL UNIQUE,
|
||||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||||
processed BOOLEAN DEFAULT FALSE,
|
processed BOOLEAN DEFAULT FALSE
|
||||||
UNIQUE(url_hash)
|
|
||||||
)
|
)
|
||||||
""")
|
""")
|
||||||
|
|
||||||
# Create index for faster lookups
|
# Create indexes for faster lookups
|
||||||
cur.execute("""
|
cur.execute("""
|
||||||
CREATE INDEX IF NOT EXISTS idx_search_ingest_url_hash
|
CREATE INDEX IF NOT EXISTS idx_search_ingest_url_hash
|
||||||
ON search_ingest(url_hash)
|
ON search_ingest(url_hash)
|
||||||
@@ -94,27 +93,47 @@ def init_database():
|
|||||||
scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||||
last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||||
source_keyword VARCHAR(500),
|
source_keyword VARCHAR(500),
|
||||||
http_status INTEGER,
|
http_status INTEGER
|
||||||
INDEX(url_hash)
|
|
||||||
)
|
)
|
||||||
""")
|
""")
|
||||||
|
|
||||||
|
# Create indexes for arabic_index
|
||||||
|
cur.execute("""
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_arabic_index_url_hash
|
||||||
|
ON arabic_index(url_hash)
|
||||||
|
""")
|
||||||
|
|
||||||
|
cur.execute("""
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_arabic_index_scraped_at
|
||||||
|
ON arabic_index(scraped_at)
|
||||||
|
""")
|
||||||
|
|
||||||
# Create table for processing queue
|
# Create table for processing queue
|
||||||
cur.execute("""
|
cur.execute("""
|
||||||
CREATE TABLE IF NOT EXISTS processing_queue (
|
CREATE TABLE IF NOT EXISTS processing_queue (
|
||||||
id SERIAL PRIMARY KEY,
|
id SERIAL PRIMARY KEY,
|
||||||
search_ingest_id INTEGER REFERENCES search_ingest(id),
|
search_ingest_id INTEGER REFERENCES search_ingest(id) ON DELETE CASCADE,
|
||||||
url TEXT NOT NULL,
|
url TEXT NOT NULL,
|
||||||
url_hash VARCHAR(64) NOT NULL,
|
url_hash VARCHAR(64) NOT NULL UNIQUE,
|
||||||
status VARCHAR(50) DEFAULT 'pending',
|
status VARCHAR(50) DEFAULT 'pending',
|
||||||
attempts INTEGER DEFAULT 0,
|
attempts INTEGER DEFAULT 0,
|
||||||
last_attempt TIMESTAMP,
|
last_attempt TIMESTAMP,
|
||||||
error_message TEXT,
|
error_message TEXT,
|
||||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
||||||
UNIQUE(url_hash)
|
|
||||||
)
|
)
|
||||||
""")
|
""")
|
||||||
|
|
||||||
|
# Create indexes for processing_queue
|
||||||
|
cur.execute("""
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_processing_queue_status
|
||||||
|
ON processing_queue(status)
|
||||||
|
""")
|
||||||
|
|
||||||
|
cur.execute("""
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_processing_queue_url_hash
|
||||||
|
ON processing_queue(url_hash)
|
||||||
|
""")
|
||||||
|
|
||||||
conn.commit()
|
conn.commit()
|
||||||
logger.info("Database initialized successfully")
|
logger.info("Database initialized successfully")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -134,7 +153,7 @@ def contains_arabic(text: str) -> bool:
|
|||||||
Check if text contains Arabic characters.
|
Check if text contains Arabic characters.
|
||||||
Arabic Unicode block: U+0600 to U+06FF
|
Arabic Unicode block: U+0600 to U+06FF
|
||||||
"""
|
"""
|
||||||
if not text:
|
if not text or not isinstance(text, str):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# Arabic Unicode range pattern
|
# Arabic Unicode range pattern
|
||||||
@@ -145,7 +164,7 @@ def extract_arabic_content(text: str, max_length: int = 500) -> str:
|
|||||||
"""
|
"""
|
||||||
Extract Arabic content preview from text
|
Extract Arabic content preview from text
|
||||||
"""
|
"""
|
||||||
if not text:
|
if not text or not isinstance(text, str):
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
arabic_pattern = re.compile(r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]+')
|
arabic_pattern = re.compile(r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]+')
|
||||||
@@ -162,7 +181,7 @@ def calculate_arabic_score(text: str) -> float:
|
|||||||
"""
|
"""
|
||||||
Calculate the percentage of Arabic characters in the text
|
Calculate the percentage of Arabic characters in the text
|
||||||
"""
|
"""
|
||||||
if not text or len(text) == 0:
|
if not text or not isinstance(text, str) or len(text) == 0:
|
||||||
return 0.0
|
return 0.0
|
||||||
|
|
||||||
arabic_chars = len(re.findall(r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]', text))
|
arabic_chars = len(re.findall(r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]', text))
|
||||||
@@ -387,6 +406,7 @@ def ingest_search_data(payload: SearchPayload, background_tasks: BackgroundTasks
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
inserted_count = 0
|
inserted_count = 0
|
||||||
|
duplicate_count = 0
|
||||||
|
|
||||||
with conn.cursor() as cur:
|
with conn.cursor() as cur:
|
||||||
for url in payload.results:
|
for url in payload.results:
|
||||||
@@ -394,28 +414,37 @@ def ingest_search_data(payload: SearchPayload, background_tasks: BackgroundTasks
|
|||||||
url_hash = get_url_hash(url_str)
|
url_hash = get_url_hash(url_str)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Insert into search_ingest table (ignore duplicates)
|
# First check if URL already exists in search_ingest
|
||||||
cur.execute("""
|
cur.execute("""
|
||||||
INSERT INTO search_ingest (keyword, url, url_hash, created_at, processed)
|
SELECT id FROM search_ingest WHERE url_hash = %s
|
||||||
VALUES (%s, %s, %s, CURRENT_TIMESTAMP, FALSE)
|
""", (url_hash,))
|
||||||
ON CONFLICT (url_hash) DO NOTHING
|
|
||||||
RETURNING id
|
|
||||||
""", (payload.keyword, url_str, url_hash))
|
|
||||||
|
|
||||||
result = cur.fetchone()
|
existing = cur.fetchone()
|
||||||
|
|
||||||
if result:
|
if not existing:
|
||||||
# Also add to processing queue
|
# Insert into search_ingest table
|
||||||
cur.execute("""
|
cur.execute("""
|
||||||
INSERT INTO processing_queue (search_ingest_id, url, url_hash, status)
|
INSERT INTO search_ingest (keyword, url, url_hash, created_at, processed)
|
||||||
VALUES (%s, %s, %s, 'pending')
|
VALUES (%s, %s, %s, CURRENT_TIMESTAMP, FALSE)
|
||||||
ON CONFLICT (url_hash) DO NOTHING
|
RETURNING id
|
||||||
""", (result[0], url_str, url_hash))
|
""", (payload.keyword, url_str, url_hash))
|
||||||
|
|
||||||
inserted_count += 1
|
result = cur.fetchone()
|
||||||
|
|
||||||
|
if result:
|
||||||
|
# Also add to processing queue
|
||||||
|
cur.execute("""
|
||||||
|
INSERT INTO processing_queue (search_ingest_id, url, url_hash, status)
|
||||||
|
VALUES (%s, %s, %s, 'pending')
|
||||||
|
""", (result[0], url_str, url_hash))
|
||||||
|
|
||||||
|
inserted_count += 1
|
||||||
|
else:
|
||||||
|
duplicate_count += 1
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to insert URL {url_str}: {e}")
|
logger.error(f"Failed to insert URL {url_str}: {e}")
|
||||||
|
conn.rollback() # Rollback the failed transaction
|
||||||
continue
|
continue
|
||||||
|
|
||||||
conn.commit()
|
conn.commit()
|
||||||
@@ -429,7 +458,7 @@ def ingest_search_data(payload: SearchPayload, background_tasks: BackgroundTasks
|
|||||||
"keyword": payload.keyword,
|
"keyword": payload.keyword,
|
||||||
"total_received": len(payload.results),
|
"total_received": len(payload.results),
|
||||||
"new_urls": inserted_count,
|
"new_urls": inserted_count,
|
||||||
"duplicates_skipped": len(payload.results) - inserted_count
|
"duplicates_skipped": duplicate_count
|
||||||
}
|
}
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -471,7 +500,17 @@ def get_arabic_index(limit: int = 100, offset: int = 0):
|
|||||||
LIMIT %s OFFSET %s
|
LIMIT %s OFFSET %s
|
||||||
""", (limit, offset))
|
""", (limit, offset))
|
||||||
|
|
||||||
results = [dict(row) for row in cur.fetchall()]
|
results = []
|
||||||
|
for row in cur.fetchall():
|
||||||
|
results.append({
|
||||||
|
'id': row[0],
|
||||||
|
'url': row[1],
|
||||||
|
'title': row[2],
|
||||||
|
'meta_description': row[3],
|
||||||
|
'detection_score': row[4],
|
||||||
|
'scraped_at': row[5].isoformat() if row[5] else None,
|
||||||
|
'source_keyword': row[6]
|
||||||
|
})
|
||||||
|
|
||||||
# Get total count
|
# Get total count
|
||||||
cur.execute("SELECT COUNT(*) FROM arabic_index")
|
cur.execute("SELECT COUNT(*) FROM arabic_index")
|
||||||
@@ -497,4 +536,48 @@ def trigger_queue_processing(background_tasks: BackgroundTasks):
|
|||||||
background_tasks.add_task(process_urls_from_queue)
|
background_tasks.add_task(process_urls_from_queue)
|
||||||
return {"message": "Queue processing triggered"}
|
return {"message": "Queue processing triggered"}
|
||||||
|
|
||||||
# --- Run with: uvicorn main:app --reload
|
@app.get("/api/stats")
|
||||||
|
def get_stats():
|
||||||
|
"""
|
||||||
|
Get statistics about the index
|
||||||
|
"""
|
||||||
|
conn = get_db_connection()
|
||||||
|
if not conn:
|
||||||
|
raise HTTPException(status_code=503, detail="Database service unavailable")
|
||||||
|
|
||||||
|
try:
|
||||||
|
with conn.cursor() as cur:
|
||||||
|
# Get total URLs ingested
|
||||||
|
cur.execute("SELECT COUNT(*) FROM search_ingest")
|
||||||
|
total_ingested = cur.fetchone()[0]
|
||||||
|
|
||||||
|
# Get processed URLs
|
||||||
|
cur.execute("SELECT COUNT(*) FROM search_ingest WHERE processed = TRUE")
|
||||||
|
total_processed = cur.fetchone()[0]
|
||||||
|
|
||||||
|
# Get Arabic index count
|
||||||
|
cur.execute("SELECT COUNT(*) FROM arabic_index")
|
||||||
|
total_arabic = cur.fetchone()[0]
|
||||||
|
|
||||||
|
# Get queue stats
|
||||||
|
cur.execute("""
|
||||||
|
SELECT status, COUNT(*)
|
||||||
|
FROM processing_queue
|
||||||
|
GROUP BY status
|
||||||
|
""")
|
||||||
|
queue_stats = {row[0]: row[1] for row in cur.fetchall()}
|
||||||
|
|
||||||
|
return {
|
||||||
|
"total_urls_ingested": total_ingested,
|
||||||
|
"total_urls_processed": total_processed,
|
||||||
|
"total_arabic_pages": total_arabic,
|
||||||
|
"queue_status": queue_stats,
|
||||||
|
"processing_rate": f"{(total_processed/total_ingested*100):.1f}%" if total_ingested > 0 else "0%"
|
||||||
|
}
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to get stats: {e}")
|
||||||
|
raise HTTPException(status_code=500, detail="Failed to get statistics")
|
||||||
|
finally:
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
# --- Run with: uvicorn main:app --reload --host 0.0.0.0 --port 8000
|
||||||
المرجع في مشكلة جديدة
حظر مستخدم