# PDFSummerizorV1/app2.py

import os
import uvicorn
import requests
from openai import OpenAI
import PyPDF2
from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from fastapi import FastAPI, UploadFile, File, Form, HTTPException
from fastapi.responses import JSONResponse

# Load environment variables (python-dotenv) before anything below reads
# os.environ -- the OpenAI client further down depends on OPENAI_API_KEY.
load_dotenv()

# --- Configuration ---
# Base URL of the remote vector-store service. The helper functions in this
# file POST to its /insert and /search routes.
GITPASHA_HOST = "https://rag-app-fa66b3d8eb83.hosted.ghaymah.systems"

# Initialize FastAPI app (title/description/version are passed to FastAPI as metadata)
app = FastAPI(
    title="Remote PDF Summarizer API",
    description="Upload a PDF and get a summary using a remote RAG pipeline.",
    version="2.1.0"
)

# Client for final summarization. It is pointed at a custom base_url rather
# than the library default. os.environ.get yields None when OPENAI_API_KEY is
# unset -- NOTE(review): confirm how the OpenAI client reacts to a None key.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
    base_url="https://genai.ghaymah.systems"
)

# Use a local embedding model that matches the remote vector store's expected dimension
# (presumably 512, per the comment in store_text_chunks_remote -- TODO confirm).
# The model object is created at import time, so importing this module
# triggers the load; the two prints bracket that step.
print("Initializing local embedding model (jinaai/jina-embeddings-v2-small-en)...")
embeddings = HuggingFaceEmbeddings(model_name="jinaai/jina-embeddings-v2-small-en")
print("Embedding model loaded.")

# --- Helper Functions ---

def extract_text_from_pdf(pdf_stream):
    """Extract whitespace-normalized plain text from a PDF file stream.

    Args:
        pdf_stream: A binary file-like object containing a PDF document
            (whatever ``PyPDF2.PdfReader`` accepts).

    Returns:
        The text of all pages as one string in which every run of whitespace
        (including line breaks) is collapsed to a single space. Empty string
        when no page yields any text.

    Raises:
        HTTPException: Status 500 if the PDF cannot be opened or parsed.
    """
    print("Extracting text from PDF stream...")
    try:
        reader = PyPDF2.PdfReader(pdf_stream)
        # A page may yield no text at all; normalize that to "" so the join
        # below never sees None.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    except Exception as e:
        print(f"An error occurred while reading the PDF: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to read PDF content: {e}") from e

    print("Text extraction complete.")
    # Pages are joined with a space so the last word of one page does not fuse
    # with the first word of the next; split()/join then collapses all
    # whitespace runs for pure plain text.
    return " ".join(" ".join(page_texts).split())

def store_text_chunks_remote(text, *, timeout=60.0):
    """Split text into chunks, embed them, and upload them to the remote vector store.

    Args:
        text: Plain text to index. Falsy input is skipped.
        timeout: Seconds to wait for the remote ``/insert`` call before giving
            up. Without a timeout ``requests`` would wait indefinitely.

    Returns:
        True when the remote service accepted the upload; False when there was
        nothing to store (no text, or the splitter produced no chunks).

    Raises:
        HTTPException: Status 500 if embedding creation fails, or if the
            remote call fails (connection error, timeout, or 4xx/5xx reply).
    """
    if not text:
        print("Skipping storage: No text provided.")
        return False

    print("Splitting text into chunks...")
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = splitter.split_text(text)
    if not chunks:
        # Nothing to embed; avoid posting an empty batch to the remote store.
        print("Skipping storage: No text provided.")
        return False

    print(f"Creating embeddings for {len(chunks)} chunks...")
    try:
        # NOTE(review): the remote store presumably expects 512-dim vectors
        # (per the original comment here) -- confirm against the service.
        chunk_vectors = embeddings.embed_documents(chunks)
    except Exception as e:
        print(f"Failed to create embeddings: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to create text embeddings: {e}") from e

    # One payload per vector, in the same order as the chunks.
    payloads = [{"text_chunk": chunk} for chunk in chunks]

    print("Uploading vectors and payloads to remote GitPasha vector store...")
    try:
        response = requests.post(
            f"{GITPASHA_HOST}/insert",
            json={"vectors": chunk_vectors, "payloads": payloads},
            headers={"Content-Type": "application/json"},
            timeout=timeout,
        )
        # Raises HTTPError (a RequestException) for any 4xx/5xx reply.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"An error occurred while calling the remote /insert API: {e}")
        raise HTTPException(status_code=500, detail=f"Error connecting to remote vector store: {e}") from e

    # Any status that passes raise_for_status() counts as success; a 201/204
    # must not be reported back to the client as a failure with a 2xx code.
    print(f"→ POST /insert: {response.status_code}")
    print("Upload complete ✅")
    return True

def get_summary_from_remote_rag(
    query: str,
    model: str = "DeepSeek-V3-0324",
    *,
    top_k: int = 4,
    timeout: float = 60.0,
):
    """Embed the query, fetch matching chunks from the remote store, and ask the LLM.

    Args:
        query: The question or instruction to answer from the stored context.
        model: Model name passed to the chat-completions call.
        top_k: Number of nearest chunks requested from the remote ``/search``
            route (sent as ``k``).
        timeout: Seconds to wait for the remote ``/search`` call.

    Returns:
        The content of the first completion choice, or a fixed explanatory
        message when the search yields no usable context.

    Raises:
        HTTPException: Status 500 if the query embedding, the remote search
            (including an unparseable reply), or the LLM call fails.
    """
    print(f"Creating embedding for query: '{query}'")
    try:
        query_vector = embeddings.embed_query(query)
    except Exception as e:
        print(f"Failed to create query embedding: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to create query embedding: {e}") from e

    print("Retrieving relevant context from remote GitPasha vector store...")
    try:
        response = requests.post(
            f"{GITPASHA_HOST}/search",
            json={"vector": query_vector, "k": top_k},
            headers={"Content-Type": "application/json"},
            timeout=timeout,
        )
        response.raise_for_status()
        search_results = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # JSON decode error is not a RequestException.
        print(f"An error occurred while calling the remote /search API: {e}")
        raise HTTPException(status_code=500, detail=f"Error searching remote vector store: {e}") from e

    # Collect only well-formed hits; a malformed entry is skipped instead of
    # aborting the whole request with a KeyError/TypeError.
    results = search_results.get("results") if isinstance(search_results, dict) else None
    context_chunks = []
    if isinstance(results, list):
        for result in results:
            payload = result.get("payload") if isinstance(result, dict) else None
            chunk = payload.get("text_chunk") if isinstance(payload, dict) else None
            if isinstance(chunk, str) and chunk:
                context_chunks.append(chunk)

    if not context_chunks:
        print("No relevant context found.")
        return "Could not find any relevant context to generate a summary for the query."

    context = "\n\n".join(context_chunks)

    # Generate the final summary using the remote LLM
    print("Generating final summary using remote LLM...")
    try:
        completion_response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant that summarizes documents based on the provided context."},
                {"role": "user", "content": f"Based on the following context, please answer the question.\n\nContext:\n{context}\n\nQuestion: {query}"}
            ]
        )
        return completion_response.choices[0].message.content
    except Exception as e:
        print(f"An error occurred during final summarization: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to generate summary from AI model: {e}") from e

# --- API Endpoints ---

@app.post("/summarize/")
async def summarize_pdf(
    file: UploadFile = File(...),
    query: str = Form("Summarize the key points of this document.")
):
    """Accept a PDF upload and a query, and return a summary as JSON.

    The three pipeline steps (text extraction, embedding + upload, retrieval +
    LLM call) are all blocking, so each is run in the worker thread pool to
    keep the event loop free for other requests.

    Args:
        file: The uploaded file; its content type must be ``application/pdf``.
        query: The question or instruction used for retrieval and for the
            final LLM prompt.

    Returns:
        A JSONResponse of the form ``{"summary": ...}``.

    Raises:
        HTTPException: Status 400 for a non-PDF upload or a PDF with no
            extractable text; status 500 for storage or unexpected failures.
            HTTPExceptions raised by the helper functions pass through as-is.
    """
    # Local import keeps this block self-contained; fastapi is already a
    # dependency of this module.
    from fastapi.concurrency import run_in_threadpool

    if file.content_type != "application/pdf":
        raise HTTPException(status_code=400, detail="Invalid file type. Please upload a PDF.")

    try:
        # Extract text from the uploaded PDF file stream
        pdf_text = await run_in_threadpool(extract_text_from_pdf, file.file)

        # A PDF without any extractable text is a problem with the upload,
        # not a server fault, so report it as a client error.
        if not pdf_text:
            raise HTTPException(status_code=400, detail="No extractable text found in the uploaded PDF.")

        # Store the text chunks and their embeddings in the remote vector store
        stored = await run_in_threadpool(store_text_chunks_remote, pdf_text)
        if not stored:
            raise HTTPException(status_code=500, detail="Failed to process and store the document in remote vector store.")

        # Query, retrieve context, and generate the summary using remote RAG
        summary = await run_in_threadpool(get_summary_from_remote_rag, query)

        return JSONResponse(content={"summary": summary})

    except HTTPException:
        # Re-raise HTTPException untouched so FastAPI handles it
        raise
    except Exception as e:
        # Catch any other unexpected errors
        print(f"An unexpected error occurred: {e}")
        raise HTTPException(status_code=500, detail=f"An unexpected server error occurred: {e}") from e

@app.get("/")
def read_root():
    """Return a welcome message that points callers at the /docs page."""
    welcome_text = "Welcome to the Remote PDF Summarizer API. Use /docs for documentation."
    return dict(message=welcome_text)

# --- Main execution ---

if __name__ == "__main__":
    # Direct execution: serve the app with uvicorn on all interfaces, port 8000.
    bind_host, bind_port = "0.0.0.0", 8000
    uvicorn.run(app, host=bind_host, port=bind_port)