import os

import PyPDF2
import requests
import uvicorn
from dotenv import load_dotenv
from fastapi import FastAPI, UploadFile, File, Form, HTTPException
from fastapi.responses import JSONResponse
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from openai import OpenAI

# Load environment variables
load_dotenv()

# --- Configuration ---
GITPASHA_HOST = "https://rag-app-fa66b3d8eb83.hosted.ghaymah.systems"

# (connect, read) timeouts in seconds for calls to the remote vector store.
# Without a timeout, `requests` waits indefinitely on a stalled server.
REMOTE_TIMEOUT_SECONDS = (10, 120)

# Initialize FastAPI app
app = FastAPI(
    title="Remote PDF Summarizer API",
    description="Upload a PDF and get a summary using a remote RAG pipeline.",
    version="2.1.0"
)

# Client for final summarization
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
    base_url="https://genai.ghaymah.systems"
)

# Use a local embedding model that matches the remote vector store's expected dimension
print("Initializing local embedding model (jinaai/jina-embeddings-v2-small-en)...")
embeddings = HuggingFaceEmbeddings(model_name="jinaai/jina-embeddings-v2-small-en")
print("Embedding model loaded.")


# --- Helper Functions ---

def extract_text_from_pdf(pdf_stream):
    """Extract the text of every page of a PDF and normalize its whitespace.

    Args:
        pdf_stream: Binary file-like object handed directly to
            ``PyPDF2.PdfReader``.

    Returns:
        The text of all pages with every run of whitespace collapsed to a
        single space. Empty string when no page yields any text.

    Raises:
        HTTPException: 500 when PyPDF2 fails to read or parse the stream.
    """
    print("Extracting text from PDF stream...")
    try:
        reader = PyPDF2.PdfReader(pdf_stream)
        page_texts = [page.extract_text() for page in reader.pages]
    except Exception as e:
        print(f"An error occurred while reading the PDF: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to read PDF content: {e}") from e
    print("Text extraction complete.")

    # Join pages with a newline so the last word of one page cannot fuse with
    # the first word of the next; the split/join below then collapses all
    # whitespace (including that newline) into single spaces.
    combined = "\n".join(page_text for page_text in page_texts if page_text)
    return ' '.join(combined.split())


def store_text_chunks_remote(text):
    """Split text, embed the chunks, and insert them into the remote vector store.

    Args:
        text: Plain text to index.

    Returns:
        ``True`` when the remote ``/insert`` call succeeds; ``False`` when
        ``text`` is empty and nothing was sent.

    Raises:
        HTTPException: 500 when embedding fails or the remote store cannot be
            reached / answers with an HTTP error; 502 when the store answers
            with an unexpected non-2xx status that is not an HTTP error.
    """
    if not text:
        print("Skipping storage: No text provided.")
        return False

    print("Splitting text into chunks...")
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = splitter.split_text(text)

    print(f"Creating embeddings for {len(chunks)} chunks...")
    try:
        # NOTE(review): the remote store presumably expects 512-dim vectors
        # (per the original author's comment) - confirm against the service.
        chunk_vectors = embeddings.embed_documents(chunks)
        payloads = [{"text_chunk": chunk} for chunk in chunks]
    except Exception as e:
        print(f"Failed to create embeddings: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to create text embeddings: {e}") from e

    print("Uploading vectors and payloads to remote GitPasha vector store...")
    try:
        response = requests.post(
            f"{GITPASHA_HOST}/insert",
            json={"vectors": chunk_vectors, "payloads": payloads},
            headers={"Content-Type": "application/json"},
            timeout=REMOTE_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Covers connection errors, timeouts, and 4xx/5xx from raise_for_status.
        print(f"An error occurred while calling the remote /insert API: {e}")
        raise HTTPException(status_code=500, detail=f"Error connecting to remote vector store: {e}") from e

    print(f"→ POST /insert: {response.status_code}")
    # Any 2xx is a success. Previously only 200 was accepted and e.g. a 201
    # was re-raised as an HTTPException carrying a *success* status code.
    if 200 <= response.status_code < 300:
        print("Upload complete ✅")
        return True
    raise HTTPException(
        status_code=502,
        detail=f"Failed to insert data into remote vector store: {response.text}",
    )


def get_summary_from_remote_rag(
    query: str,
    model: str = "DeepSeek-V3-0324"
):
    """Embed the query, retrieve context from the remote store, and ask the LLM.

    Args:
        query: The user's question / summarization instruction.
        model: Name of the chat model passed to the OpenAI-compatible client.

    Returns:
        The model's answer, or a fixed fallback message when the remote
        search returns no results.

    Raises:
        HTTPException: 500 when embedding, the remote search, or the LLM
            call fails.
    """
    print(f"Creating embedding for query: '{query}'")
    try:
        query_vector = embeddings.embed_query(query)
    except Exception as e:
        print(f"Failed to create query embedding: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to create query embedding: {e}") from e

    print("Retrieving relevant context from remote GitPasha vector store...")
    try:
        response = requests.post(
            f"{GITPASHA_HOST}/search",
            json={"vector": query_vector, "k": 4},
            headers={"Content-Type": "application/json"},
            timeout=REMOTE_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        search_results = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError: older `requests` releases raise a plain JSON decode
        # error that is not a RequestException.
        print(f"An error occurred while calling the remote /search API: {e}")
        raise HTTPException(status_code=500, detail=f"Error searching remote vector store: {e}") from e

    if not search_results or 'results' not in search_results or not search_results['results']:
        print("No relevant context found.")
        return "Could not find any relevant context to generate a summary for the query."

    context = "\n\n".join(result['payload']['text_chunk'] for result in search_results['results'])

    # Generate the final summary using the remote LLM
    print("Generating final summary using remote LLM...")
    try:
        completion_response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant that summarizes documents based on the provided context."},
                {"role": "user", "content": f"Based on the following context, please answer the question.\n\nContext:\n{context}\n\nQuestion: {query}"}
            ]
        )
        return completion_response.choices[0].message.content
    except Exception as e:
        print(f"An error occurred during final summarization: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to generate summary from AI model: {e}") from e


# --- API Endpoints ---

# NOTE(review): the helpers called below are synchronous (PDF parsing, local
# embedding, `requests` calls), so each request occupies the event loop until
# it finishes. Off-loading them to a thread pool would need confirmation that
# the embedding model tolerates concurrent use.
@app.post("/summarize/")
async def summarize_pdf(
    file: UploadFile = File(...),
    query: str = Form("Summarize the key points of this document.")
):
    """
    Accepts a PDF file and a query, then returns a summary.
    """
    if file.content_type != "application/pdf":
        raise HTTPException(status_code=400, detail="Invalid file type. Please upload a PDF.")

    try:
        # Extract text from the uploaded PDF file stream
        pdf_text = extract_text_from_pdf(file.file)

        # Store the text chunks and their embeddings in the remote vector store
        if not store_text_chunks_remote(pdf_text):
            raise HTTPException(status_code=500, detail="Failed to process and store the document in remote vector store.")

        # Query, retrieve context, and generate the summary using remote RAG
        summary = get_summary_from_remote_rag(query)

        return JSONResponse(content={"summary": summary})

    except HTTPException:
        # Re-raise HTTPException unchanged so FastAPI renders it as-is
        raise
    except Exception as e:
        # Catch any other unexpected errors
        print(f"An unexpected error occurred: {e}")
        raise HTTPException(status_code=500, detail=f"An unexpected server error occurred: {e}") from e


# Root endpoint: returns a static welcome message pointing at the /docs page.
@app.get("/")
def read_root():
    return {"message": "Welcome to the Remote PDF Summarizer API. Use /docs for documentation."}


# --- Main execution ---
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)