From bbc0180a426daf980107fc3a1b7ae3e72fafa54b Mon Sep 17 00:00:00 2001
From: MemaroX
Date: Tue, 9 Sep 2025 19:36:43 +0300
Subject: [PATCH] commit 1

---
 app2.py | 179 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 temp.py |  28 +++++++++
 2 files changed, 207 insertions(+)
 create mode 100644 app2.py
 create mode 100644 temp.py

diff --git a/app2.py b/app2.py
new file mode 100644
index 0000000..0d99d84
--- /dev/null
+++ b/app2.py
@@ -0,0 +1,179 @@
+import os
+import uvicorn
+import requests
+from openai import OpenAI
+import PyPDF2
+from dotenv import load_dotenv
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from fastapi import FastAPI, UploadFile, File, Form, HTTPException
+from fastapi.responses import JSONResponse
+
+# Load environment variables
+load_dotenv()
+
+# --- Configuration ---
+GITPASHA_HOST = "https://rag-app-fa66b3d8eb83.hosted.ghaymah.systems"
+
+# Initialize FastAPI app
+app = FastAPI(
+    title="Remote PDF Summarizer API",
+    description="Upload a PDF and get a summary using a remote RAG pipeline.",
+    version="2.1.0"
+)
+
+# Client for final summarization
+client = OpenAI(
+    api_key=os.environ.get("OPENAI_API_KEY"),
+    base_url="https://genai.ghaymah.systems"
+)
+
+# Use a local embedding model that matches the remote vector store's expected dimension
+print("Initializing local embedding model (jinaai/jina-embeddings-v2-small-en)...")
+embeddings = HuggingFaceEmbeddings(model_name="jinaai/jina-embeddings-v2-small-en")
+print("Embedding model loaded.")
+
+# --- Helper Functions ---
+
+def extract_text_from_pdf(pdf_stream):
+    """Extracts text from a PDF file stream and cleans it."""
+    print("Extracting text from PDF stream...")
+    text = ""
+    try:
+        reader = PyPDF2.PdfReader(pdf_stream)
+        for page in reader.pages:
+            page_text = page.extract_text()
+            if page_text:
+                text += page_text
+        print("Text extraction complete.")
+        # Clean the extracted text for pure plain text
+        text = ' '.join(text.split())
+        return text
+    except Exception as e:
+        print(f"An error occurred while reading the PDF: {e}")
+        raise HTTPException(status_code=500, detail=f"Failed to read PDF content: {e}")
+
+def store_text_chunks_remote(text):
+    """Splits text, creates embeddings, and stores them in the remote GitPasha vector store."""
+    if not text:
+        print("Skipping storage: No text provided.")
+        return False
+    print("Splitting text into chunks...")
+    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+    chunks = splitter.split_text(text)
+
+    print(f"Creating embeddings for {len(chunks)} chunks...")
+    try:
+        # Ensure the embedding model produces 512-dim vectors
+        chunk_vectors = embeddings.embed_documents(chunks)
+        payloads = [{"text_chunk": chunk} for chunk in chunks]
+    except Exception as e:
+        print(f"Failed to create embeddings: {e}")
+        raise HTTPException(status_code=500, detail=f"Failed to create text embeddings: {e}")
+
+    print("Uploading vectors and payloads to remote GitPasha vector store...")
+    try:
+        response = requests.post(
+            f"{GITPASHA_HOST}/insert",
+            json={"vectors": chunk_vectors, "payloads": payloads},
+            headers={"Content-Type": "application/json"}
+        )
+        response.raise_for_status()
+        print(f"→ POST /insert: {response.status_code}")
+        if response.status_code == 200:
+            print("Upload complete ✅")
+            return True
+        else:
+            raise HTTPException(status_code=response.status_code, detail=f"Failed to insert data into remote vector store: {response.text}")
+    except requests.exceptions.RequestException as e:
+        print(f"An error occurred while calling the remote /insert API: {e}")
+        raise HTTPException(status_code=500, detail=f"Error connecting to remote vector store: {e}")
+
+def get_summary_from_remote_rag(
+    query: str,
+    model: str = "DeepSeek-V3-0324"
+):
+    """Creates a query embedding, searches remote GitPasha, and summarizes."""
+    print(f"Creating embedding for query: '{query}'")
+    try:
+        query_vector = embeddings.embed_query(query)
+    except Exception as e:
+        print(f"Failed to create query embedding: {e}")
+        raise HTTPException(status_code=500, detail=f"Failed to create query embedding: {e}")
+
+    print("Retrieving relevant context from remote GitPasha vector store...")
+    try:
+        response = requests.post(
+            f"{GITPASHA_HOST}/search",
+            json={"vector": query_vector, "k": 4},
+            headers={"Content-Type": "application/json"}
+        )
+        response.raise_for_status()
+        search_results = response.json()
+    except requests.exceptions.RequestException as e:
+        print(f"An error occurred while calling the remote /search API: {e}")
+        raise HTTPException(status_code=500, detail=f"Error searching remote vector store: {e}")
+
+    if not search_results or 'results' not in search_results or not search_results['results']:
+        print("No relevant context found.")
+        return "Could not find any relevant context to generate a summary for the query."
+
+    context = "\n\n".join([result['payload']['text_chunk'] for result in search_results['results']])
+
+    # Generate the final summary using the remote LLM
+    print("Generating final summary using remote LLM...")
+    try:
+        completion_response = client.chat.completions.create(
+            model=model,
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant that summarizes documents based on the provided context."},
+                {"role": "user", "content": f"Based on the following context, please answer the question.\n\nContext:\n{context}\n\nQuestion: {query}"}
+            ]
+        )
+        return completion_response.choices[0].message.content
+    except Exception as e:
+        print(f"An error occurred during final summarization: {e}")
+        raise HTTPException(status_code=500, detail=f"Failed to generate summary from AI model: {e}")
+
+# --- API Endpoints ---
+
+@app.post("/summarize/")
+async def summarize_pdf(
+    file: UploadFile = File(...),
+    query: str = Form("Summarize the key points of this document.")
+):
+    """
+    Accepts a PDF file and a query, then returns a summary.
+    """
+    if file.content_type != "application/pdf":
+        raise HTTPException(status_code=400, detail="Invalid file type. Please upload a PDF.")
+
+    try:
+        # Extract text from the uploaded PDF file stream
+        pdf_text = extract_text_from_pdf(file.file)
+
+        # Store the text chunks and their embeddings in the remote vector store
+        if not store_text_chunks_remote(pdf_text):
+            raise HTTPException(status_code=500, detail="Failed to process and store the document in remote vector store.")
+
+        # Query, retrieve context, and generate the summary using remote RAG
+        summary = get_summary_from_remote_rag(query)
+
+        return JSONResponse(content={"summary": summary})
+
+    except HTTPException as e:
+        # Re-raise HTTPException to be handled by FastAPI
+        raise e
+    except Exception as e:
+        # Catch any other unexpected errors
+        print(f"An unexpected error occurred: {e}")
+        raise HTTPException(status_code=500, detail=f"An unexpected server error occurred: {e}")
+
+@app.get("/")
+def read_root():
+    return {"message": "Welcome to the Remote PDF Summarizer API. Use /docs for documentation."}
+
+# --- Main execution ---
+
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=8000)
diff --git a/temp.py b/temp.py
new file mode 100644
index 0000000..967e9ee
--- /dev/null
+++ b/temp.py
@@ -0,0 +1,28 @@
+import requests
+import json
+import random
+
+HOST = "https://rag-app-fa66b3d8eb83.hosted.ghaymah.systems"
+N_DIM = 512
+
+def random_vector():
+    """Generates a random 512-dimensional vector."""
+    return [random.random() for _ in range(N_DIM)]
+
+# Create a sample 512-dimensional vector and a dummy payload
+vectors_to_insert = [random_vector()]
+payloads = [{"test_data": "This is a test payload for 512-dim vector."}]
+print(vectors_to_insert)
+print(f"Attempting to send a {N_DIM}-dimensional vector to {HOST}/insert...")
+
+try:
+    insert_resp = requests.post(
+        f"{HOST}/insert",
+        json={"vectors": vectors_to_insert, "payloads": payloads},
+        headers={"Content-Type": "application/json"}
+    )
+    insert_resp.raise_for_status()  # Raise an exception for bad status codes
+    print("Response Status Code:", insert_resp.status_code)
+    print("Response Body:", insert_resp.text)
+except requests.exceptions.RequestException as e:
+    print(f"An error occurred: {e}")
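
A minimal client-side sketch (outside the patch itself) of how the new /summarize/ endpoint added in app2.py could be exercised. It assumes the app is running locally via the uvicorn call above (host 0.0.0.0, port 8000); the file name sample.pdf is illustrative only.

import requests

# Upload a PDF and request a summary from the /summarize/ endpoint.
# "sample.pdf" is a placeholder; the content type must be application/pdf
# or the endpoint rejects the upload with a 400.
with open("sample.pdf", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/summarize/",
        files={"file": ("sample.pdf", f, "application/pdf")},
        data={"query": "Summarize the key points of this document."},
    )
resp.raise_for_status()
print(resp.json()["summary"])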