"""Summarize a PDF using a remote vector store and a chat model.

Pipeline: extract the PDF text, split it into chunks, embed the chunks and
upload them to the vector store, then embed the user's query, fetch the
chunks returned by the store's ``/search`` endpoint and ask the chat model
to answer the query from that context.
"""

import argparse
import json  # noqa: F401  (unused here; kept so the file's import set is unchanged)
import os
from typing import Optional

import PyPDF2
import requests
from dotenv import load_dotenv
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from openai import OpenAI

# Load a local .env file before the clients below read the environment.
load_dotenv()

# --- Configuration ---
GITPASHA_HOST = "https://serverless-store-77838979b96f.hosted.ghaymah.systems"

# (connect, read) timeouts in seconds. ``requests`` never times out by
# default, so a stalled server would otherwise hang the script forever.
REQUEST_TIMEOUT = (10, 120)

# Client for final summarization
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
    base_url="https://genai.ghaymah.systems",
)

# Client for creating embeddings
embeddings = OpenAIEmbeddings(
    openai_api_key=os.environ.get("OPENAI_API_KEY"),
    openai_api_base="https://genai.ghaymah.systems",
)


def extract_text_from_pdf(pdf_path) -> Optional[str]:
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        The text of all pages that yielded any text, joined with newlines
        (possibly an empty string), or ``None`` if the file is missing or
        could not be read.
    """
    print(f"Extracting text from {pdf_path}...")
    page_texts = []
    try:
        with open(pdf_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            for page in reader.pages:
                page_text = page.extract_text()
                if page_text:
                    page_texts.append(page_text)
    except FileNotFoundError:
        print(f"Error: The file at {pdf_path} was not found.")
        return None
    except Exception as e:
        # Top-level boundary: any parser failure is reported, not raised.
        print(f"An error occurred while reading the PDF: {e}")
        return None

    print("Text extraction complete.")
    # Join with a newline so the last word of one page is not glued to the
    # first word of the next.
    return "\n".join(page_texts)


def store_text_chunks(text) -> bool:
    """Split text, create embeddings, and store them in GitPasha.

    Args:
        text: The document text to split, embed and upload.

    Returns:
        ``True`` if the ``/insert`` request succeeded, ``False`` if there was
        nothing to store, embedding failed, or the request failed.
    """
    print("Splitting text into chunks...")
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = splitter.split_text(text)
    if not chunks:
        # E.g. whitespace-only text: nothing worth embedding or uploading.
        print("No text chunks were produced; nothing to store.")
        return False

    print(f"Creating embeddings for {len(chunks)} chunks...")
    try:
        chunk_vectors = embeddings.embed_documents(chunks)
    except Exception as e:
        print(f"Failed to create embeddings: {e}")
        return False
    payloads = [{"text_chunk": chunk} for chunk in chunks]

    print("Uploading vectors and payloads to GitPasha...")
    try:
        response = requests.post(
            f"{GITPASHA_HOST}/insert",
            json={"vectors": chunk_vectors, "payloads": payloads},
            headers={"Content-Type": "application/json"},
            timeout=REQUEST_TIMEOUT,
        )
        response.raise_for_status()  # Raise an exception for bad status codes
    except requests.exceptions.RequestException as e:
        print(f"An error occurred while calling the /insert API: {e}")
        return False

    print("→ POST /insert:", response.status_code, response.text)
    # Accept any 2xx status: an insert endpoint may answer 201/204 rather
    # than exactly 200.
    if 200 <= response.status_code < 300:
        print("Upload complete ✅")
        return True
    print(
        f"Failed to insert data. Status: {response.status_code}, "
        f"Response: {response.text}"
    )
    return False


def _build_context(results) -> str:
    """Join the ``text_chunk`` payloads of the well-formed search results."""
    pieces = []
    for result in results:
        payload = result.get("payload") if isinstance(result, dict) else None
        chunk = payload.get("text_chunk") if isinstance(payload, dict) else None
        if isinstance(chunk, str) and chunk:
            pieces.append(chunk)
    return "\n\n".join(pieces)


def summarize_with_context(query, model="DeepSeek-V3-0324") -> Optional[str]:
    """Create a query embedding, search GitPasha, and summarize.

    Args:
        query: The question to answer from the stored document chunks.
        model: Name of the chat model used for the final answer.

    Returns:
        The model's answer, a fixed explanatory message when the search
        returned no usable context, or ``None`` if embedding, searching or
        the chat completion failed.
    """
    print(f"Creating embedding for query: '{query}'")
    try:
        query_vector = embeddings.embed_query(query)
    except Exception as e:
        print(f"Failed to create query embedding: {e}")
        return None

    print("Retrieving relevant context from GitPasha...")
    try:
        response = requests.post(
            f"{GITPASHA_HOST}/search",
            json={"vector": query_vector, "k": 4},
            headers={"Content-Type": "application/json"},
            timeout=REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        print("→ POST /search:", response.status_code)
        search_results = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # JSONDecodeError does not derive from RequestException.
        print(f"An error occurred while calling the /search API: {e}")
        return None

    results = search_results.get("results") if isinstance(search_results, dict) else None
    context = _build_context(results) if isinstance(results, list) else ""
    if not context:
        # Covers a missing/empty "results" list and results without text, so
        # the model is never asked to answer from an empty context.
        print("No relevant context found.")
        return "Could not find any relevant context to generate a summary."

    print("Generating final summary...")
    try:
        completion = client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant that summarizes documents based on the provided context.",
                },
                {
                    "role": "user",
                    "content": f"Based on the following context, please answer the question.\n\nContext:\n{context}\n\nQuestion: {query}",
                },
            ],
        )
        return completion.choices[0].message.content
    except Exception as e:
        print(f"An error occurred during final summarization: {e}")
        return None


def main() -> int:
    """Run the extract -> store -> summarize pipeline from the command line.

    Returns:
        ``0`` when a summary was printed, ``1`` when any stage failed, so the
        process exit status reflects the outcome.
    """
    parser = argparse.ArgumentParser(
        description="Summarize a PDF using a remote Serverless Vector Store + AI."
    )
    parser.add_argument("pdf_path", help="Path to the PDF file.")
    parser.add_argument(
        "--query",
        help="Custom question about the PDF.",
        default="Summarize the key points of this document.",
    )
    args = parser.parse_args()

    # 1. Extract text from the PDF
    pdf_text = extract_text_from_pdf(args.pdf_path)
    if not pdf_text:
        print("Aborting due to empty text from PDF.")
        return 1

    # 2. Store the text chunks and their embeddings
    if not store_text_chunks(pdf_text):
        print("Aborting due to failure in storing document.")
        return 1

    # 3. Query, retrieve context, and summarize
    summary = summarize_with_context(args.query)
    if not summary:
        return 1

    print("\n--- Contextual Summary ---")
    print(summary)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())