commit 6639d3f4c27ebbcb2313a518db9f5fe87c24a5a6
Author: MemaroX
Date:   Tue Sep 9 13:23:54 2025 +0300

    Initial commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..66bca12
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+.env
+venv/
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e92e64e
--- /dev/null
+++ b/README.md
@@ -0,0 +1,21 @@
+# PDF Summarizer V1
+
+This project is a command-line tool that summarizes PDF documents using a remote serverless vector store and an AI model: the PDF is split into chunks, embedded, and the most relevant chunks are retrieved to ground the summary.
+
+## How it works
+
+1. **Extracts Text**: The tool first extracts the text content from the provided PDF file (using PyPDF2).
+2. **Chunks and Embeds**: The text is split into overlapping chunks, turned into embeddings, and uploaded to a remote serverless vector store (GitPasha).
+3. **Retrieves and Summarizes**: The query is embedded, the most relevant chunks are retrieved from the store, and a chat model (e.g., `DeepSeek-V3-0324`) generates the final summary from that context.
+
+## How to use
+
+1. Install the required dependencies:
+   ```bash
+   pip install -r requirements.txt
+   ```
+2. Place your PDF file in the `uploads` directory.
+3. Run the application from your terminal (add `--query "Your question"` to ask a custom question instead of the default summary prompt):
+   ```bash
+   python app.py uploads/your_file.pdf
+   ```
diff --git a/app.py b/app.py
new file mode 100644
index 0000000..0a60c54
--- /dev/null
+++ b/app.py
@@ -0,0 +1,148 @@
+import os
+import argparse
+import requests
+import json
+from openai import OpenAI
+import PyPDF2
+from dotenv import load_dotenv
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.embeddings.openai import OpenAIEmbeddings
+
+load_dotenv()
+
+# --- Configuration ---
+GITPASHA_HOST = "https://serverless-store-77838979b96f.hosted.ghaymah.systems"
+
+# Client for final summarization
+client = OpenAI(
+    api_key=os.environ.get("OPENAI_API_KEY"),
+    base_url="https://genai.ghaymah.systems"
+)
+
+# Client for creating embeddings
+embeddings = OpenAIEmbeddings(
+    openai_api_key=os.environ.get("OPENAI_API_KEY"),
+    openai_api_base="https://genai.ghaymah.systems"
+)
+
+def extract_text_from_pdf(pdf_path):
+    """Extracts text from a PDF file."""
+    print(f"Extracting text from {pdf_path}...")
+    text = ""
+    try:
+        with open(pdf_path, "rb") as f:
+            reader = PyPDF2.PdfReader(f)
+            for page in reader.pages:
+                page_text = page.extract_text()
+                if page_text:
+                    text += page_text
+    except FileNotFoundError:
+        print(f"Error: The file at {pdf_path} was not found.")
+        return None
+    except Exception as e:
+        print(f"An error occurred while reading the PDF: {e}")
+        return None
+    print("Text extraction complete.")
+    return text
+
+def store_text_chunks(text):
+    """Splits text, creates embeddings, and stores them in GitPasha."""
+    print("Splitting text into chunks...")
+    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+    chunks = splitter.split_text(text)
+
+    print(f"Creating embeddings for {len(chunks)} chunks...")
+    try:
+        chunk_vectors = embeddings.embed_documents(chunks)
+        payloads = [{"text_chunk": chunk} for chunk in chunks]
+    except Exception as e:
+        print(f"Failed to create embeddings: {e}")
+        return False
+
+    print("Uploading vectors and payloads to GitPasha...")
+    try:
+        response = requests.post(
+            f"{GITPASHA_HOST}/insert",
+            json={"vectors": chunk_vectors, "payloads": payloads},
+            headers={"Content-Type": "application/json"}
+        )
+        response.raise_for_status() # Raise an exception for bad status codes
+        print("→ POST /insert:", response.status_code, response.text)
+        if response.status_code == 200:
+            print("Upload complete ✅")
+            return True
+        else:
+            print(f"Failed to insert data. Status: {response.status_code}, Response: {response.text}")
+            return False
+    except requests.exceptions.RequestException as e:
+        print(f"An error occurred while calling the /insert API: {e}")
+        return False
+
+def summarize_with_context(query, model="DeepSeek-V3-0324"):
+    """Creates a query embedding, searches GitPasha, and summarizes."""
+    print(f"Creating embedding for query: '{query}'")
+    try:
+        query_vector = embeddings.embed_query(query)
+    except Exception as e:
+        print(f"Failed to create query embedding: {e}")
+        return None
+
+    print("Retrieving relevant context from GitPasha...")
+    try:
+        response = requests.post(
+            f"{GITPASHA_HOST}/search",
+            json={"vector": query_vector, "k": 4},
+            headers={"Content-Type": "application/json"}
+        )
+        response.raise_for_status()
+        print("→ POST /search:", response.status_code)
+        search_results = response.json()
+    except requests.exceptions.RequestException as e:
+        print(f"An error occurred while calling the /search API: {e}")
+        return None
+
+    if not search_results or 'results' not in search_results:
+        print("No relevant context found.")
+        return "Could not find any relevant context to generate a summary."
+
+    context = "\n\n".join([result['payload']['text_chunk'] for result in search_results['results']])
+
+    print("Generating final summary...")
+    try:
+        response = client.chat.completions.create(
+            model=model,
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant that summarizes documents based on the provided context."},
+                {"role": "user", "content": f"Based on the following context, please answer the question.\n\nContext:\n{context}\n\nQuestion: {query}"}
+            ]
+        )
+        return response.choices[0].message.content
+    except Exception as e:
+        print(f"An error occurred during final summarization: {e}")
+        return None
+
+def main():
+    parser = argparse.ArgumentParser(description="Summarize a PDF using a remote Serverless Vector Store + AI.")
+    parser.add_argument("pdf_path", help="Path to the PDF file.")
+    parser.add_argument("--query", help="Custom question about the PDF.", default="Summarize the key points of this document.")
+    args = parser.parse_args()
+
+    # 1. Extract text from the PDF
+    pdf_text = extract_text_from_pdf(args.pdf_path)
+    if not pdf_text:
+        print("Aborting due to empty text from PDF.")
+        return
+
+    # 2. Store the text chunks and their embeddings
+    if not store_text_chunks(pdf_text):
+        print("Aborting due to failure in storing document.")
+        return
+
+    # 3. Query, retrieve context, and summarize
+    summary = summarize_with_context(args.query)
+    if summary:
+        print("\n--- Contextual Summary ---")
+        print(summary)
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..6fcfbe7
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,6 @@
+openai
+PyPDF2
+python-dotenv
+langchain
+tiktoken
+requests
\ No newline at end of file
diff --git a/uploads/test1.pdf b/uploads/test1.pdf
new file mode 100644
index 0000000..351ab88
Binary files /dev/null and b/uploads/test1.pdf differ
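
A note on local setup: `app.py` reads `OPENAI_API_KEY` via `python-dotenv`, and `.gitignore` keeps a local `.env` out of the repository, so that file has to be created by hand before the README commands will run. A minimal sketch, assuming a standard `KEY=value` `.env` layout and a placeholder key value:

```bash
# Create a local .env next to app.py (ignored by git per .gitignore).
# The variable name comes from app.py; the value below is a placeholder.
echo 'OPENAI_API_KEY=your-ghaymah-api-key' > .env

# Install dependencies and run against the sample PDF committed in uploads/;
# --query overrides the default prompt defined in main().
pip install -r requirements.txt
python app.py uploads/test1.pdf --query "Summarize the key points of this document."
```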