
Commit 36ffef4: first commit (0 parents)

7 files changed: +262 -0 lines changed

.gitignore (+2)

```
.venv/
__pycache__/
```

chat.py (+65)

```python
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_chroma import Chroma
from models import Models

# Initialize the models
models = Models()
embeddings = models.embeddings_ollama
llm = models.model_ollama

# Initialize the vector store
vector_store = Chroma(
    collection_name="documents",
    embedding_function=embeddings,
    persist_directory="./db/chroma_langchain_db",  # Where to save data locally
)

# Define the chat prompt
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an assistant. Your task is to answer questions based on the provided documents. "
            "If the data is insufficient, you can supplement your answer with your own knowledge. "
            "However, prioritize the provided data for accuracy."
        ),
        ("human", "Question: {input}\n\nContext: {context}"),
    ]
)

# Define the retrieval chain
retriever = vector_store.as_retriever(search_kwargs={"k": 10})  # Retrieve top 10 documents
combine_docs_chain = create_stuff_documents_chain(llm, prompt)
retrieval_chain = create_retrieval_chain(retriever, combine_docs_chain)

# Main loop
def main():
    while True:
        query = input("User (or type 'q', 'quit', or 'exit' to end): ")
        if query.lower() in ['q', 'quit', 'exit']:
            break

        result = retrieval_chain.invoke({"input": query})

        # Check if RAG is used
        if result["context"]:
            print("\nAssistant is using Knowledge RAG to answer the question.")
            # Display the source documents or chunks used
            print("\nSources used for the answer:")
            for i, doc in enumerate(result["context"]):
                print(f"Source {i+1}:")
                print(f"Document: {doc.metadata.get('source', 'Unknown')}")
                print(f"Chunk: {doc.metadata.get('chunk_id', 'Unknown')}")
                print(f"Content: {doc.page_content[:200]}...")  # First 200 characters of the chunk
                print("-" * 50)
        else:
            print("\nAssistant is answering based on its own knowledge (no RAG).")

        # Display the answer
        print("\nAssistant: ", result["answer"], "\n\n")

# Run the main loop
if __name__ == "__main__":
    main()
```
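For a quick one-off check without the interactive loop, the chain can be invoked directly. A minimal sketch, assuming `ingest.py` has already populated the vector store; the question text is just a placeholder:

```python
# Sanity check for the retrieval chain (run after ingest.py has populated
# ./db/chroma_langchain_db). The question below is a placeholder.
from chat import retrieval_chain

result = retrieval_chain.invoke({"input": "What is sqlmap used for?"})
print(f"Retrieved {len(result['context'])} chunks")
print(result["answer"])
```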

html-pdf.py (+41)

```python
import os
import asyncio
from pyppeteer import launch

# List of URLs to convert to PDF
urls = [
    "https://github.com/sqlmapproject/sqlmap",
]

# Create the ./data directory if it does not exist
if not os.path.exists('./data'):
    os.makedirs('./data')

async def save_page_as_pdf(url, output_path):
    # Point to an already installed Chromium/Chrome binary
    browser = await launch(
        headless=True,
        executablePath='E:/Kuliah/Akademik/TA/RAG/chrome/win64-133.0.6943.126/chrome-win64/chrome.exe'  # Replace with your Chromium path
    )
    page = await browser.newPage()

    # Navigate to the URL
    await page.goto(url, {'waitUntil': 'networkidle2'})

    # Save the page as a PDF
    await page.pdf({'path': output_path, 'format': 'A4'})

    # Close the browser
    await browser.close()

async def main():
    for i, url in enumerate(urls):
        # Build the PDF file name
        output_path = f'./data/page_{i+1}.pdf'

        # Convert the web page to a PDF
        await save_page_as_pdf(url, output_path)
        print(f'Saved {url} to {output_path}')

# Run the main function
asyncio.run(main())
```
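If no local Chrome build is available, `launch()` can also be called without `executablePath`, in which case pyppeteer downloads and uses its own bundled Chromium on first run (network access required). A minimal sketch of that variant, using a placeholder URL:

```python
# Variant of save_page_as_pdf that lets pyppeteer manage Chromium itself.
# On first run, pyppeteer downloads a compatible Chromium build.
import asyncio
from pyppeteer import launch

async def save_with_bundled_chromium(url, output_path):
    browser = await launch(headless=True)  # no executablePath: use bundled Chromium
    page = await browser.newPage()
    await page.goto(url, {'waitUntil': 'networkidle2'})
    await page.pdf({'path': output_path, 'format': 'A4'})
    await browser.close()

asyncio.run(save_with_bundled_chromium("https://example.com", "./data/example.pdf"))
```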

ingest.py (+67)

```python
import os
import time
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from uuid import uuid4
from models import Models

load_dotenv()

# Initialize the models
models = Models()
embeddings = models.embeddings_ollama
llm = models.model_ollama

# Define constants
data_folder = "./data"
chunk_size = 1000
chunk_overlap = 50
check_interval = 10

# Chroma vector store
vector_store = Chroma(
    collection_name="documents",
    embedding_function=embeddings,
    persist_directory="./db/chroma_langchain_db",  # Where to save data locally
)

def ingest_file(file_path):
    if not file_path.lower().endswith('.pdf'):
        print(f"Skipping non-PDF file: {file_path}")
        return

    print(f"Starting to ingest file: {file_path}")
    loader = PyPDFLoader(file_path)
    loaded_documents = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, separators=["\n", " ", ""]
    )

    documents = text_splitter.split_documents(loaded_documents)
    uuids = [str(uuid4()) for _ in range(len(documents))]
    for doc, uuid in zip(documents, uuids):
        doc.metadata["source"] = os.path.basename(file_path)  # Store the file name
        doc.metadata["chunk_id"] = uuid  # Store the chunk ID

    print(f"Adding {len(documents)} documents to the vector store")
    vector_store.add_documents(documents=documents, ids=uuids)
    print(f"Finished ingesting file: {file_path}")

# Main loop
def main_loop():
    while True:
        for filename in os.listdir(data_folder):
            if not filename.startswith("_"):
                file_path = os.path.join(data_folder, filename)
                ingest_file(file_path)
                new_filename = "_" + filename
                new_file_path = os.path.join(data_folder, new_filename)
                os.rename(file_path, new_file_path)

        time.sleep(check_interval)  # Check the folder every 10 seconds

# Run the main loop
if __name__ == "__main__":
    main_loop()
```
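To verify that ingestion worked, the Chroma store can be queried directly with a similarity search, bypassing the LLM entirely. A minimal sketch, assuming some PDFs have already been ingested; the query string is a placeholder:

```python
# Quick check that chunks landed in the vector store.
from langchain_chroma import Chroma
from models import Models

embeddings = Models().embeddings_ollama
store = Chroma(
    collection_name="documents",
    embedding_function=embeddings,
    persist_directory="./db/chroma_langchain_db",
)

# Placeholder query; replace with something covered by your documents.
for doc in store.similarity_search("sql injection", k=3):
    print(doc.metadata.get("source"), doc.metadata.get("chunk_id"))
    print(doc.page_content[:100], "...")
```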

models.py (+18)

```python
import os
from langchain_ollama import OllamaEmbeddings, ChatOllama

class Models:
    def __init__(self):
        """Initialize local models served by Ollama."""

        # WSL IP (replace with the IP found in step 1)
        ollama_host = "http://127.0.0.1:11434"

        # Ollama embedding model
        self.embeddings_ollama = OllamaEmbeddings(model="nomic-embed-text", base_url=ollama_host)

        # Ollama LLM
        self.model_ollama = ChatOllama(model="qwen2.5", temperature=0, base_url=ollama_host)

# Example usage
local_model = Models()
```
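A quick smoke test for both models, assuming the Ollama server is running and the models have been pulled (for example with `ollama pull nomic-embed-text` and `ollama pull qwen2.5`):

```python
# Smoke test for the Ollama-backed models.
from models import Models

m = Models()

# Embedding model: embed_query returns a list of floats.
vector = m.embeddings_ollama.embed_query("hello world")
print(f"Embedding dimension: {len(vector)}")

# Chat model: invoke returns an AIMessage; .content holds the text.
reply = m.model_ollama.invoke("Say hi in one word.")
print(reply.content)
```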

readme.md (+59)

````markdown
# RAG with LangChain and Ollama

This project is an implementation of **Retrieval-Augmented Generation (RAG)** using **LangChain**, **ChromaDB**, and **Ollama** to improve answer accuracy in a Large Language Model (LLM) system. The system retrieves relevant documents from the vector database and uses them to answer user questions.

## Features
- **Integration with Ollama** as the LLM backend.
- **ChromaDB** as vector storage for documents.
- **RAG pipeline** for document retrieval and processing.
- **PDF ingestion support** to add documents to the vector database.
- **Interactive system** for answering user queries.

## Installation
### 1. Requirements
Ensure Python is installed (version 3.8+ recommended).

### 2. Clone the Repository
```bash
git clone https://github.com/username/rag-langchain-ollama.git
cd rag-langchain-ollama
```

### 3. Install Dependencies
```bash
pip install -r requirements.txt
```

### 4. Run Ollama
Ensure the **Ollama** server is running locally (for example, inside WSL):
```bash
ollama serve
```

## Usage
### 1. Run the Ingestor to Add Documents
```bash
python ingest.py
```
PDF files in the `data/` folder will be processed and stored in **ChromaDB**.

### 2. Run the Interactive Chatbot
```bash
python chat.py
```
Use this chatbot to ask questions based on the indexed documents.

## Project Structure
```
├── data/             # Folder for PDF documents
├── db/               # ChromaDB storage folder
├── models.py         # Ollama models used
├── ingest.py         # Script for processing documents
├── html-pdf.py       # Converts web pages to PDF for ingestion
├── chat.py           # Interactive chatbot
├── requirements.txt  # List of dependencies
└── readme.md         # Project documentation
```

## License
This project is licensed under the MIT License. Feel free to use and modify it as needed.
````

requirements.txt (+10)

```
langchain-core
langchain
langchain-chroma
langchain-community
langchain-text-splitters
langchain-ollama
python-dotenv
pypdf
pyppeteer
cryptography>=3.1
```
