
Commit 36ffef4: first commit (0 parents)

7 files changed: +262 -0 lines changed

.gitignore (+2)

```
.venv/
__pycache__/
```

chat.py (+65)

```python
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_chroma import Chroma
from models import Models

# Initialize the models
models = Models()
embeddings = models.embeddings_ollama
llm = models.model_ollama

# Initialize the vector store
vector_store = Chroma(
    collection_name="documents",
    embedding_function=embeddings,
    persist_directory="./db/chroma_langchain_db",  # Where to save data locally
)

# Define the chat prompt
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an assistant. Your task is to answer questions based on the provided documents. "
            "If the data is insufficient, you can supplement your answer with your own knowledge. "
            "However, prioritize the provided data for accuracy."
        ),
        ("human", "Question: {input}\n\nContext: {context}"),
    ]
)

# Define the retrieval chain
retriever = vector_store.as_retriever(search_kwargs={"k": 10})  # Retrieve top 10 documents
combine_docs_chain = create_stuff_documents_chain(llm, prompt)
retrieval_chain = create_retrieval_chain(retriever, combine_docs_chain)

# Main loop
def main():
    while True:
        query = input("User (or type 'q', 'quit', or 'exit' to end): ")
        if query.lower() in ['q', 'quit', 'exit']:
            break

        result = retrieval_chain.invoke({"input": query})

        # Check if RAG is used
        if result["context"]:
            print("\nAssistant is using Knowledge RAG to answer the question.")
            # Display the source documents or chunks used
            print("\nSources used for the answer:")
            for i, doc in enumerate(result["context"]):
                print(f"Source {i+1}:")
                print(f"Document: {doc.metadata.get('source', 'Unknown')}")
                print(f"Chunk: {doc.metadata.get('chunk_id', 'Unknown')}")
                print(f"Content: {doc.page_content[:200]}...")  # First 200 characters of the chunk
                print("-" * 50)
        else:
            print("\nAssistant is answering based on its own knowledge (no RAG).")

        # Display the answer
        print("\nAssistant: ", result["answer"], "\n\n")

# Run the main loop
if __name__ == "__main__":
    main()
```
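For a quick one-off check without the interactive loop, the chain can be invoked directly. A minimal sketch, assuming `ingest.py` has already populated the vector store; the question text is just a placeholder:

```python
# Sanity check for the retrieval chain (run after ingest.py has populated
# ./db/chroma_langchain_db). The question below is a placeholder.
from chat import retrieval_chain

result = retrieval_chain.invoke({"input": "What is sqlmap used for?"})
print(f"Retrieved {len(result['context'])} chunks")
print(result["answer"])
```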

html-pdf.py (+41)

```python
import os
import asyncio
from pyppeteer import launch

# List of URLs to convert to PDF
urls = [
    "https://github.com/sqlmapproject/sqlmap",
]

# Create the ./data directory if it does not exist
if not os.path.exists('./data'):
    os.makedirs('./data')

async def save_page_as_pdf(url, output_path):
    # Point to an already installed Chromium/Chrome binary
    browser = await launch(
        headless=True,
        executablePath='E:/Kuliah/Akademik/TA/RAG/chrome/win64-133.0.6943.126/chrome-win64/chrome.exe'  # Replace with your Chromium path
    )
    page = await browser.newPage()

    # Navigate to the URL
    await page.goto(url, {'waitUntil': 'networkidle2'})

    # Save the page as a PDF
    await page.pdf({'path': output_path, 'format': 'A4'})

    # Close the browser
    await browser.close()

async def main():
    for i, url in enumerate(urls):
        # Build the PDF file name
        output_path = f'./data/page_{i+1}.pdf'

        # Convert the web page to a PDF
        await save_page_as_pdf(url, output_path)
        print(f'Saved {url} to {output_path}')

# Run the main function
asyncio.run(main())
```
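If no local Chrome build is available, `launch()` can also be called without `executablePath`, in which case pyppeteer downloads and uses its own bundled Chromium on first run (network access required). A minimal sketch of that variant, using a placeholder URL:

```python
# Variant of save_page_as_pdf that lets pyppeteer manage Chromium itself.
# On first run, pyppeteer downloads a compatible Chromium build.
import asyncio
from pyppeteer import launch

async def save_with_bundled_chromium(url, output_path):
    browser = await launch(headless=True)  # no executablePath: use bundled Chromium
    page = await browser.newPage()
    await page.goto(url, {'waitUntil': 'networkidle2'})
    await page.pdf({'path': output_path, 'format': 'A4'})
    await browser.close()

asyncio.run(save_with_bundled_chromium("https://example.com", "./data/example.pdf"))
```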

ingest.py (+67)

```python
import os
import time
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from uuid import uuid4
from models import Models

load_dotenv()

# Initialize the models
models = Models()
embeddings = models.embeddings_ollama
llm = models.model_ollama

# Define constants
data_folder = "./data"
chunk_size = 1000
chunk_overlap = 50
check_interval = 10

# Chroma vector store
vector_store = Chroma(
    collection_name="documents",
    embedding_function=embeddings,
    persist_directory="./db/chroma_langchain_db",  # Where to save data locally
)

def ingest_file(file_path):
    if not file_path.lower().endswith('.pdf'):
        print(f"Skipping non-PDF file: {file_path}")
        return

    print(f"Starting to ingest file: {file_path}")
    loader = PyPDFLoader(file_path)
    loaded_documents = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, separators=["\n", " ", ""]
    )

    documents = text_splitter.split_documents(loaded_documents)
    uuids = [str(uuid4()) for _ in range(len(documents))]
    for doc, uuid in zip(documents, uuids):
        doc.metadata["source"] = os.path.basename(file_path)  # Store the file name
        doc.metadata["chunk_id"] = uuid  # Store the chunk ID

    print(f"Adding {len(documents)} documents to the vector store")
    vector_store.add_documents(documents=documents, ids=uuids)
    print(f"Finished ingesting file: {file_path}")

# Main loop
def main_loop():
    while True:
        for filename in os.listdir(data_folder):
            if not filename.startswith("_"):
                file_path = os.path.join(data_folder, filename)
                ingest_file(file_path)
                new_filename = "_" + filename
                new_file_path = os.path.join(data_folder, new_filename)
                os.rename(file_path, new_file_path)

        time.sleep(check_interval)  # Check the folder every 10 seconds

# Run the main loop
if __name__ == "__main__":
    main_loop()
```
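To verify that ingestion worked, the Chroma store can be queried directly with a similarity search, bypassing the LLM entirely. A minimal sketch, assuming some PDFs have already been ingested; the query string is a placeholder:

```python
# Quick check that chunks landed in the vector store.
from langchain_chroma import Chroma
from models import Models

embeddings = Models().embeddings_ollama
store = Chroma(
    collection_name="documents",
    embedding_function=embeddings,
    persist_directory="./db/chroma_langchain_db",
)

# Placeholder query; replace with something covered by your documents.
for doc in store.similarity_search("sql injection", k=3):
    print(doc.metadata.get("source"), doc.metadata.get("chunk_id"))
    print(doc.page_content[:100], "...")
```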

models.py (+18)

```python
import os
from langchain_ollama import OllamaEmbeddings, ChatOllama

class Models:
    def __init__(self):
        """Initialize local models served by Ollama."""

        # WSL IP (replace with the IP found in step 1)
        ollama_host = "http://127.0.0.1:11434"

        # Ollama embedding model
        self.embeddings_ollama = OllamaEmbeddings(model="nomic-embed-text", base_url=ollama_host)

        # Ollama LLM
        self.model_ollama = ChatOllama(model="qwen2.5", temperature=0, base_url=ollama_host)

# Example usage
local_model = Models()
```
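A quick smoke test for both models, assuming the Ollama server is running and the models have been pulled (for example with `ollama pull nomic-embed-text` and `ollama pull qwen2.5`):

```python
# Smoke test for the Ollama-backed models.
from models import Models

m = Models()

# Embedding model: embed_query returns a list of floats.
vector = m.embeddings_ollama.embed_query("hello world")
print(f"Embedding dimension: {len(vector)}")

# Chat model: invoke returns an AIMessage; .content holds the text.
reply = m.model_ollama.invoke("Say hi in one word.")
print(reply.content)
```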

readme.md (+59)

````markdown
# RAG with LangChain and Ollama

This project is an implementation of **Retrieval-Augmented Generation (RAG)** using **LangChain**, **ChromaDB**, and **Ollama** to improve answer accuracy in a Large Language Model (LLM) system. The system retrieves relevant documents from the vector database and uses them to answer user questions.

## Features
- **Integration with Ollama** as the LLM backend.
- **ChromaDB** as vector storage for documents.
- **RAG pipeline** for document retrieval and processing.
- **PDF ingestion support** to add documents to the vector database.
- **Interactive system** for answering user queries.

## Installation
### 1. Requirements
Ensure Python is installed (version 3.8+ recommended).

### 2. Clone the Repository
```bash
git clone https://github.com/username/rag-langchain-ollama.git
cd rag-langchain-ollama
```

### 3. Install Dependencies
```bash
pip install -r requirements.txt
```

### 4. Run Ollama
Ensure the **Ollama** server is running locally (for example, inside WSL):
```bash
ollama serve
```

## Usage
### 1. Run the Ingestor to Add Documents
```bash
python ingest.py
```
PDF files in the `data/` folder will be processed and stored in **ChromaDB**.

### 2. Run the Interactive Chatbot
```bash
python chat.py
```
Use this chatbot to ask questions based on the indexed documents.

## Project Structure
```
├── data/             # Folder for PDF documents
├── db/               # ChromaDB storage folder
├── models.py         # Ollama models used
├── ingest.py         # Script for processing documents
├── html-pdf.py       # Converts web pages to PDF for ingestion
├── chat.py           # Interactive chatbot
├── requirements.txt  # List of dependencies
└── readme.md         # Project documentation
```

## License
This project is licensed under the MIT License. Feel free to use and modify it as needed.
````

requirements.txt (+10)

```
langchain-core
langchain
langchain-chroma
langchain-community
langchain-text-splitters
langchain-ollama
python-dotenv
pypdf
pyppeteer
cryptography>=3.1
```
