diff --git a/vector_store_integration/RAG_USING_WEAVIATE.ipynb b/vector_store_integration/RAG_USING_WEAVIATE.ipynb new file mode 100644 index 0000000..8beb4ad --- /dev/null +++ b/vector_store_integration/RAG_USING_WEAVIATE.ipynb @@ -0,0 +1,236 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# **End-to-End RAG Tutorial Using Salesforce, Airbyte Cloud, Weaviate, and LangChain**\n", + "This notebook illustrates the complete setup of a Retrieval-Augmented Generation (RAG) pipeline.
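\n", + "We assume Salesforce data has already been synced into a Weaviate vector store with Airbyte Cloud; this notebook then embeds the user's question with LangChain's OpenAI embeddings, retrieves the closest records from Weaviate, and asks OpenAI to answer from them.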
\n", + "## **Prerequisites**\n", + "**1) OpenAI API Key**:\n", + " - **Create an OpenAI Account**: Sign up for an account on [OpenAI](https://www.openai.com/).\n", + " - **Generate an API Key**: Go to the API section and generate a new API key. For detailed instructions, refer to the [OpenAI documentation](https://beta.openai.com/docs/quickstart).\n", + "\n", + "**2) Weaviate Cluster's Public URL and API Key**:\n", + "\n", + " - **Get your URL and Key**: Click on your cluster's drop-down button. Visit [this](https://console.weaviate.cloud/).\n" + ], + "metadata": { + "id": "sW7Fx5ilH2jK" + } + }, + { + "cell_type": "markdown", + "source": [ + "# **Installing Dependencies**\n" + ], + "metadata": { + "id": "oFw9gVa9JKDH" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zAztxTpIBiMF" + }, + "outputs": [], + "source": [ + "# Add virtual environment support if needed\n", + "!apt-get install -qq python3.10-venv\n", + "\n", + "# Install required packages.\n", + "# weaviate-client is capped below 4.10 because this notebook uses the v3-style\n", + "# `weaviate.Client(...)` / `client.query.get(...)` API, which newer 4.x releases\n", + "# are documented to have removed.\n", + "%pip install --quiet openai langchain-openai tiktoken pandas \"weaviate-client<4.10\" langchain-weaviate langchain-community\n" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# **Set Up Environment Variables**" + ], + "metadata": { + "id": "wH5CoP0WJfjd" + } + }, + { + "cell_type": "code", + "source": [ + "import os\n", + "from getpass import getpass\n", + "\n", + "# Prompt only for values that are missing from the environment, so real\n", + "# credentials are never overwritten by placeholders or saved in the notebook.\n", + "for var_name in (\"WEAVIATE_URL\", \"WEAVIATE_API_KEY\", \"OPENAI_API_KEY\"):\n", + "    if not os.environ.get(var_name):\n", + "        os.environ[var_name] = getpass(f\"Enter {var_name}: \")" + ], + "metadata": { + "id": "JDqAdjZTCbeB" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# **Initialize Weaviate Vector Store**" + ], + "metadata": { + "id": "jGEqM1-RJlTE" + } + }, + { + "cell_type": "code", + "source": [ + "import weaviate\n", + "from weaviate.auth import AuthApiKey\n", + "from langchain_weaviate.vectorstores import WeaviateVectorStore\n", + "from langchain_community.document_loaders import TextLoader\n", + "from langchain_openai import OpenAIEmbeddings\n", + "from langchain_text_splitters import CharacterTextSplitter\n", + "import pandas as pd\n", + "\n", + "# Connect to Weaviate with API key\n", + "auth_config = AuthApiKey(api_key=os.getenv(\"WEAVIATE_API_KEY\"))\n", + "\n", + "try:\n", + "    weaviate_client = weaviate.Client(\n", + "        url=os.getenv(\"WEAVIATE_URL\"),\n", + "        auth_client_secret=auth_config,\n", + "    )\n", + "except Exception as e:\n", + "    print(f\"Error connecting to Weaviate: {e}\", flush=True)\n", + "    # Stop here: later cells need `weaviate_client`, which would otherwise be undefined.\n", + "    raise\n", + "else:\n", + "    print(\"Successfully connected to Weaviate\", flush=True)\n" + ], + "metadata": { + "id": "XnzOsbFaCepg" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# **Embedding and similarity search with Weaviate**\n", + "Here we will convert the user's query into embeddings using OpenAI and retrieve similar chunks from Weaviate based on the query.
\n", + "### Note: Change collection and property according to your own requirement!" + ], + "metadata": { + "id": "EPEEwKcgJtI_" + } + }, + { + "cell_type": "code", + "source": [ + "from typing import List, Optional\n", + "\n", + "from langchain_openai import OpenAIEmbeddings\n", + "\n", + "# Initialize OpenAI client for embeddings\n", + "openai_embeddings = OpenAIEmbeddings(openai_api_key=os.getenv(\"OPENAI_API_KEY\"))\n", + "\n", + "\n", + "def get_embedding_from_openai(query) -> List[float]:\n", + "    \"\"\"Return the vector that `openai_embeddings.embed_query` produces for `query`.\"\"\"\n", + "    return openai_embeddings.embed_query(query)\n", + "\n", + "\n", + "# Weaviate collection to search, and the property whose value is returned as a chunk.\n", + "# NOTE(review): `property` shadows the Python builtin; the name is kept because the\n", + "# surrounding text tells readers to edit it.\n", + "collection = \"Lead\"\n", + "property = \"name\"\n", + "\n", + "\n", + "def get_similar_chunks_from_weaviate(query: str, limit: Optional[int] = None) -> List[str]:\n", + "    \"\"\"Return the `property` values of the objects in `collection` nearest to `query`.\n", + "\n", + "    Args:\n", + "        query: Text to embed and use for the near-vector search.\n", + "        limit: Maximum number of objects to request; None sends no explicit limit.\n", + "\n", + "    Returns:\n", + "        The non-empty property values, or [] when the query fails or the response\n", + "        does not contain the collection. Errors are printed, never raised.\n", + "    \"\"\"\n", + "    try:\n", + "        embedding = get_embedding_from_openai(query)\n", + "        request = weaviate_client.query.get(collection, [property]).with_near_vector({\"vector\": embedding})\n", + "        if limit is not None:\n", + "            request = request.with_limit(limit)\n", + "        result = request.do()\n", + "\n", + "        # The collection entry may be missing or None (e.g. when the response carries\n", + "        # `errors`); print the whole payload so the real cause is visible.\n", + "        matches = ((result.get(\"data\") or {}).get(\"Get\") or {}).get(collection)\n", + "        if matches is None:\n", + "            print(\"Unexpected result format:\", result, flush=True)\n", + "            return []\n", + "        # Skip objects whose property is missing or empty so None never reaches the caller.\n", + "        return [match[property] for match in matches if match.get(property)]\n", + "    except Exception as e:\n", + "        print(f\"Error during Weaviate query: {e}\", flush=True)\n", + "        return []\n" + ], + "metadata": { + "id": "8SpJVDx4D23z" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# **Building RAG Pipeline and asking a question**\n", + "Finally we use OpenAI for querying our data!
\n", + "The three main steps of a RAG pipeline are:
\n", + "- Embed the incoming query
\n", + "- Run a similarity search to find matching chunks
\n", + "- Send chunks to LLM for completion" + ], + "metadata": { + "id": "ubPGiUFSKBhF" + } + }, + { + "cell_type": "code", + "source": [ + "from typing import List\n", + "from openai import OpenAI\n", + "\n", + "client = OpenAI(api_key=os.getenv(\"OPENAI_API_KEY\"))\n", + "\n", + "\n", + "def get_completion_from_openai(question, document_chunks: List[str], model_name=\"gpt-3.5-turbo\", max_tokens=150):\n", + "    \"\"\"Ask the chat model to answer `question` using `document_chunks` as context.\n", + "\n", + "    Args:\n", + "        question: The user's question.\n", + "        document_chunks: Retrieved text snippets; empty or None entries are skipped.\n", + "        model_name: Model name passed to the chat completions API.\n", + "        max_tokens: Passed as `max_tokens` to the chat completions API.\n", + "\n", + "    Returns:\n", + "        The stripped answer text, or a fixed error message if the request fails.\n", + "    \"\"\"\n", + "    # Build the context defensively: a None or non-string chunk would make a plain\n", + "    # str.join raise before the request is even attempted.\n", + "    chunks = \"\\n\\n\".join(str(chunk) for chunk in document_chunks if chunk)\n", + "\n", + "    try:\n", + "        completion = client.chat.completions.create(\n", + "            model=model_name,\n", + "            messages=[\n", + "                {\"role\": \"system\", \"content\": \"You are an assistant. Answer the question based on the context. Do not use any other information. Be concise.\"},\n", + "                {\"role\": \"user\", \"content\": f\"Context:\\n{chunks}\\n\\n{question}\\n\\nAnswer:\"}\n", + "            ],\n", + "            max_tokens=max_tokens\n", + "        )\n", + "        return completion.choices[0].message.content.strip()\n", + "    except Exception as e:\n", + "        print(f\"Error during OpenAI completion: {e}\", flush=True)\n", + "        return \"There was an error generating the response.\"\n", + "\n", + "\n", + "def get_response(query, model_name=\"gpt-3.5-turbo\"):\n", + "    \"\"\"Retrieve chunks for `query` and return the model's answer, or an apology if none are found.\"\"\"\n", + "    chunks = get_similar_chunks_from_weaviate(query)\n", + "    if not chunks:\n", + "        return \"I am sorry, I do not have the context to answer your question.\"\n", + "    return get_completion_from_openai(query, chunks, model_name)\n", + "\n", + "\n", + "# Ask a question\n", + "query = 'How many lead work in BNY?'\n", + "response = get_response(query)\n", + "\n", + "print(f\"\\n\\nResponse from LLM:\\n\\n{response}\", flush=True)\n" + ], + "metadata": { + "id": "Qu1C3e1TE2YL" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file