This is part two of the three-part RAG series. In this part, we will set up a Python script that loads a dataset, embeds the text, and indexes the embeddings in OpenSearch.
Step 1: Bootstrap the Python Script
First, create a requirements.txt file with the following dependencies:
llama-index>=0.10.0
llama-index-readers-elasticsearch>=0.1.0
llama-index-vector-stores-opensearch>=0.1.0
llama-index-embeddings-ollama>=0.1.0
llama-index-embeddings-huggingface>=0.1.0
llama-index-embeddings-langchain>=0.1.0
langchain-huggingface>=0.0.9
langchain-community>=0.0.24
langchain>=0.1.9
ollama>=0.1.6
nest-asyncio>=1.6.0
torch>=2.2.0
transformers>=4.37.0
sentence-transformers>=2.4.0
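Optionally, create and activate a virtual environment first so these packages stay isolated from your system Python (standard tooling; the directory name .venv is just an example):
python -m venv .venv
source .venv/bin/activate  # on Windows: .venv\Scripts\activate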
Install the dependencies:
python -m pip install -r requirements.txt
Import the required modules:
import torch
from langchain_community.document_loaders import TextLoader
from langchain_huggingface import HuggingFaceEmbeddings
from llama_index.core import Document, VectorStoreIndex, StorageContext
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.vector_stores.opensearch import (
OpensearchVectorStore,
OpensearchVectorClient,
)
Step 2: Configure Hardware Acceleration
# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
Step 3: Load and Process the Dataset
We’ll use the State of the Union dataset for this example. Download it using the following command:
wget https://huggingface.co/datasets/rewoo/sotu_qa_2023/resolve/main/state_of_the_union.txt
Next, load the document and configure the text splitter:
# Load and prepare documents
loader = TextLoader("state_of_the_union.txt")
raw_documents = loader.load()
documents = [
Document(text=doc.page_content, doc_id=str(i))
for i, doc in enumerate(raw_documents)
]
# Configure text splitter
splitter = TokenTextSplitter(
chunk_size=512,
chunk_overlap=128,
separator=" ",
)
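If you want to see what the splitter produces before running it on the full document, you can try it on a short throwaway document first (illustrative only; the sample text is made up):
# Roughly 900 tokens of filler text, so it should split into a few overlapping nodes
sample = Document(text="The quick brown fox jumps over the lazy dog. " * 100)
sample_nodes = splitter.get_nodes_from_documents([sample])
print(f"{len(sample_nodes)} nodes, first node starts with: {sample_nodes[0].text[:50]!r}")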
Step 4: Configure Embedding Model
Split the documents into nodes, then set up the embedding model using Hugging Face's all-MiniLM-L6-v2. Embedding a sample query gives us the vector dimension, which OpenSearch will need when the index is created:
# Split documents into nodes
token_nodes = splitter.get_nodes_from_documents(
documents,
show_progress=True
)
# Configure embeddings
embedding_model = HuggingFaceEmbeddings(
model_name="all-MiniLM-L6-v2",
model_kwargs={'device': device}
)
embeddings = embedding_model.embed_query("box")
dim = len(embeddings)
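For all-MiniLM-L6-v2, the dimension should come out to 384. An explicit check makes that assumption visible and fails fast if you later swap the model (optional):
# all-MiniLM-L6-v2 produces 384-dimensional vectors
print(f"Embedding dimension: {dim}")
assert dim == 384, f"unexpected embedding dimension: {dim}"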
Step 5: Set Up the Hybrid Search Pipeline
Log in to OpenSearch Dashboards and create the search pipeline from Dev Tools. The pipeline normalizes the scores of the keyword and vector sub-queries with min-max normalization and combines them using a weighted harmonic mean:
PUT _search/pipeline/hybrid-search-pipeline
{
"description": "Pipeline for hybrid search",
"phase_results_processors": [
{
"normalization-processor": {
"normalization": {
"technique": "min_max"
},
"combination": {
"technique": "harmonic_mean",
"parameters": {
"weights": [
0.3,
0.7
]
}
}
}
}
]
}
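To confirm the pipeline was created, fetch it back in Dev Tools; the response should echo the definition above:
GET _search/pipeline/hybrid-search-pipeline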
Step 6: Configure OpenSearch
Set up OpenSearch configuration and initialize the client:
# OpenSearch configuration
opensearch_config = {
"endpoint": "https://localhost:9200",
"index": "test_pdf_index",
"text_field": "content_text",
"embedding_field": "embedding",
}
# Initialize OpenSearch client
client = OpensearchVectorClient(
endpoint=opensearch_config["endpoint"],
index=opensearch_config["index"],
dim=dim,
embedding_field=opensearch_config["embedding_field"],
text_field=opensearch_config["text_field"],
search_pipeline="hybrid-search-pipeline",
verify_certs=False,
http_auth=("admin", "admin"),
)
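Hard-coding admin/admin is convenient for a local demo, but you can read the credentials from environment variables instead and pass the resulting tuple as http_auth. This is just a sketch; the variable names OPENSEARCH_USER and OPENSEARCH_PASSWORD are examples, not something the libraries require:
import os

# Fall back to the demo credentials if the variables are not set
http_auth = (
    os.environ.get("OPENSEARCH_USER", "admin"),
    os.environ.get("OPENSEARCH_PASSWORD", "admin"),
)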
Step 7: Initialize Vector Store and Index
Create the vector store and index:
# Set up vector store and index
vector_store = OpensearchVectorStore(client)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex(
token_nodes,
storage_context=storage_context,
embed_model=embedding_model
)
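As a quick smoke test that the nodes were actually indexed, you can run a small retrieval against the new index. Proper querying is the topic of part three; the question below is only an example:
# Retrieve the two most similar chunks for a test question
retriever = index.as_retriever(similarity_top_k=2)
for result in retriever.retrieve("What did the president say about inflation?"):
    print(result.score, result.node.get_content()[:100])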
Complete Code Example
Below is the complete code for reference:
import torch
from langchain_community.document_loaders import TextLoader
from langchain_huggingface import HuggingFaceEmbeddings
from llama_index.core import Document, VectorStoreIndex, StorageContext
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.vector_stores.opensearch import (
OpensearchVectorStore,
OpensearchVectorClient,
)
# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# Load and prepare documents
loader = TextLoader("state_of_the_union.txt")
raw_documents = loader.load()
documents = [
Document(text=doc.page_content, doc_id=str(i))
for i, doc in enumerate(raw_documents)
]
# Configure text splitter
splitter = TokenTextSplitter(
chunk_size=512,
chunk_overlap=128,
separator=" ",
)
# Split documents into nodes
token_nodes = splitter.get_nodes_from_documents(
documents,
show_progress=True
)
# Configure embeddings
embedding_model = HuggingFaceEmbeddings(
model_name="all-MiniLM-L6-v2",
model_kwargs={'device': device}
)
embeddings = embedding_model.embed_query("box")
dim = len(embeddings)
# OpenSearch configuration
opensearch_config = {
"endpoint": "https://localhost:9200",
"index": "test_pdf_index",
"text_field": "content_text",
"embedding_field": "embedding",
}
# Initialize OpenSearch client
client = OpensearchVectorClient(
endpoint=opensearch_config["endpoint"],
index=opensearch_config["index"],
dim=dim,
embedding_field=opensearch_config["embedding_field"],
text_field=opensearch_config["text_field"],
search_pipeline="hybrid-search-pipeline",
verify_certs=False,
http_auth=("admin", "admin"),
)
# Set up vector store and index
vector_store = OpensearchVectorStore(client)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex(
token_nodes,
storage_context=storage_context,
embed_model=embedding_model
)
What’s Next?
In the next part, we will query OpenSearch and pass the retrieved results to a self-hosted LLM for further processing. Stay tuned!