Skip to content

IBM Db2 Integration with Graphbit

Overview

This guideline explains how to use IBM Db2 as a vector database within the Graphbit ecosystem, leveraging OpenAI embeddings. You will learn how to connect, store, and search data and vectors using IBM Db2's native capabilities.


Prerequisites

  • IBM Db2 12.1.2 installed and running
  • Python environment with ibm-db and graphbit installed:
    pip install ibm-db graphbit python-dotenv numpy
    
  • Environment variables for your configuration:
    OPENAI_API_KEY=<your-openai-api-key>
    DB2_DATABASE=<your-database-name>
    DB2_HOST=<your-host>
    DB2_PORT=<your-port>
    DB2_USERNAME=<your-username>
    DB2_PASSWORD=<your_password>
    

Step 1: Connect to IBM Db2 and Ensure Table Exists

import ibm_db
import os
import json
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Connect to IBM Db2
database = os.getenv("DB2_DATABASE", "your-database-name")
host = os.getenv("DB2_HOST", "your-host")
port = os.getenv("DB2_PORT", "your-port")
username = os.getenv("DB2_USERNAME", "your-username")
password = os.getenv("DB2_PASSWORD", "your_password")

conn_str = f"DATABASE={database};HOSTNAME={host};PORT={port};PROTOCOL=TCPIP;UID={username};PWD={password};"
connection = ibm_db.connect(conn_str, "", "")

# Try to connect to Db2
conn = ibm_db.connect(conn_str, "", "")
print(f"Connection successful!")

# Create table if it doesn't exist
table_name = "your-table-name"

# Drop the table before creating it again
drop_table_sql = f"DROP TABLE IF EXISTS {table_name}"
ibm_db.exec_immediate(conn, drop_table_sql)

create_table_sql = f"""
CREATE TABLE {table_name} (
    id VARCHAR(255) NOT NULL PRIMARY KEY,
    text_content CLOB,
    embedding BLOB,
    metadata CLOB
)
"""
try:
    ibm_db.exec_immediate(conn, create_table_sql)
    print(f"Table '{table_name}' created successfully.")
except Exception as e:
    print(f"Table creation error (may already exist): {str(e)}")

Step 2: Store and Search Vectors with OpenAI Embeddings

2.1. Generate and Store an Embedding

import json
from graphbit import EmbeddingConfig as gb_ecg
from graphbit import EmbeddingClient as gb_etc

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
embedding_config = gb_ecg.openai(OPENAI_API_KEY, "text-embedding-3-small")
embedding_client = gb_etc(embedding_config)

# Insert a single embedding
doc_text = "This is a sample document for vector search."
try:
    embedding = embedding_client.embed(doc_text)
    print(f"Generated embedding with {len(embedding)} dimensions")
    #print(f"Generated embedding: {embedding}") 

    # Convert embedding to bytes for BLOB storage
    embedding_bytes = struct.pack(f'{len(embedding)}f', *embedding)
    print(f"Embedding bytes length: {len(embedding_bytes)}")

    # Prepare metadata as JSON string
    metadata = {"category": "test", "source": "sample"}
    metadata_json = json.dumps(metadata)

    insert_sql = f"""
    INSERT INTO {table_name} (id, text_content, embedding, metadata)
    VALUES ('item_1', ?, ?, ?)
    """
    stmt = ibm_db.prepare(conn, insert_sql)
    ibm_db.bind_param(stmt, 1, doc_text)
    ibm_db.bind_param(stmt, 2, embedding_bytes)
    ibm_db.bind_param(stmt, 3, metadata_json)
    ibm_db.execute(stmt)
    print(f"Inserted embedding for item 'item123'.")
except Exception as e:
    print(f"Error inserting embedding: {str(e)}")

2.2. Batch embedding insert

batch_texts = [
    "Graph databases are great for relationships.",
    "Vector search enables semantic retrieval.",
    "OpenAI provides powerful embedding models.",
]

try:
    batch_embeddings = embedding_client.embed_many(batch_texts)
    print(f"Generated {len(batch_embeddings)} batch embeddings")
    # print(f"Generated batch embeddings: {batch_embeddings}") 

    for idx, (text, emb) in enumerate(zip(batch_texts, batch_embeddings)):
        # Convert embedding to bytes for BLOB storage
        embedding_bytes = struct.pack(f'{len(emb)}f', *emb)
        print(f"Embedding bytes length: {len(embedding_bytes)}")

        # Prepare metadata as JSON string
        metadata = {"text": text, "batch_id": "batch__{str(uuid.uuid4())}", "index": idx}
        metadata_json = json.dumps(metadata)

        insert_sql_batch = f"""
        INSERT INTO {table_name} (id, text_content, embedding, metadata)
        VALUES ('batch_{idx}', ?, ?, ?)
        """
        stmt_batch = ibm_db.prepare(conn, insert_sql_batch)
        ibm_db.bind_param(stmt_batch, 1, text)
        ibm_db.bind_param(stmt_batch, 2, embedding_bytes)
        ibm_db.bind_param(stmt_batch, 3, metadata_json)
        ibm_db.execute(stmt_batch)
        print(f"Inserted embedding for item 'batch_{idx}'.")
except Exception as e:
    print(f"Error inserting batch embeddings: {str(e)}")

2.3. Vector Similarity Search using Graphbit

query_text = "Find documents related to vector search."
query_embedding = embedding_client.embed(query_text)

# Get the stored metadata to regenerate embeddings
select_metadata_sql = f"""
SELECT id, text_content, metadata 
FROM {table_name}
"""
stmt_metadata = ibm_db.prepare(conn, select_metadata_sql)
ibm_db.execute(stmt_metadata)

stored_embeddings = []
while ibm_db.fetch_row(stmt_metadata):
    item_id = ibm_db.result(stmt_metadata, 0)
    text_content = ibm_db.result(stmt_metadata, 1)
    metadata = ibm_db.result(stmt_metadata, 2)

    print(f"Processing {item_id}: text_content = {text_content[:50]}...")

    if text_content:
        try:
            # Regenerate the embedding from the original text
            regenerated_embedding = embedding_client.embed(text_content)
            print(f"  Regenerated embedding: {len(regenerated_embedding)} dimensions")

            stored_embeddings.append((item_id, regenerated_embedding, metadata))
        except Exception as e:
            print(f"  Error regenerating embedding for {item_id}: {str(e)}")
    else:
        print(f"  No text content for {item_id}")

# Perform vector search on retrieved embeddings
print(f"\nPerforming vector search on {len(stored_embeddings)} retrieved embeddings...")

best_score = -1
best_item = None

for item_id, stored_embedding, metadata in stored_embeddings:
    try:
        # Calculate the similarity score between the query and the stored embedding
        score = gb_etc.similarity(query_embedding, stored_embedding)
        print(f"Similarity score for {item_id}: {score:.4f}")

        if score > best_score:
            best_score = score
            best_item = (item_id, metadata)
    except Exception as e:
        print(f"Error calculating similarity for {item_id}: {str(e)}")
        continue

# Print the most similar item
if best_item is not None:
    print(f"\nMost similar document: {best_item[0]} with score {best_score:.4f}")
    print(f"Metadata: {best_item[1]}")
else:
    print("No documents found in vector table.")

This connector pattern enables you to use IBM Db2 as a vector database in your AI workflows, orchestrated by Graphbit.