Semantic Search and Analysis¶
This example demonstrates how to build a semantic search and analysis system using GraphBit's embedding capabilities and LLM integration.
Overview¶
We'll create a system that:

1. Embeds text documents for semantic search
2. Searches for semantically similar content
3. Analyzes results with LLM insights
4. Compares multiple documents for similarity
5. Generates intelligent summaries
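Under the hood, semantic search ranks documents by the similarity of their embedding vectors rather than by keyword overlap. The example below delegates this to EmbeddingClient.similarity; purely for illustration (and assuming a cosine-style metric, which is typical for text embeddings), the core computation can be sketched in plain Python:

import math
from typing import List

def cosine_similarity(a: List[float], b: List[float]) -> float:
    """Cosine similarity between two embedding vectors (1.0 = same direction)."""
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(x * x for x in b))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot / (norm_a * norm_b)

This standalone sketch is not GraphBit's implementation; it only shows what "similarity" means in the steps above.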
Complete Example¶
from graphbit import init, EmbeddingConfig, EmbeddingClient, LlmConfig, LlmClient
import os
from typing import List, Dict
class SemanticSearchSystem:
def __init__(self, openai_api_key: str):
"""Initialize the semantic search system."""
# Initialize GraphBit
init(enable_tracing=True)
# Configure embeddings
self.embedding_config = EmbeddingConfig.openai(
api_key=openai_api_key,
model="text-embedding-3-small"
)
self.embedding_client = EmbeddingClient(self.embedding_config)
# Configure LLM for analysis
self.llm_config = LlmConfig.openai(
api_key=openai_api_key,
model="gpt-4o-mini"
)
self.llm_client = LlmClient(self.llm_config)
# Document storage
self.documents = []
self.embeddings = []
self.document_index = {}
def add_documents(self, documents: List[Dict[str, str]]):
"""Add documents to the search index."""
print(f"Adding {len(documents)} documents to index...")
# Extract text for embedding
texts = [doc['content'] for doc in documents]
# Generate embeddings in batch
embeddings = self.embedding_client.embed_many(texts)
# Store documents and embeddings
start_idx = len(self.documents)
for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
doc_id = start_idx + i
self.documents.append(doc)
self.embeddings.append(embedding)
self.document_index[doc_id] = {
'title': doc.get('title', f'Document {doc_id}'),
'content_preview': doc['content'][:200] + '...',
'metadata': doc.get('metadata', {})
}
print(f"Added {len(documents)} documents. Total: {len(self.documents)}")
def search(self, query: str, top_k: int = 5) -> List[Dict]:
"""Search for semantically similar documents."""
if not self.documents:
print("No documents in index")
return []
print(f"🔍 Searching for: '{query}'")
# Embed the query
query_embedding = self.embedding_client.embed(query)
# Calculate similarities
similarities = []
for i, doc_embedding in enumerate(self.embeddings):
similarity = EmbeddingClient.similarity(
query_embedding,
doc_embedding
)
similarities.append({
'doc_id': i,
'similarity': similarity,
'title': self.document_index[i]['title'],
'content_preview': self.document_index[i]['content_preview'],
'metadata': self.document_index[i]['metadata']
})
# Sort by similarity and return top-k
similarities.sort(key=lambda x: x['similarity'], reverse=True)
return similarities[:top_k]
def analyze_search_results(self, query: str, results: List[Dict]) -> str:
"""Analyze search results with LLM insights."""
if not results:
return "No results to analyze."
print("🤖 Analyzing search results with LLM...")
# Prepare context for LLM
results_text = "\n\n".join([
f"Document {i+1}: {result['title']}\n"
f"Similarity: {result['similarity']:.3f}\n"
f"Preview: {result['content_preview']}"
for i, result in enumerate(results)
])
prompt = f"""Analyze these search results for the query: "{query}"
Search Results:
{results_text}
Provide:
1. Summary of what the results reveal about the query
2. Key themes and patterns across the documents
3. Quality assessment of the search matches
4. Recommendations for refining the search or exploring related topics
5. Most relevant documents and why
Be insightful and practical in your analysis.
"""
try:
analysis = self.llm_client.complete(prompt)
return analysis
except Exception as e:
return f"Analysis failed: {str(e)}"
def compare_documents(self, doc_ids: List[int]) -> Dict:
"""Compare multiple documents for similarity."""
if len(doc_ids) < 2:
return {"error": "Need at least 2 documents to compare"}
print(f"Comparing {len(doc_ids)} documents...")
        # Get embeddings for the specified documents
        selected_embeddings = [self.embeddings[doc_id] for doc_id in doc_ids]
# Calculate pairwise similarities
comparisons = []
for i in range(len(doc_ids)):
for j in range(i + 1, len(doc_ids)):
similarity = EmbeddingClient.similarity(
selected_embeddings[i],
selected_embeddings[j]
)
comparisons.append({
'doc1_id': doc_ids[i],
'doc2_id': doc_ids[j],
'doc1_title': self.document_index[doc_ids[i]]['title'],
'doc2_title': self.document_index[doc_ids[j]]['title'],
'similarity': similarity
})
# Sort by similarity
comparisons.sort(key=lambda x: x['similarity'], reverse=True)
return {
'comparisons': comparisons,
'most_similar': comparisons[0] if comparisons else None,
'least_similar': comparisons[-1] if comparisons else None,
'average_similarity': sum(c['similarity'] for c in comparisons) / len(comparisons) if comparisons else 0
}
def generate_document_summary(self, doc_id: int) -> str:
"""Generate an intelligent summary of a document."""
        if doc_id < 0 or doc_id >= len(self.documents):
            return "Document not found"
document = self.documents[doc_id]
print(f"📝 Generating summary for: {self.document_index[doc_id]['title']}")
prompt = f"""Summarize this document concisely:
Title: {document.get('title', 'Untitled')}
Content: {document['content']}
Provide:
1. Main topics and themes
2. Key insights or findings
3. Important details or data points
4. Practical implications or takeaways
Keep the summary informative but concise (2-3 paragraphs).
"""
try:
summary = self.llm_client.complete(prompt, max_tokens=500)
return summary
except Exception as e:
return f"Summary generation failed: {str(e)}"
def get_statistics(self) -> Dict:
"""Get system statistics."""
if not self.documents:
return {"documents": 0, "embeddings": 0}
        # Average pairwise similarity across all documents
        # (O(n^2) scan; fine for a small demo index)
all_similarities = []
for i in range(len(self.embeddings)):
for j in range(i + 1, len(self.embeddings)):
similarity = EmbeddingClient.similarity(
self.embeddings[i],
self.embeddings[j]
)
all_similarities.append(similarity)
return {
"total_documents": len(self.documents),
"total_embeddings": len(self.embeddings),
"average_document_similarity": sum(all_similarities) / len(all_similarities) if all_similarities else 0,
"max_similarity": max(all_similarities) if all_similarities else 0,
"min_similarity": min(all_similarities) if all_similarities else 0
}
# Example usage
def main():
"""Demonstrate the semantic search system."""
# Set up API key
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
print("Please set OPENAI_API_KEY environment variable")
return
# Create search system
search_system = SemanticSearchSystem(api_key)
# Sample documents
sample_documents = [
{
"title": "Introduction to Machine Learning",
"content": """Machine learning is a subset of artificial intelligence that enables computers to learn and improve from experience without being explicitly programmed. It focuses on developing algorithms that can automatically learn patterns from data and make predictions or decisions. The field encompasses supervised learning, unsupervised learning, and reinforcement learning approaches.""",
"metadata": {"category": "technology", "difficulty": "beginner"}
},
{
"title": "Deep Learning Neural Networks",
"content": """Deep learning uses artificial neural networks with multiple layers to model and understand complex patterns in data. These networks can automatically learn hierarchical representations of data, making them particularly effective for tasks like image recognition, natural language processing, and speech recognition. Popular architectures include convolutional neural networks and recurrent neural networks.""",
"metadata": {"category": "technology", "difficulty": "advanced"}
},
{
"title": "Sustainable Energy Solutions",
"content": """Renewable energy sources like solar, wind, and hydroelectric power are becoming increasingly important for environmental sustainability. These technologies offer clean alternatives to fossil fuels and can help reduce carbon emissions. Energy storage systems and smart grid technologies are crucial for integrating renewable energy into existing power infrastructure.""",
"metadata": {"category": "environment", "difficulty": "intermediate"}
},
{
"title": "Climate Change Impacts",
"content": """Climate change is causing significant environmental disruptions including rising sea levels, extreme weather events, and ecosystem changes. The scientific consensus indicates human activities, particularly greenhouse gas emissions, are the primary drivers. Mitigation strategies include transitioning to renewable energy, improving energy efficiency, and implementing carbon capture technologies.""",
"metadata": {"category": "environment", "difficulty": "intermediate"}
},
{
"title": "Digital Marketing Strategies",
"content": """Modern digital marketing encompasses social media marketing, search engine optimization, content marketing, and data analytics. Successful campaigns require understanding customer behavior, creating engaging content, and leveraging multiple digital channels. Personalization and automation tools are increasingly important for reaching target audiences effectively.""",
"metadata": {"category": "business", "difficulty": "beginner"}
}
]
# Add documents to index
search_system.add_documents(sample_documents)
# Perform searches
queries = [
"artificial intelligence and neural networks",
"renewable energy and sustainability",
"online marketing and social media"
]
for query in queries:
print(f"\n{'='*60}")
print(f"Query: {query}")
print('='*60)
# Search for similar documents
results = search_system.search(query, top_k=3)
# Display results
print("\n🔍 Search Results:")
for i, result in enumerate(results, 1):
print(f"{i}. {result['title']} (Similarity: {result['similarity']:.3f})")
print(f" {result['content_preview']}")
print()
# Analyze results with LLM
analysis = search_system.analyze_search_results(query, results)
print("LLM Analysis:")
print(analysis)
print()
# Compare documents
print(f"\n{'='*60}")
print("Document Comparison")
print('='*60)
comparison = search_system.compare_documents([0, 1, 2])
print("Document Similarities:")
for comp in comparison['comparisons'][:3]:
print(f"{comp['doc1_title']} <-> {comp['doc2_title']}: {comp['similarity']:.3f}")
# Generate summaries
print(f"\n{'='*60}")
print("Document Summaries")
print('='*60)
for i in range(min(2, len(sample_documents))):
summary = search_system.generate_document_summary(i)
print(f"\n📝 Summary of '{search_system.document_index[i]['title']}':")
print(summary)
# Show statistics
print(f"\n{'='*60}")
print("System Statistics")
print('='*60)
stats = search_system.get_statistics()
for key, value in stats.items():
if isinstance(value, float):
print(f"{key}: {value:.3f}")
else:
print(f"{key}: {value}")
if __name__ == "__main__":
main()
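The search() method above compares the query against every stored embedding one at a time, which is fine at this scale. As a sketch of how the same ranking can be vectorized for larger collections (assuming a cosine-style metric and NumPy available; the function name here is hypothetical):

import numpy as np

def top_k_cosine(query_vec, embedding_matrix, k=5):
    """Rank rows of embedding_matrix by cosine similarity to query_vec."""
    q = np.asarray(query_vec, dtype=float)
    m = np.asarray(embedding_matrix, dtype=float)
    sims = (m @ q) / (np.linalg.norm(m, axis=1) * np.linalg.norm(q) + 1e-12)
    top = np.argsort(-sims)[:k]
    return [(int(i), float(sims[i])) for i in top]

For corpora beyond a few thousand documents, a dedicated vector store with approximate nearest-neighbor search is the usual next step; the in-memory lists used here are purely for demonstration.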
Advanced Features¶
Batch Processing with Async Operations¶
import asyncio
from graphbit import LlmConfig, LlmClient
import os
async def process_large_document_collection():
"""Process large document collections asynchronously."""
# Configure for high-throughput processing
llm_config = LlmConfig.openai(
api_key=os.getenv("OPENAI_API_KEY"),
model="gpt-4o-mini"
)
llm_client = LlmClient(llm_config, debug=False)
# Large collection of documents (simulated)
documents = [f"Document {i} content about various topics..." for i in range(100)]
# Process in batches
batch_size = 10
results = []
for i in range(0, len(documents), batch_size):
batch = documents[i:i + batch_size]
print(f"Processing batch {i//batch_size + 1}...")
# Use batch completion for efficiency
prompts = [f"Summarize this document: {doc}" for doc in batch]
try:
batch_results = await llm_client.complete_batch(
prompts,
max_tokens=200,
temperature=0.3,
max_concurrency=5
)
results.extend(batch_results)
print(f"Completed batch {i//batch_size + 1}")
except Exception as e:
print(f"Batch {i//batch_size + 1} failed: {e}")
return results
# Usage
results = asyncio.run(process_large_document_collection())
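The batch_size and max_concurrency values above are illustrative starting points, not tuned recommendations: larger batches improve throughput but enlarge the blast radius of a failed request, while higher concurrency trades speed against provider rate limits.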
Multi-Provider Search System¶
from graphbit import init, EmbeddingConfig, EmbeddingClient, LlmConfig, LlmClient
import os
from typing import List, Dict
def create_multi_provider_system():
"""Create search system with multiple LLM providers."""
init()
# OpenAI for embeddings
embedding_config = EmbeddingConfig.openai(
api_key=os.getenv("OPENAI_API_KEY"),
model="text-embedding-3-small"
)
embedding_client = EmbeddingClient(embedding_config)
# Multiple LLM providers for analysis
providers = {
'openai': LlmClient(
LlmConfig.openai(
api_key=os.getenv("OPENAI_API_KEY"),
model="gpt-4o-mini"
)
),
'anthropic': LlmClient(
LlmConfig.anthropic(
api_key=os.getenv("ANTHROPIC_API_KEY"),
model="claude-sonnet-4-20250514"
)
) if os.getenv("ANTHROPIC_API_KEY") else None,
'ollama': LlmClient(
LlmConfig.ollama("llama3.2")
)
}
# Filter available providers
available_providers = {k: v for k, v in providers.items() if v is not None}
def analyze_with_multiple_providers(query: str, results: List[Dict]) -> Dict[str, str]:
"""Get analysis from multiple LLM providers."""
analyses = {}
prompt = f"Analyze these search results for '{query}': {results}"
for provider_name, client in available_providers.items():
try:
print(f"🤖 Getting analysis from {provider_name}...")
analysis = client.complete(prompt, max_tokens=300)
analyses[provider_name] = analysis
except Exception as e:
analyses[provider_name] = f"Error: {str(e)}"
return analyses
return embedding_client, analyze_with_multiple_providers
# Usage
embedding_client, analyzer = create_multi_provider_system()
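A hedged usage sketch for the returned analyzer follows. The sample result dict is made up purely for illustration; in practice you would pass real results such as those produced by SemanticSearchSystem.search:

# Hypothetical input for illustration only
sample_results = [{"title": "Sustainable Energy Solutions", "similarity": 0.71}]
analyses = analyzer("renewable energy", sample_results)
for provider, analysis in analyses.items():
    print(f"--- {provider} ---")
    print(analysis)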
Workflow-Based Semantic Analysis¶
from graphbit import init, LlmConfig, Executor, Workflow, Node
import os
def create_semantic_analysis_workflow():
"""Create comprehensive semantic analysis workflow."""
init()
config = LlmConfig.openai(
api_key=os.getenv("OPENAI_API_KEY"),
model="gpt-4o-mini"
)
executor = Executor(config, debug=True)
workflow = Workflow("Semantic Analysis Pipeline")
# Document Preprocessor
preprocessor = Node.agent(
name="Document Preprocessor",
prompt=f"""Preprocess this document for semantic analysis:
{document}
Tasks:
- Extract key topics and themes
- Identify important entities (people, places, concepts)
- Determine document type and structure
- Note any special formatting or data
Provide structured output for further analysis.
""",
agent_id="preprocessor"
)
# Semantic Analyzer
analyzer = Node.agent(
name="Semantic Analyzer",
prompt="""Perform semantic analysis on this preprocessed document:
Analyze:
- Semantic relationships between concepts
- Document sentiment and tone
- Key insights and findings
- Conceptual density and complexity
- Domain-specific terminology
Provide detailed semantic breakdown.
""",
agent_id="semantic_analyzer"
)
# Insight Generator
insight_generator = Node.agent(
name="Insight Generator",
prompt="""Generate actionable insights from this semantic analysis:
Create:
- Summary of key findings
- Practical implications
- Related topics for exploration
- Recommendations for further analysis
- Quality assessment of the content
Focus on useful, actionable insights.
""",
agent_id="insight_generator"
)
# Add nodes and connect
prep_id = workflow.add_node(preprocessor)
analyze_id = workflow.add_node(analyzer)
insight_id = workflow.add_node(insight_generator)
workflow.connect(prep_id, analyze_id)
workflow.connect(analyze_id, insight_id)
workflow.validate()
return executor, workflow
# Usage
executor, workflow = create_semantic_analysis_workflow()
result = executor.execute(workflow)
System Monitoring and Health¶
from graphbit import init, EmbeddingConfig, EmbeddingClient, health_check, get_system_info
import os
def monitor_semantic_search_system():
"""Monitor system health and performance."""
init()
# Check system health
health = health_check()
print("System Health:")
for key, value in health.items():
status = "Ok!" if value else "Not Ok!"
print(f" {status} {key}: {value}")
# Get system information
info = get_system_info()
print("\nSystem Information:")
for key, value in info.items():
print(f" {key}: {value}")
# Test embedding client performance
embedding_config = EmbeddingConfig.openai(
api_key=os.getenv("OPENAI_API_KEY"),
model="text-embedding-3-small"
)
try:
client = EmbeddingClient(embedding_config)
# Performance test
import time
start_time = time.time()
test_embedding = client.embed("Performance test text")
end_time = time.time()
duration = (end_time - start_time) * 1000
print(f"\n⚡ Performance Test:")
print(f" Embedding generation: {duration:.2f}ms")
print(f" Embedding dimension: {len(test_embedding)}")
return True
except Exception as e:
print(f"\nPerformance test failed: {e}")
return False
# Usage
system_healthy = monitor_semantic_search_system()
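To complement the single-call timing above, a rough batch-throughput check can reuse the embed_many call from the main example. A sketch only; absolute numbers will vary with network, model, and batch size:

import time

def benchmark_batch_embedding(client, n: int = 32) -> float:
    """Return an approximate embeddings-per-second rate for n short texts."""
    texts = [f"benchmark text {i}" for i in range(n)]
    start = time.time()
    client.embed_many(texts)
    elapsed = time.time() - start
    return n / elapsed if elapsed > 0 else float("inf")

# Usage (with the client constructed in monitor_semantic_search_system):
# print(f"Throughput: {benchmark_batch_embedding(client):.1f} embeddings/sec")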
Key Benefits¶
Semantic Understanding¶
- Deep Search: Beyond keyword matching to semantic similarity
- Context Awareness: Understanding document relationships and themes
- Intelligent Analysis: LLM-powered insights and recommendations
Scalability¶
- Batch Processing: Efficient handling of large document collections
- Async Operations: Non-blocking processing for better performance
- Multiple Providers: Flexibility to use different LLM providers
Flexibility¶
- Multi-Provider Support: OpenAI, Anthropic, Ollama
- Workflow Integration: Combine with GraphBit's workflow system
- Custom Analysis: Tailored semantic analysis pipelines
This example demonstrates how GraphBit's embedding capabilities can be combined with LLM analysis to create powerful semantic search and analysis systems.