Document Loading¶
GraphBit extracts content from multiple document formats for AI workflow processing.
Components¶
- DocumentLoader - Main loading class
- DocumentLoaderConfig - Configuration options
- DocumentContent - Extracted content and metadata
Supported Formats¶
PDF, DOCX, TXT, JSON, CSV, XML, HTML
Quick Start¶
from graphbit import init, DocumentLoader
init()
loader = DocumentLoader()
# Load document
content = loader.load_document("document.pdf", "pdf")
print(f"Extracted {content.content_length()} characters")
Auto-Detection¶
from graphbit import DocumentLoader
def load_document(file_path):
doc_type = DocumentLoader.detect_document_type(file_path)
if not doc_type:
return None
loader = DocumentLoader()
return loader.load_document(file_path, doc_type)
Configuration¶
Basic Setup¶
from graphbit import DocumentLoaderConfig, DocumentLoader
config = DocumentLoaderConfig(
max_file_size=50_000_000, # 50MB limit
default_encoding="utf-8", # Text encoding
preserve_formatting=True # Keep formatting
)
loader = DocumentLoader(config)
Advanced Settings¶
from graphbit import DocumentLoaderConfig
config = DocumentLoaderConfig()
config.extraction_settings = {
"pdf_parser": "advanced",
"ocr_enabled": True,
"table_detection": True
}
Document Types¶
PDF Processing¶
from graphbit import DocumentLoaderConfig, DocumentLoader
config = DocumentLoaderConfig(preserve_formatting=True)
config.extraction_settings = {
"ocr_enabled": True,
"table_detection": True
}
loader = DocumentLoader(config)
content = loader.load_document("report.pdf", "pdf")
# Access metadata
metadata = content.metadata
print(f"Pages: {metadata.get('pages')}")
Text Files¶
from graphbit import DocumentLoaderConfig, DocumentLoader
config = DocumentLoaderConfig(default_encoding="utf-8")
loader = DocumentLoader(config)
content = loader.load_document("notes.txt", "txt")
Structured Data¶
from graphbit import DocumentLoader
# JSON, CSV, XML automatically parsed as text
loader = DocumentLoader()
json_content = loader.load_document("data.json", "json")
csv_content = loader.load_document("data.csv", "csv")
Batch Processing¶
import os
from graphbit import DocumentLoader
def process_directory(directory):
loader = DocumentLoader()
results = []
for filename in os.listdir(directory):
file_path = os.path.join(directory, filename)
doc_type = DocumentLoader.detect_document_type(file_path)
if doc_type:
try:
content = loader.load_document(file_path, doc_type)
results.append({'file': filename, 'content': content})
except Exception as e:
print(f"Failed {filename}: {e}")
return results
Workflow Integration¶
from graphbit import Workflow, Node, Executor
# Document processing workflow
workflow = Workflow("Document Analysis")
# Add document loader node
doc_loader = Node.document_loader(
name="PDF Loader",
document_type="pdf",
source_path="report.pdf"
)
# Add analysis agent
analyzer = Node.agent(
name="Analyzer",
prompt="Summarize: {input}"
)
# Connect and execute
loader_id = workflow.add_node(doc_loader)
analyzer_id = workflow.add_node(analyzer)
workflow.connect(loader_id, analyzer_id)
executor = Executor(llm_config)  # llm_config: your LLM provider configuration (see LLM setup docs)
result = executor.execute(workflow)
Error Handling¶
import os
from graphbit import DocumentLoader, DocumentLoaderConfig
def safe_load_document(file_path, max_size=50_000_000):
try:
# Validate file
if not os.path.exists(file_path):
raise FileNotFoundError(f"File not found: {file_path}")
# Check size
if os.path.getsize(file_path) > max_size:
raise ValueError("File too large")
# Detect and validate type
doc_type = DocumentLoader.detect_document_type(file_path)
if not doc_type:
raise ValueError("Unsupported file type")
# Load with size limit
config = DocumentLoaderConfig(max_file_size=max_size)
loader = DocumentLoader(config)
content = loader.load_document(file_path, doc_type)
if content.is_empty():
raise RuntimeError("No content extracted")
return content
except (FileNotFoundError, ValueError, RuntimeError) as e:
print(f"Error: {e}")
return None
Performance Tips¶
Memory Optimization¶
from graphbit import DocumentLoaderConfig, DocumentLoader
def memory_efficient_processing(files):
config = DocumentLoaderConfig(max_file_size=10_000_000)
loader = DocumentLoader(config)
for file_path in files:
doc_type = DocumentLoader.detect_document_type(file_path)
content = loader.load_document(file_path, doc_type)
# Process immediately, don't store
process_content(content)
del content # Free memory
Batch Configuration¶
from graphbit import DocumentLoaderConfig, DocumentLoader
# Shared loader for multiple files
config = DocumentLoaderConfig(preserve_formatting=False) # Faster
loader = DocumentLoader(config)
# Process in batches
for batch in chunked(file_list, 10):
for file_path in batch:
doc_type = DocumentLoader.detect_document_type(file_path)
content = loader.load_document(file_path, doc_type)
# Process batch...
Common Issues¶
| Issue | Solution |
|---|---|
File too large | Increase max_file_size in config |
Encoding errors | Set default_encoding="utf-8" |
Empty PDF content | Enable ocr_enabled=True for scanned PDFs |
Unsupported format | Check DocumentLoader.supported_types() |
API Reference¶
For complete API documentation, see Python API Reference.