Skip to content

Document Loader

GraphBit extracts content from multiple document formats for AI workflow processing.

Components

  • DocumentLoader - Main loading class
  • DocumentLoaderConfig - Configuration options
  • DocumentContent - Extracted content and metadata

Supported Document Types

Type | Description
txt | Plain text files
pdf | PDF documents
docx | Microsoft Word documents
json | JSON structured data files
csv | Comma-separated values (spreadsheets)
xml | XML structured data files
html | HTML web pages

Quick Start

from graphbit import DocumentLoader

loader = DocumentLoader()

# Load document
content = loader.load_document("document.pdf", "pdf")
print(f"Extracted {content.content_length()} characters")

Auto-Detection

from graphbit import DocumentLoader

def load_document(file_path):
    """Load a document, inferring its type from the file path.

    Returns the loaded content, or None when the document type cannot be
    detected.
    """
    detected = DocumentLoader.detect_document_type(file_path)
    if detected:
        return DocumentLoader().load_document(file_path, detected)
    return None

Configuration

Basic Setup

from graphbit import DocumentLoaderConfig, DocumentLoader

config = DocumentLoaderConfig(
    max_file_size=50_000_000,    # 50MB limit
    default_encoding="utf-8",    # Text encoding
    preserve_formatting=True     # Keep formatting
)

loader = DocumentLoader(config)

Advanced Settings

from graphbit import DocumentLoaderConfig

config = DocumentLoaderConfig()
config.extraction_settings = {
    "pdf_parser": "advanced",
    "ocr_enabled": True,
    "table_detection": True
}

Properties

max_file_size

Get or set the maximum file size limit.

# Read the current limit
size = config.max_file_size

# Assign a new limit through the same property
config.max_file_size = 100_000_000  # 100MB

default_encoding

Get or set the default text encoding.

# Read the current encoding
encoding = config.default_encoding

# Assign a new encoding through the same property
config.default_encoding = "utf-8"

preserve_formatting

Get or set the formatting preservation flag.

# Read the current flag
preserve = config.preserve_formatting

# Assign a new value through the same property
config.preserve_formatting = True

extraction_settings

Get or set extraction settings as a dictionary.

# Read the current settings dictionary
settings = config.extraction_settings

# Assign a new settings dictionary through the same property
settings = {"pdf_parser": "advanced", "ocr_enabled": True}
config.extraction_settings = settings

Document Types

PDF Processing

from graphbit import DocumentLoaderConfig, DocumentLoader

config = DocumentLoaderConfig(preserve_formatting=True)
config.extraction_settings = {
    "ocr_enabled": True,
    "table_detection": True
}
loader = DocumentLoader(config)
content = loader.load_document("report.pdf", "pdf")

# Access metadata
metadata = content.metadata
print(f"Pages: {metadata.get('pages')}")

Text Files

from graphbit import DocumentLoaderConfig, DocumentLoader

config = DocumentLoaderConfig(default_encoding="utf-8")
loader = DocumentLoader(config)
content = loader.load_document("notes.txt", "txt")

Structured Data

from graphbit import DocumentLoader

# JSON, CSV, XML automatically parsed as text
loader = DocumentLoader()
json_content = loader.load_document("data.json", "json")
csv_content = loader.load_document("data.csv", "csv")

Static Methods

DocumentLoader.supported_types()

Get list of supported document types.

types = DocumentLoader.supported_types()
print(f"Supported formats: {types}")
# Output: ['txt', 'pdf', 'docx', 'json', 'csv', 'xml', 'html']

DocumentLoader.detect_document_type(file_path)

Detect document type from file extension.

doc_type = DocumentLoader.detect_document_type("report.pdf")
print(f"Detected type: {doc_type}")  # "pdf"

# Returns None if type cannot be detected
unknown_type = DocumentLoader.detect_document_type("file.unknown")
print(unknown_type)  # None

DocumentLoader.validate_document_source(source_path, document_type)

Validate document source and type combination.

try:
    DocumentLoader.validate_document_source("report.pdf", "pdf")
    print("Valid document source")
except Exception as e:
    print(f"Invalid: {e}")

Batch Processing

import os
from graphbit import DocumentLoader

def process_directory(directory):
    """Load every supported document found directly inside a directory.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        A list of ``{'file': filename, 'content': content}`` dicts, one per
        document that loaded successfully, ordered by filename.
    """
    loader = DocumentLoader()
    results = []

    # os.listdir() returns entries in arbitrary order; sort so that the
    # result order is reproducible across runs and platforms.
    for filename in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, filename)

        # Skip subdirectories and other non-file entries, even when their
        # name happens to end in a supported extension.
        if not os.path.isfile(file_path):
            continue

        doc_type = DocumentLoader.detect_document_type(file_path)
        if not doc_type:
            continue

        try:
            content = loader.load_document(file_path, doc_type)
        except Exception as e:
            # Best-effort batch: report the failure and keep going.
            print(f"Failed {filename}: {e}")
        else:
            results.append({'file': filename, 'content': content})

    return results

Error Handling

import os
from graphbit import DocumentLoader, DocumentLoaderConfig

def safe_load_document(file_path, max_size=50_000_000):
    """Load a document defensively, returning None on any handled failure.

    The path must exist, be no larger than ``max_size`` bytes, have a
    detectable document type, and yield non-empty content. A
    FileNotFoundError, ValueError or RuntimeError raised along the way is
    printed and turned into a None return.
    """
    try:
        # Pre-flight checks on the path itself
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")
        if os.path.getsize(file_path) > max_size:
            raise ValueError("File too large")

        detected = DocumentLoader.detect_document_type(file_path)
        if not detected:
            raise ValueError("Unsupported file type")

        # Pass the same size limit on to the loader configuration
        limits = DocumentLoaderConfig(max_file_size=max_size)
        document = DocumentLoader(limits).load_document(file_path, detected)

        if document.is_empty():
            raise RuntimeError("No content extracted")
    except (FileNotFoundError, ValueError, RuntimeError) as err:
        print(f"Error: {err}")
        return None

    return document

Common Issues

Issue | Solution
File too large | Increase max_file_size in config
Encoding errors | Set default_encoding="utf-8"
Empty PDF content | Set "ocr_enabled": True in extraction_settings for scanned PDFs
Unsupported format | Check DocumentLoader.supported_types()

API Reference

For complete API documentation, see the Python API Reference.