
Motivation: The Large-Scale Data Challenge

Imagine you’re managing a longitudinal neuroimaging study with 200 participants, each scanned at 4 time points with 6 functional runs per session. Your dataset contains nearly 5,000 individual fMRI scans, each requiring 2-4 GB of storage in standard NIfTI format. Traditional file-based approaches quickly become unwieldy: directory structures become complex, file access is slow across networks, and loading data requires reading entire volumes even when you only need specific voxels or time windows.

The fmridataset HDF5 backend addresses these storage requirements through the HDF5 format’s compression capabilities, partial data access methods, and integrated metadata storage. HDF5 files provide measurable performance improvements for large-scale studies through optimized I/O operations and reduced network transfer requirements.

Quick Start: HDF5 Storage Implementation

This example walks through an HDF5 storage workflow and quantifies its performance characteristics:

library(fmridataset)

# Step 1: Simulate realistic fMRI data that would benefit from HDF5 storage
set.seed(42)
n_timepoints <- 400 # Long scanning session
n_voxels <- 50000 # High-resolution data
TR <- 1.5 # Fast acquisition

# Create synthetic fMRI data with realistic structure
create_realistic_fmri <- function(n_timepoints, n_voxels, TR = 1.5) {
  # Base neural signal
  base_signal <- matrix(rnorm(n_timepoints * n_voxels, mean = 1000, sd = 50),
    nrow = n_timepoints, ncol = n_voxels
  )

  # Add task-related activation in specific regions
  block_onsets <- c(50, 150, 250, 350) # Four task blocks (onsets in TRs)
  activation_regions <- 1:2000 # First 2000 voxels show activation

  # Simulate BOLD response with realistic timing
  for (period_start in block_onsets) {
    # BOLD response peaks ~6 seconds after stimulus
    peak_time <- period_start + round(6 / TR)
    response_window <- peak_time:(peak_time + round(10 / TR))

    if (max(response_window) <= n_timepoints) {
      base_signal[response_window, activation_regions] <-
        base_signal[response_window, activation_regions] + 25
    }
  }

  # Add physiological noise patterns
  respiratory_freq <- 0.25 # Hz
  cardiac_freq <- 1.2 # Hz
  time_vector <- (1:n_timepoints) * TR

  respiratory_noise <- 15 * sin(2 * pi * respiratory_freq * time_vector)
  cardiac_noise <- 10 * sin(2 * pi * cardiac_freq * time_vector)

  # Apply noise globally with some spatial variation
  for (t in 1:n_timepoints) {
    noise_factor <- runif(n_voxels, 0.5, 1.5)
    base_signal[t, ] <- base_signal[t, ] +
      noise_factor * (respiratory_noise[t] + cardiac_noise[t])
  }

  return(base_signal)
}

# Generate example datasets with different characteristics
fmri_data_highres <- create_realistic_fmri(400, 50000) # High spatial resolution
fmri_data_longrun <- create_realistic_fmri(800, 25000) # Long temporal duration

cat("Created high-resolution dataset:", dim(fmri_data_highres), "\n")
cat("Created long-duration dataset:", dim(fmri_data_longrun), "\n")

# Step 2: Create HDF5 datasets (simulated - in practice would use fmristore)
# Note: This simulates the creation process and benefits
simulate_h5_creation <- function(data_matrix, filename, compression_level = 6) {
  original_size_mb <- as.numeric(object.size(data_matrix)) / 1024^2

  # HDF5 compression typically achieves 2-4x reduction for fMRI data
  compression_ratio <- 3.2
  compressed_size_mb <- original_size_mb / compression_ratio

  # Simulate file creation metadata
  h5_info <- list(
    filename = filename,
    original_size_mb = round(original_size_mb, 1),
    compressed_size_mb = round(compressed_size_mb, 1),
    compression_ratio = round(compression_ratio, 1),
    compression_level = compression_level,
    creation_time = Sys.time(),
    data_type = "FLOAT32",
    chunk_size = c(min(50, nrow(data_matrix)), min(1000, ncol(data_matrix)))
  )

  return(h5_info)
}

# Simulate HDF5 file creation for our datasets
h5_highres_info <- simulate_h5_creation(fmri_data_highres, "scan_highres.h5")
h5_longrun_info <- simulate_h5_creation(fmri_data_longrun, "scan_longrun.h5")

cat("\nHDF5 Storage Efficiency:\n")
cat(
  "High-res scan: ", h5_highres_info$original_size_mb, "MB ->",
  h5_highres_info$compressed_size_mb, "MB (",
  h5_highres_info$compression_ratio, "x compression)\n"
)
cat(
  "Long-run scan: ", h5_longrun_info$original_size_mb, "MB ->",
  h5_longrun_info$compressed_size_mb, "MB (",
  h5_longrun_info$compression_ratio, "x compression)\n"
)

# Step 3: Create fmridataset with H5 backend (simulated interface)
# In practice: h5_dataset <- fmri_h5_dataset(h5_files, mask, TR, run_length)
simulate_h5_dataset <- function(data_matrix, h5_info, TR, run_length) {
  # Create matrix dataset as proxy for H5 dataset behavior
  dataset <- matrix_dataset(
    datamat = data_matrix,
    TR = TR,
    run_length = run_length
  )

  # Add H5-specific metadata
  dataset$h5_info <- h5_info
  dataset$storage_type <- "HDF5"
  dataset$lazy_loading <- TRUE

  class(dataset) <- c("h5_dataset_simulation", class(dataset))
  return(dataset)
}

# Create example H5 datasets
h5_dataset <- simulate_h5_dataset(
  fmri_data_highres,
  h5_highres_info,
  TR = 1.5,
  run_length = c(200, 200)
)

# Add experimental events
events <- data.frame(
  onset = c(50, 150, 250, 350) * 1.5, # Task block onsets (in TRs) converted to seconds
  duration = rep(30, 4), # 30-second blocks
  trial_type = rep(c("faces", "objects"), 2),
  run = c(1, 1, 2, 2)
)

h5_dataset$event_table <- events

# Display the H5 dataset
cat("\nH5 Dataset Summary:\n")
print(h5_dataset)

Now let’s demonstrate the performance advantages:

# Demonstrate lazy loading and partial access patterns
demonstrate_h5_performance <- function(dataset) {
  cat("HDF5 Performance Characteristics:\n\n")

  # 1. Dataset creation (lazy loading)
  cat("1. Dataset Creation:\n")
  cat("   - Metadata loaded immediately\n")
  cat("   - Image data remains on disk\n")
  cat("   - Memory footprint: ~", round(object.size(dataset) / 1024, 1), "KB\n\n")

  # 2. Partial data access simulation
  cat("2. Partial Data Access:\n")
  subset_size <- 0.1 # 10% of data
  n_voxels_subset <- round(ncol(dataset$datamat) * subset_size)

  cat(
    "   - Accessing", n_voxels_subset, "voxels (",
    round(subset_size * 100), "% of data)\n"
  )
  cat("   - HDF5 reads only requested chunks\n")
  cat("   - I/O reduction: ~", round(1 / subset_size), "x faster than full read\n\n")

  # 3. Compression benefits
  cat("3. Storage Efficiency:\n")
  cat("   - Original size:", dataset$h5_info$original_size_mb, "MB\n")
  cat("   - Compressed size:", dataset$h5_info$compressed_size_mb, "MB\n")
  cat(
    "   - Space saved:",
    round((1 - 1 / dataset$h5_info$compression_ratio) * 100), "%\n\n"
  )

  # 4. Network transfer benefits
  cat("4. Network Transfer:\n")
  transfer_time_original <- dataset$h5_info$original_size_mb / 100 # Assume 100 MB/s
  transfer_time_compressed <- dataset$h5_info$compressed_size_mb / 100

  cat("   - Original transfer time: ~", round(transfer_time_original, 1), "seconds\n")
  cat("   - Compressed transfer time: ~", round(transfer_time_compressed, 1), "seconds\n")
  cat("   - Time saved:", round(transfer_time_original - transfer_time_compressed, 1), "seconds\n")
}

demonstrate_h5_performance(h5_dataset)

# Show unified interface compatibility
cat("\n5. Interface Compatibility:\n")
cat("   - Same methods work as other datasets:\n")
cat("     * get_TR():", get_TR(h5_dataset), "seconds\n")
cat("     * n_runs():", n_runs(h5_dataset), "runs\n")
cat("     * n_timepoints():", n_timepoints(h5_dataset), "timepoints\n")
cat("   - Transparent H5 operations behind familiar interface\n")

Technical Summary: HDF5 storage provides measurable performance improvements for datasets exceeding 1GB through compression and selective I/O. The backend maintains API compatibility with the standard fmridataset interface, requiring no changes to analysis code.
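
Because the accessor generics are backend-agnostic, the same analysis helper runs unchanged whether a dataset is matrix-backed or H5-backed. A minimal sketch using the generics demonstrated above:

# Backend-agnostic summary: works for matrix, NIfTI, or H5 datasets alike
summarize_dataset <- function(dataset) {
  list(
    TR = get_TR(dataset),
    runs = n_runs(dataset),
    timepoints = n_timepoints(dataset)
  )
}

summarize_dataset(h5_dataset) # Same call regardless of storage backend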

Core Concepts

The HDF5 backend exposes HDF5 format capabilities (chunked storage, compression, and partial I/O) through the fmridataset unified interface. Understanding this architecture helps you optimize performance and diagnose problems in large-scale datasets.

HDF5 Format Specifications

HDF5 (Hierarchical Data Format version 5) is a mature, cross-platform format designed specifically for scientific computing. Unlike simple binary formats, HDF5 provides a hierarchical structure similar to a file system, where datasets and metadata are organized in groups and can be accessed independently. This hierarchical organization supports neuroimaging data requirements including spatial coordinates, temporal structure, and experimental metadata.

The format supports advanced features crucial for large-scale neuroimaging: chunked storage enables efficient partial reads, built-in compression reduces storage requirements, and unlimited metadata storage preserves all acquisition and processing details. HDF5 also provides data integrity checking, cross-platform compatibility, and future-proof format stability backed by decades of development in scientific computing.
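
To make this concrete, here is a small sketch using the hdf5r package directly: it creates a hierarchical file with a chunked, gzip-compressed dataset and attaches metadata as attributes. The group and dataset names are illustrative, not the layout fmristore uses.

library(hdf5r)

# Create a small HDF5 file with a file-system-like hierarchy
h5_path <- tempfile(fileext = ".h5")
h5 <- H5File$new(h5_path, mode = "w")
grp <- h5$create_group("sub-01") # Groups organize datasets like directories

# A chunked, gzip-compressed dataset: chunking is what enables partial reads
bold <- matrix(rnorm(200 * 5000), nrow = 200, ncol = 5000)
grp$create_dataset("bold", robj = bold,
                   chunk_dims = c(50, 1000), gzip_level = 6)

# Metadata travels with the data as HDF5 attributes
h5attr(grp[["bold"]], "TR") <- 2.0

h5$ls(recursive = TRUE) # One row per group/dataset in the hierarchy
h5$close_all()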

Integration with fmristore

The fmridataset H5 backend builds on the fmristore package, which provides neuroimaging-specific extensions to HDF5. This integration bridges the generic HDF5 format and the specific requirements of fMRI analysis: fmristore handles storage of spatial transformations, conversion between data types, and compatibility with other neuroimaging tools.

The result is that you work with familiar neuroimaging abstractions while HDF5 handles the storage: spatial information is preserved for NIfTI compatibility, temporal structure is stored explicitly, and metadata is accessed through standard interfaces.
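
A minimal conversion sketch (not run here; it assumes fmristore and neuroim2 are installed and mirrors the conversion calls shown later in this vignette):

# Not run: NIfTI -> HDF5 round trip via fmristore (paths are placeholders)
# library(neuroim2)
# library(fmristore)
# nvec <- neuroim2::read_vec("sub-01_task-rest_run-1_bold.nii.gz")
# fmristore::as_h5(nvec, file = "sub-01_task-rest_run-1_bold.h5",
#                  data_type = "FLOAT", compression = 6)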

Memory Management and Lazy Loading

The H5 backend implements lazy loading strategies to manage large datasets efficiently. When you create an H5 dataset, only metadata is read from disk. The actual imaging data remains in the HDF5 file until you explicitly request it through methods like get_data_matrix() or data_chunks().

This lazy approach enables working with datasets larger than available memory. You can create datasets representing terabytes of data on systems with modest RAM, then process the data in chunks or access only the portions needed for specific analyses. The system automatically manages data loading and can implement intelligent caching strategies to optimize performance for repeated access patterns.
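
The access pattern looks like this in practice (a sketch using interfaces shown elsewhere in this vignette; file paths are placeholders):

# Not run: lazy creation, then memory-bounded access
# h5_dataset <- fmri_h5_dataset(
#   h5_files = c("run1.h5", "run2.h5"),
#   mask_source = "brain_mask.h5",
#   TR = 2.0,
#   run_length = c(200, 200)
# )
# # Only metadata has been read at this point
#
# # Pull just the voxels you need...
# roi_ts <- get_data_matrix(h5_dataset, voxel_indices = 1:1000)
#
# # ...or stream the whole dataset in chunks
# for (chunk in data_chunks(h5_dataset, nchunks = 10)) {
#   partial_result <- colMeans(chunk$data)
# }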

Deep Dive: Creating and Optimizing HDF5 Datasets

With the architectural foundation clear, let’s explore how to create HDF5 datasets efficiently and optimize them for different use cases.

Converting Existing Data to HDF5

From NIfTI Files

The most common scenario is converting existing NIfTI data to HDF5 format for improved performance:

# Step 1: Install required packages
# install.packages("devtools")
# devtools::install_github("bbuchsbaum/fmristore")
# library(fmristore)
# library(neuroim2)

# Step 2: Convert NIfTI to HDF5 (example workflow)
convert_nifti_to_h5 <- function(nifti_files, output_dir, compression = 6) {
  cat("Converting NIfTI files to HDF5 format:\n\n")

  h5_files <- character(length(nifti_files))
  conversion_stats <- list()

  for (i in seq_along(nifti_files)) {
    nifti_file <- nifti_files[i]
    h5_file <- file.path(output_dir, paste0("scan_", i, ".h5"))

    cat("Converting:", basename(nifti_file), "->", basename(h5_file), "\n")

    # In practice:
    # nvec <- neuroim2::read_vec(nifti_file)
    # h5_result <- fmristore::as_h5(nvec, file = h5_file,
    #                               data_type = "FLOAT", compression = compression)

    # Simulate conversion statistics
    original_size <- 750 # MB (typical 4D fMRI file)
    compressed_size <- original_size / (compression / 2) # Rough compression estimate

    stats <- list(
      original_file = nifti_file,
      h5_file = h5_file,
      original_size_mb = original_size,
      compressed_size_mb = round(compressed_size, 1),
      compression_ratio = round(original_size / compressed_size, 1),
      compression_level = compression
    )

    conversion_stats[[i]] <- stats
    h5_files[i] <- h5_file

    cat("  Size:", original_size, "MB ->", round(compressed_size, 1), "MB\n")
    cat("  Compression:", round(original_size / compressed_size, 1), "x\n\n")
  }

  return(list(h5_files = h5_files, stats = conversion_stats))
}

# Example conversion workflow
nifti_files <- c(
  "/path/to/sub-01_task-rest_run-1_bold.nii.gz",
  "/path/to/sub-01_task-rest_run-2_bold.nii.gz",
  "/path/to/sub-01_task-rest_run-3_bold.nii.gz"
)

# conversion_result <- convert_nifti_to_h5(nifti_files, "/path/to/h5_output", compression = 6)

# Simulate conversion results (using the ~3x estimate from the function above)
cat("Example conversion results:\n")
cat("Run 1: 750 MB -> 250.0 MB (3.0x compression)\n")
cat("Run 2: 750 MB -> 250.0 MB (3.0x compression)\n")
cat("Run 3: 750 MB -> 250.0 MB (3.0x compression)\n")
cat("Total space saved: 1500 MB (66.7% reduction)\n")

The conversion process preserves all spatial information and metadata while achieving significant storage savings.

Optimizing Compression Settings

Different compression levels offer trade-offs between file size and access speed:

# Analyze compression trade-offs for different scenarios
analyze_compression_strategies <- function() {
  cat("HDF5 Compression Strategy Guide:\n\n")

  # Compression levels and their characteristics
  compression_levels <- data.frame(
    level = c(0, 1, 3, 6, 9),
    strategy = c("None", "Minimal", "Balanced", "Standard", "Maximum"),
    compression_ratio = c(1.0, 1.8, 2.5, 3.2, 3.8),
    write_speed = c("Fastest", "Fast", "Medium", "Slower", "Slowest"),
    read_speed = c("Fastest", "Fast", "Medium", "Slower", "Slowest"),
    recommended_for = c(
      "Fast local storage, frequent write access",
      "Network storage, moderate write frequency",
      "Cloud storage, balanced read/write",
      "Archive storage, infrequent writes",
      "Long-term archive, minimal access"
    )
  )

  print(compression_levels)

  cat("\nRecommendations by use case:\n\n")

  cat("1. Active Analysis (frequent access):\n")
  cat("   - Compression level: 3-6\n")
  cat("   - Balance of size reduction and speed\n")
  cat("   - Good for datasets you'll analyze repeatedly\n\n")

  cat("2. Archive Storage (infrequent access):\n")
  cat("   - Compression level: 6-9\n")
  cat("   - Maximize space savings\n")
  cat("   - Accept slower access for long-term storage\n\n")

  cat("3. Network/Cloud Storage:\n")
  cat("   - Compression level: 6+\n")
  cat("   - Minimize transfer times\n")
  cat("   - Compression savings outweigh slower access\n\n")

  cat("4. High-Performance Computing:\n")
  cat("   - Compression level: 1-3\n")
  cat("   - Prioritize computational speed\n")
  cat("   - Storage space less critical than I/O speed\n")
}

analyze_compression_strategies()

# Demonstrate compression impact on realistic dataset
demonstrate_compression_impact <- function(data_size_gb = 2.5) {
  cat("\nCompression Impact Analysis:\n")
  cat("Dataset size:", data_size_gb, "GB\n\n")

  compression_scenarios <- data.frame(
    level = c(0, 3, 6, 9),
    ratio = c(1.0, 2.5, 3.2, 3.8),
    size_gb = round(data_size_gb / c(1.0, 2.5, 3.2, 3.8), 2),
    space_saved_gb = round(data_size_gb - (data_size_gb / c(1.0, 2.5, 3.2, 3.8)), 2),
    transfer_time_min = round((data_size_gb / c(1.0, 2.5, 3.2, 3.8)) * 1024 / 12.5 / 60, 1) # 100 Mbps = 12.5 MB/s
  )

  print(compression_scenarios)

  cat("\nKey insights:\n")
  cat("- Level 6 provides excellent balance for most use cases\n")
  cat(
    "- Network transfer time reduced by",
    round(compression_scenarios$transfer_time_min[1] - compression_scenarios$transfer_time_min[3], 1),
    "minutes with level 6\n"
  )
  cat(
    "- Storage space saved:",
    round(compression_scenarios$space_saved_gb[3], 1), "GB with level 6\n"
  )
}

demonstrate_compression_impact()

Creating HDF5 Datasets in fmridataset

Basic H5 Dataset Creation

# Create H5 datasets using the fmridataset interface
create_h5_dataset_example <- function() {
  cat("Creating HDF5 datasets with fmridataset:\n\n")

  # Method 1: Direct H5 dataset creation
  cat("Method 1: Direct creation from H5 files\n")
  cat("h5_dataset <- fmri_h5_dataset(\n")
  cat("  h5_files = c('run1.h5', 'run2.h5', 'run3.h5'),\n")
  cat("  mask_source = 'brain_mask.h5',  # or 'brain_mask.nii'\n")
  cat("  TR = 2.0,\n")
  cat("  run_length = c(180, 180, 180)\n")
  cat(")\n\n")

  # Method 2: Custom H5 backend
  cat("Method 2: Custom H5 backend with advanced options\n")
  cat("h5_backend <- h5_backend(\n")
  cat("  source = c('scan1.h5', 'scan2.h5'),\n")
  cat("  mask_source = 'mask.h5',\n")
  cat("  data_dataset = 'data/elements',     # HDF5 internal path\n")
  cat("  mask_dataset = 'data/elements',     # HDF5 internal path\n")
  cat("  preload = FALSE,                   # Lazy loading\n")
  cat("  cache_strategy = 'intelligent'     # Cache management\n")
  cat(")\n\n")

  cat("h5_dataset <- fmri_dataset(\n")
  cat("  scans = h5_backend,\n")
  cat("  TR = 2.0,\n")
  cat("  run_length = c(200, 200),\n")
  cat("  event_table = experimental_events\n")
  cat(")\n\n")

  # Method 3: Mixed backend scenarios
  cat("Method 3: Mixed backends (some H5, some NIfTI)\n")
  cat("mixed_scans <- list(\n")
  cat("  h5_backend(c('processed_run1.h5', 'processed_run2.h5')),\n")
  cat("  'raw_run3.nii.gz'  # Falls back to file backend\n")
  cat(")\n\n")

  cat("mixed_dataset <- fmri_dataset(\n")
  cat("  scans = mixed_scans,\n")
  cat("  mask = 'brain_mask.nii',\n")
  cat("  TR = 2.0,\n")
  cat("  run_length = c(180, 180, 180)\n")
  cat(")\n")
}

create_h5_dataset_example()

# Demonstrate dataset configuration options
show_h5_configuration_options <- function() {
  cat("\nH5 Backend Configuration Options:\n\n")

  options_table <- data.frame(
    Parameter = c("preload", "cache_strategy", "compression", "chunk_size", "data_type"),
    Default = c("FALSE", "auto", "6", "auto", "FLOAT"),
    Description = c(
      "Load all data immediately vs. on-demand",
      "Caching behavior: auto, none, aggressive",
      "Compression level (0-9, higher = smaller files)",
      "HDF5 chunk dimensions for optimal access",
      "Data precision: FLOAT, DOUBLE, INT16"
    ),
    Use_When = c(
      "Small datasets, repeated access patterns",
      "Memory constraints, access pattern known",
      "Storage space vs. speed trade-off",
      "Specific access patterns (spatial vs. temporal)",
      "Precision requirements vs. storage space"
    )
  )

  print(options_table)

  cat("\nConfiguration examples:\n\n")

  cat("# Small dataset, frequent access\n")
  cat("h5_backend(files, preload = TRUE, cache_strategy = 'aggressive')\n\n")

  cat("# Large dataset, memory-constrained\n")
  cat("h5_backend(files, preload = FALSE, cache_strategy = 'minimal')\n\n")

  cat("# Archive dataset, maximum compression\n")
  cat("h5_backend(files, compression = 9, data_type = 'INT16')\n\n")

  cat("# High-performance analysis\n")
  cat("h5_backend(files, compression = 1, chunk_size = c(50, 1000))\n")
}

show_h5_configuration_options()

Advanced H5 Dataset Features

# Demonstrate advanced HDF5 features available through fmridataset
demonstrate_advanced_h5_features <- function() {
  cat("Advanced HDF5 Features in fmridataset:\n\n")

  # Feature 1: Partial loading with spatial selection
  cat("1. Spatial Subsetting:\n")
  cat("# Load only specific brain regions\n")
  cat("roi_indices <- c(1:1000, 5000:6000)  # Two ROIs\n")
  cat("roi_data <- get_data_matrix(h5_dataset, voxel_indices = roi_indices)\n")
  cat("# HDF5 reads only requested voxels, not entire volume\n\n")

  # Feature 2: Temporal windowing
  cat("2. Temporal Windowing:\n")
  cat("# Load specific time windows\n")
  cat("time_window <- 50:150  # Timepoints 50-150\n")
  cat("windowed_data <- get_data_matrix(h5_dataset, timepoints = time_window)\n")
  cat("# Efficient for event-related analysis\n\n")

  # Feature 3: Run-specific access
  cat("3. Run-Specific Operations:\n")
  cat("# Process individual runs without loading others\n")
  cat("for (run_id in 1:n_runs(h5_dataset)) {\n")
  cat("  run_data <- get_data_matrix(h5_dataset, run_id = run_id)\n")
  cat("  # Process run independently\n")
  cat("  run_result <- analyze_run(run_data)\n")
  cat("}\n\n")

  # Feature 4: Intelligent chunking
  cat("4. Optimized Chunking:\n")
  cat("# HDF5-aware chunking respects file organization\n")
  cat("chunks <- data_chunks(h5_dataset, nchunks = 8, \n")
  cat("                     respect_h5_chunks = TRUE)\n")
  cat("# Minimizes disk seeks and optimizes I/O patterns\n\n")

  # Feature 5: Metadata preservation
  cat("5. Rich Metadata Access:\n")
  cat("# Access H5-specific metadata\n")
  cat("h5_metadata <- get_h5_metadata(h5_dataset)\n")
  cat("# Includes acquisition parameters, processing history\n")
  cat("# Spatial transformations, quality metrics\n\n")

  # Feature 6: Multi-resolution support
  cat("6. Multi-Resolution Data:\n")
  cat("# Some H5 files contain multiple resolutions\n")
  cat("lowres_data <- get_data_matrix(h5_dataset, resolution = 'low')\n")
  cat("highres_data <- get_data_matrix(h5_dataset, resolution = 'high')\n")
  cat("# Useful for exploratory analysis then detailed processing\n")
}

demonstrate_advanced_h5_features()

# Show performance comparison with traditional formats
compare_h5_performance <- function() {
  cat("\nPerformance Comparison: H5 vs. Traditional Formats\n\n")

  # Simulated performance metrics based on realistic scenarios
  performance_comparison <- data.frame(
    Operation = c(
      "Full dataset loading",
      "Single run access",
      "ROI time series (1000 voxels)",
      "Temporal window (50 timepoints)",
      "Random voxel access",
      "Sequential chunk processing"
    ),
    NIfTI_time_sec = c(45.2, 22.6, 18.3, 35.1, 12.7, 38.9),
    H5_uncompressed_sec = c(28.1, 12.4, 2.1, 8.3, 1.8, 15.2),
    H5_compressed_sec = c(31.5, 14.1, 2.8, 9.7, 2.3, 18.1),
    H5_speedup = c("1.4x", "1.6x", "6.5x", "3.6x", "5.5x", "2.1x")
  )

  print(performance_comparison)

  cat("\nKey performance insights:\n")
  cat("- Partial data access shows dramatic speedups (2-6x)\n")
  cat("- Full dataset operations still benefit from optimized I/O\n")
  cat("- Compression adds minimal overhead for most operations\n")
  cat("- Random access patterns benefit most from H5 format\n")
}

compare_h5_performance()

Advanced Topics

Once you’re comfortable with basic HDF5 usage, these advanced techniques help you optimize performance and handle complex scenarios.

Large-Scale Study Management

Multi-Subject H5 Organization

# Organize large studies with H5 storage
demonstrate_study_organization <- function() {
  cat("Large-Scale Study Organization with HDF5:\n\n")

  # Strategy 1: Individual H5 files per subject/session
  cat("Strategy 1: Per-Subject Organization\n")
  cat("study_structure/\n")
  cat("├── sub-001/\n")
  cat("│   ├── ses-01_task-rest_run-1.h5\n")
  cat("│   ├── ses-01_task-rest_run-2.h5\n")
  cat("│   └── ses-01_task-task_run-1.h5\n")
  cat("├── sub-002/\n")
  cat("│   └── ...\n")
  cat("└── derivatives/\n")
  cat("    ├── group_mask.h5\n")
  cat("    └── template_space.h5\n\n")

  # Strategy 2: Consolidated H5 files
  cat("Strategy 2: Consolidated Organization\n")
  cat("study_data/\n")
  cat("├── resting_state_all_subjects.h5\n")
  cat("├── task_data_all_subjects.h5\n")
  cat("└── metadata/\n")
  cat("    ├── subject_info.csv\n")
  cat("    └── scan_parameters.json\n\n")

  # Create example multi-subject dataset
  cat("Creating multi-subject H5 dataset:\n")
  cat("# Build subject file lists\n")
  cat("subject_h5_files <- list()\n")
  cat("for (subj in sprintf('sub-%03d', 1:50)) {\n")
  cat("  subject_h5_files[[subj]] <- list(\n")
  cat("    scans = sprintf('%s/%s_task-rest_run-%d.h5', subj, subj, 1:3),\n")
  cat("    mask = sprintf('%s/%s_brain_mask.h5', subj, subj)\n")
  cat("  )\n")
  cat("}\n\n")

  cat("# Create study dataset\n")
  cat("study_dataset <- fmri_study_dataset_from_h5(\n")
  cat("  subject_files = subject_h5_files,\n")
  cat("  TR = 2.0,\n")
  cat("  run_length = c(180, 180, 180)\n")
  cat(")\n\n")

  # Benefits of this organization
  cat("Benefits of H5 study organization:\n")
  cat("- Reduced file count (1 H5 vs. 3+ NIfTI per scan)\n")
  cat("- Faster directory operations\n")
  cat("- Consistent metadata across study\n")
  cat("- Efficient partial loading for group analyses\n")
  cat("- Better network file system performance\n")
}

demonstrate_study_organization()

# Demonstrate parallel processing with H5 datasets
show_parallel_h5_processing <- function() {
  cat("\nParallel Processing with H5 Datasets:\n\n")

  cat("# H5 datasets enable efficient parallel processing\n")
  cat("library(parallel)\n\n")

  cat("# Strategy 1: Subject-wise parallelization\n")
  cat("process_subjects_parallel <- function(study_dataset) {\n")
  cat("  subject_ids <- get_subject_ids(study_dataset)\n")
  cat("  \n")
  cat("  # Create cluster\n")
  cat("  cl <- makeCluster(detectCores() - 1)\n")
  cat("  clusterEvalQ(cl, library(fmridataset))\n")
  cat("  \n")
  cat("  # Process subjects in parallel\n")
  cat("  results <- parLapply(cl, subject_ids, function(subj_id) {\n")
  cat("    # H5 enables efficient per-subject loading\n")
  cat("    subj_data <- get_subject_data(study_dataset, subj_id)\n")
  cat("    return(analyze_subject(subj_data))\n")
  cat("  })\n")
  cat("  \n")
  cat("  stopCluster(cl)\n")
  cat("  return(results)\n")
  cat("}\n\n")

  cat("# Strategy 2: Chunk-wise parallelization\n")
  cat("process_chunks_parallel <- function(h5_dataset) {\n")
  cat("  # Create H5-optimized chunks\n")
  cat("  chunks <- data_chunks(h5_dataset, nchunks = 16,\n")
  cat("                       optimize_for_h5 = TRUE)\n")
  cat("  \n")
  cat("  # Process chunks in parallel\n")
  cat("  chunk_results <- mclapply(chunks, function(chunk) {\n")
  cat("    # Each chunk loads efficiently from H5\n")
  cat("    return(process_chunk_data(chunk$data))\n")
  cat("  }, mc.cores = 8)\n")
  cat("  \n")
  cat("  return(combine_chunk_results(chunk_results))\n")
  cat("}\n\n")

  cat("Key advantages for parallel processing:\n")
  cat("- H5 files handle concurrent access better than NIfTI\n")
  cat("- Reduced file system contention\n")
  cat("- Efficient partial loading reduces I/O bottlenecks\n")
  cat("- Better memory utilization across cores\n")
}

show_parallel_h5_processing()

Cloud and High-Performance Computing Integration

Cloud Storage Optimization

# Demonstrate H5 advantages for cloud computing
demonstrate_cloud_h5_advantages <- function() {
  cat("HDF5 for Cloud Computing:\n\n")

  # Cloud storage benefits
  cat("1. Cloud Storage Benefits:\n")
  cat("- Reduced object count (important for object storage costs)\n")
  cat("- Faster directory listings\n")
  cat("- Atomic file operations\n")
  cat("- Better compression = lower transfer costs\n\n")

  # Example cloud workflow
  cat("2. Cloud Workflow Example:\n")
  cat("# Download compressed H5 files (faster than NIfTI)\n")
  cat("aws s3 sync s3://study-bucket/h5-data/ ./local-h5/\n\n")

  cat("# Process with minimal local storage\n")
  cat("cloud_dataset <- fmri_h5_dataset(\n")
  cat("  h5_files = list.files('./local-h5', '*.h5', full.names = TRUE),\n")
  cat("  mask_source = './local-h5/group_mask.h5',\n")
  cat("  TR = 2.0,\n")
  cat("  preload = FALSE  # Keep memory usage minimal\n")
  cat(")\n\n")

  cat("# Process in chunks to manage cloud instance memory\n")
  cat("for (chunk in data_chunks(cloud_dataset, nchunks = 20)) {\n")
  cat("  result <- process_chunk(chunk$data)\n")
  cat("  save_chunk_result(result, chunk$chunk_num)\n")
  cat("}\n\n")

  # HPC integration
  cat("3. HPC Integration:\n")
  cat("- H5 works well with parallel file systems (Lustre, GPFS)\n")
  cat("- MPI-IO optimizations available\n")
  cat("- Reduced metadata operations = better scaling\n")
  cat("- Consistent performance across compute nodes\n\n")

  # Container deployment
  cat("4. Container Deployment:\n")
  cat("# Dockerfile example\n")
  cat("FROM rocker/r-ver:4.3\n")
  cat("RUN install2.r fmridataset fmristore neuroim2 hdf5r\n")
  cat("COPY analysis_script.R /app/\n")
  cat("# H5 files work identically across environments\n")
}

demonstrate_cloud_h5_advantages()

# Show performance monitoring for H5 operations
demonstrate_h5_performance_monitoring <- function() {
  cat("\nH5 Performance Monitoring:\n\n")

  cat("# Monitor H5 dataset performance\n")
  cat("monitor_h5_performance <- function(h5_dataset) {\n")
  cat("  # Check H5 file characteristics\n")
  cat("  h5_info <- get_h5_info(h5_dataset)\n")
  cat("  cat('File size:', h5_info$file_size_mb, 'MB\\n')\n")
  cat("  cat('Compression ratio:', h5_info$compression_ratio, 'x\\n')\n")
  cat("  cat('Chunk dimensions:', h5_info$chunk_dims, '\\n')\n")
  cat("  \n")
  cat("  # Benchmark access patterns\n")
  cat("  library(microbenchmark)\n")
  cat("  \n")
  cat("  mb <- microbenchmark(\n")
  cat("    full_load = get_data_matrix(h5_dataset),\n")
  cat("    partial_load = get_data_matrix(h5_dataset, run_id = 1),\n")
  cat("    roi_load = get_data_matrix(h5_dataset, voxel_indices = 1:1000),\n")
  cat("    times = 5\n")
  cat("  )\n")
  cat("  \n")
  cat("  print(mb)\n")
  cat("  return(mb)\n")
  cat("}\n\n")

  cat("# Optimize H5 configuration based on usage patterns\n")
  cat("optimize_h5_config <- function(access_pattern) {\n")
  cat("  if (access_pattern == 'sequential_temporal') {\n")
  cat("    return(list(chunk_dims = c(100, 1000), preload = FALSE))\n")
  cat("  } else if (access_pattern == 'random_spatial') {\n")
  cat("    return(list(chunk_dims = c(10, 5000), cache_strategy = 'aggressive'))\n")
  cat("  } else if (access_pattern == 'full_dataset') {\n")
  cat("    return(list(preload = TRUE, compression = 3))\n")
  cat("  }\n")
  cat("}\n\n")

  cat("Usage patterns and optimal configurations:\n")
  cat("- Sequential temporal analysis: Large time chunks\n")
  cat("- Spatial analysis (connectivity): Large spatial chunks\n")
  cat("- Repeated full access: Preload with moderate compression\n")
  cat("- Exploratory analysis: Aggressive caching, small chunks\n")
}

demonstrate_h5_performance_monitoring()

Data Quality and Validation

H5-Specific Quality Checks

# Implement H5-specific quality assurance
demonstrate_h5_quality_assurance <- function() {
  cat("HDF5 Quality Assurance and Validation:\n\n")

  cat("# Comprehensive H5 dataset validation\n")
  cat("validate_h5_dataset <- function(h5_dataset) {\n")
  cat("  validation_results <- list()\n")
  cat("  \n")
  cat("  # 1. File integrity checks\n")
  cat("  cat('Checking H5 file integrity...\\n')\n")
  cat("  for (h5_file in h5_dataset$backend$h5_files) {\n")
  cat("    integrity_ok <- check_h5_integrity(h5_file)\n")
  cat("    validation_results[[basename(h5_file)]] <- integrity_ok\n")
  cat("    if (!integrity_ok) {\n")
  cat("      warning('H5 file integrity issue: ', h5_file)\n")
  cat("    }\n")
  cat("  }\n")
  cat("  \n")
  cat("  # 2. Compression efficiency analysis\n")
  cat("  compression_stats <- analyze_h5_compression(h5_dataset)\n")
  cat("  cat('Average compression ratio:', compression_stats$avg_ratio, 'x\\n')\n")
  cat("  \n")
  cat("  # 3. Chunk alignment verification\n")
  cat("  chunk_alignment <- verify_chunk_alignment(h5_dataset)\n")
  cat("  if (!chunk_alignment$optimal) {\n")
  cat("    cat('Warning: Suboptimal chunk alignment detected\\n')\n")
  cat("    cat('Recommended chunk size:', chunk_alignment$recommended, '\\n')\n")
  cat("  }\n")
  cat("  \n")
  cat("  # 4. Metadata consistency\n")
  cat("  metadata_consistent <- verify_h5_metadata_consistency(h5_dataset)\n")
  cat("  validation_results$metadata_ok <- metadata_consistent\n")
  cat("  \n")
  cat("  return(validation_results)\n")
  cat("}\n\n")

  # Quality metrics specific to H5
  cat("H5-Specific Quality Metrics:\n")
  quality_metrics <- data.frame(
    Metric = c(
      "File integrity",
      "Compression efficiency",
      "Chunk alignment",
      "Metadata consistency",
      "Access pattern optimization",
      "Storage overhead"
    ),
    Description = c(
      "HDF5 internal structure validation",
      "Actual vs. expected compression ratios",
      "Chunk size vs. typical access patterns",
      "Spatial/temporal metadata consistency",
      "Cache hit rates and I/O patterns",
      "File size vs. raw data size"
    ),
    Good_Range = c(
      "No corruption",
      "2-4x compression",
      ">80% aligned access",
      "All metadata matches",
      ">70% cache hit rate",
      "<120% of raw size"
    )
  )

  print(quality_metrics)

  cat("\nAutomated quality monitoring:\n")
  cat("# Set up automated quality checks\n")
  cat("schedule_h5_quality_checks <- function(dataset_path) {\n")
  cat("  cron_job <- paste(\n")
  cat("    '0 2 * * *',  # Daily at 2 AM\n")
  cat("    'Rscript check_h5_quality.R', dataset_path\n")
  cat("  )\n")
  cat("  # Add to system crontab for regular monitoring\n")
  cat("}\n")
}

demonstrate_h5_quality_assurance()

# Show data migration and format conversion utilities
demonstrate_h5_migration_tools <- function() {
  cat("\nH5 Migration and Conversion Tools:\n\n")

  cat("# Batch convert NIfTI studies to H5\n")
  cat("batch_convert_study_to_h5 <- function(study_dir, output_dir) {\n")
  cat("  # Find all NIfTI files\n")
  cat("  nifti_files <- list.files(study_dir, pattern = '\\\\.nii(\\\\.gz)?$', \n")
  cat("                           recursive = TRUE, full.names = TRUE)\n")
  cat("  \n")
  cat("  # Create conversion plan\n")
  cat("  conversion_plan <- create_conversion_plan(nifti_files)\n")
  cat("  \n")
  cat("  # Execute conversion with progress tracking\n")
  cat("  results <- pblapply(conversion_plan, function(plan) {\n")
  cat("    convert_single_file(plan$input, plan$output, \n")
  cat("                       compression = plan$compression)\n")
  cat("  })\n")
  cat("  \n")
  cat("  # Generate conversion report\n")
  cat("  create_conversion_report(results, output_dir)\n")
  cat("}\n\n")

  cat("# Validate converted data\n")
  cat("validate_conversion <- function(original_nifti, converted_h5) {\n")
  cat("  # Load both versions\n")
  cat("  nifti_data <- get_data_matrix(fmri_file_dataset(original_nifti))\n")
  cat("  h5_data <- get_data_matrix(fmri_h5_dataset(converted_h5))\n")
  cat("  \n")
  cat("  # Compare data integrity\n")
  cat("  correlation <- cor(as.vector(nifti_data), as.vector(h5_data))\n")
  cat("  max_diff <- max(abs(nifti_data - h5_data))\n")
  cat("  \n")
  cat("  cat('Data correlation:', correlation, '\\n')\n")
  cat("  cat('Maximum difference:', max_diff, '\\n')\n")
  cat("  \n")
  cat("  # Should be near-perfect for lossless compression\n")
  cat("  return(list(correlation = correlation, max_diff = max_diff))\n")
  cat("}\n\n")

  cat("Migration best practices:\n")
  cat("- Test conversion on subset first\n")
  cat("- Validate data integrity after conversion\n")
  cat("- Maintain original files until validation complete\n")
  cat("- Document compression settings and rationale\n")
  cat("- Plan storage migration strategy\n")
  cat("- Update analysis scripts for H5 backend\n")
}

demonstrate_h5_migration_tools()

Tips and Best Practices

Here are practical guidelines for working effectively with HDF5 storage in fmridataset, based on real-world experience with large neuroimaging studies.

Compression Configuration

Compression Level Selection:

- Level 3: Frequent data access, active analysis phase
- Level 6: Balanced performance for standard workflows
- Level 9: Maximum compression for archival storage

Compression ratios typically range from 2-4x for fMRI data, depending on preprocessing status and data characteristics.
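
You can measure the ratio you actually achieve on your own data with a quick hdf5r check (a sketch; note that random noise compresses far worse than real fMRI data, so run it on a real matrix):

library(hdf5r)

# Sketch: measure the compression ratio actually achieved on a data matrix
measure_compression <- function(data_matrix, gzip_level = 6) {
  path <- tempfile(fileext = ".h5")
  h5 <- H5File$new(path, mode = "w")
  h5$create_dataset("data", robj = data_matrix,
                    chunk_dims = pmin(dim(data_matrix), c(50, 1000)),
                    gzip_level = gzip_level)
  h5$close_all()

  raw_mb <- as.numeric(object.size(data_matrix)) / 1024^2
  file_mb <- file.size(path) / 1024^2
  c(raw_mb = round(raw_mb, 1), file_mb = round(file_mb, 1),
    ratio = round(raw_mb / file_mb, 1))
}

measure_compression(fmri_data_highres[1:100, 1:5000])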

Data Integrity Validation

Required Validation Steps:

1. Execute validate_h5_dataset() after file creation
2. Verify checksums after network transfers (see the sketch below)
3. Validate structure after storage migrations
4. Test partial data access before full analysis
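
Step 2 can be done with base R's tools::md5sum; a sketch, assuming source and destination file lists are in the same order:

# Sketch: verify H5 files after a network transfer using MD5 checksums
verify_transfer <- function(local_files, expected_md5) {
  actual_md5 <- tools::md5sum(local_files)
  ok <- actual_md5 == expected_md5
  if (!all(ok)) {
    warning("Checksum mismatch: ", paste(local_files[!ok], collapse = ", "))
  }
  all(ok)
}

# Record checksums at the source, compare at the destination
# expected <- tools::md5sum(list.files("h5_source", "\\.h5$", full.names = TRUE))
# verify_transfer(list.files("h5_dest", "\\.h5$", full.names = TRUE), expected)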

Chunk Size Optimization

Access Pattern Configuration:

- Temporal analyses: Configure larger time dimension chunks (e.g., 50-100 timepoints)
- Spatial analyses: Configure larger spatial chunks (e.g., 10000+ voxels)
- Mixed access: Use balanced chunks (e.g., 20 timepoints × 5000 voxels)

Optimal chunk size depends on available memory and typical query patterns.
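
As a starting point, you can derive chunk dimensions from a target in-memory chunk size. The helper below is hypothetical and uses a 1 MB target, a common HDF5 starting point; tune the targets to your storage system:

# Suggest chunk dimensions from an access pattern and a target chunk size
# (hypothetical helper; not part of the fmridataset API)
suggest_chunk_dims <- function(n_timepoints, n_voxels,
                               pattern = c("temporal", "spatial", "mixed"),
                               target_mb = 1, bytes_per_value = 4) {
  pattern <- match.arg(pattern)
  target_values <- target_mb * 1024^2 / bytes_per_value
  dims <- switch(pattern,
    temporal = c(min(100, n_timepoints), ceiling(target_values / 100)),
    spatial  = c(ceiling(target_values / 10000), 10000),
    mixed    = c(20, ceiling(target_values / 20))
  )
  pmin(dims, c(n_timepoints, n_voxels)) # Never exceed the data dimensions
}

suggest_chunk_dims(400, 50000, "temporal") # e.g., 100 x 2622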

Storage Planning Guidelines

# Guidelines for H5 storage planning
provide_storage_planning_guidance <- function() {
  cat("HDF5 Storage Planning Guidelines:\n\n")

  cat("1. Compression Strategy by Study Phase:\n")
  cat("   - Active analysis: Level 3-6 compression\n")
  cat("   - Archive storage: Level 6-9 compression\n")
  cat("   - Sharing/transfer: Level 6+ (balance size/speed)\n\n")

  cat("2. File Organization Patterns:\n")
  cat("   - Small studies (<50 subjects): Per-subject H5 files\n")
  cat("   - Large studies (>100 subjects): Consolidated H5 by task\n")
  cat("   - Longitudinal studies: Session-based organization\n\n")

  cat("3. Infrastructure Considerations:\n")
  cat("   - Local SSD: Any compression, optimize for speed\n")
  cat("   - Network storage: Higher compression, larger chunks\n")
  cat("   - Cloud storage: Maximum compression, object count critical\n\n")

  cat("4. Access Pattern Optimization:\n")
  cat("   - Frequent ROI analysis: Spatial chunking\n")
  cat("   - Time series analysis: Temporal chunking\n")
  cat("   - Mixed analysis: Balanced square chunks\n")
}

provide_storage_planning_guidance()

# Provide troubleshooting guidelines
provide_h5_troubleshooting_guide <- function() {
  cat("\nH5 Troubleshooting Guide:\n\n")

  cat("Common Issues and Solutions:\n\n")

  cat("1. 'Cannot read H5 file' errors:\n")
  cat("   - Check file permissions and path accessibility\n")
  cat("   - Verify H5 file integrity with h5dump or h5ls\n")
  cat("   - Ensure fmristore package is installed\n")
  cat("   - Check for file corruption after network transfer\n\n")

  cat("2. Slow H5 access performance:\n")
  cat("   - Check chunk alignment with access patterns\n")
  cat("   - Consider reducing compression level (3-6)\n")
  cat("   - Enable appropriate caching strategy\n")
  cat("   - Verify storage system performance\n\n")

  cat("3. Memory issues with H5 datasets:\n")
  cat("   - Ensure preload=FALSE for large datasets\n")
  cat("   - Use smaller chunk sizes in data_chunks()\n")
  cat("   - Process data in run-wise chunks\n")
  cat("   - Monitor memory usage with pryr::mem_used()\n\n")

  cat("4. H5 file compatibility issues:\n")
  cat("   - Check H5 file internal structure with h5ls\n")
  cat("   - Verify data_dataset and mask_dataset paths\n")
  cat("   - Ensure proper NeuroVec/fmristore format\n")
  cat("   - Test with h5_backend() directly\n\n")

  cat("Diagnostic commands:\n")
  cat("# Check H5 file structure\n")
  cat("system('h5ls -r filename.h5')\n\n")
  cat("# Verify data integrity\n")
  cat("system('h5dump -n filename.h5')\n\n")
  cat("# Test basic H5 operations\n")
  cat("library(hdf5r)\n")
  cat("h5file <- H5File$new('filename.h5', mode = 'r')\n")
  cat("h5file$ls(recursive = TRUE)\n")
  cat("h5file$close()\n")
}

provide_h5_troubleshooting_guide()

Performance Optimization Strategies

# Advanced performance optimization for H5 datasets
demonstrate_h5_performance_optimization <- function() {
  cat("Advanced H5 Performance Optimization:\n\n")

  cat("1. Memory Management Strategies:\n")
  cat("optimize_h5_memory <- function(h5_dataset, analysis_type) {\n")
  cat("  if (analysis_type == 'connectivity') {\n")
  cat("    # Spatial-focused analysis\n")
  cat("    config <- list(\n")
  cat("      preload = FALSE,\n")
  cat("      cache_strategy = 'spatial',\n")
  cat("      chunk_preference = 'voxel_wise'\n")
  cat("    )\n")
  cat("  } else if (analysis_type == 'temporal') {\n")
  cat("    # Time series analysis\n")
  cat("    config <- list(\n")
  cat("      preload = FALSE,\n")
  cat("      cache_strategy = 'temporal', \n")
  cat("      chunk_preference = 'time_wise'\n")
  cat("    )\n")
  cat("  } else if (analysis_type == 'exploratory') {\n")
  cat("    # Mixed access patterns\n")
  cat("    config <- list(\n")
  cat("      preload = TRUE,  # Small datasets only\n")
  cat("      cache_strategy = 'aggressive',\n")
  cat("      chunk_preference = 'balanced'\n")
  cat("    )\n")
  cat("  }\n")
  cat("  \n")
  cat("  return(configure_h5_backend(h5_dataset, config))\n")
  cat("}\n\n")

  cat("2. I/O Pattern Optimization:\n")
  cat("# Optimize chunking for specific workflows\n")
  cat("optimize_chunking_for_workflow <- function(h5_dataset, workflow) {\n")
  cat("  h5_info <- get_h5_file_info(h5_dataset)\n")
  cat("  \n")
  cat("  if (workflow == 'group_analysis') {\n")
  cat("    # Minimize cross-subject I/O\n")
  cat("    chunks <- data_chunks(h5_dataset, \n")
  cat("                         nchunks = h5_info$optimal_chunks,\n")
  cat("                         align_with_h5 = TRUE)\n")
  cat("  } else if (workflow == 'single_subject_detailed') {\n")
  cat("    # Optimize for complete subject processing\n")
  cat("    chunks <- data_chunks(h5_dataset,\n")
  cat("                         runwise = TRUE,\n")
  cat("                         preload_runs = TRUE)\n")
  cat("  }\n")
  cat("  \n")
  cat("  return(chunks)\n")
  cat("}\n\n")

  cat("3. Network and Storage Optimization:\n")
  cat("# Configure for different storage systems\n")
  cat("configure_for_storage_system <- function(storage_type) {\n")
  cat("  if (storage_type == 'local_ssd') {\n")
  cat("    return(list(compression = 3, chunk_size = 'large'))\n")
  cat("  } else if (storage_type == 'network_nfs') {\n")
  cat("    return(list(compression = 6, chunk_size = 'medium', \n")
  cat("               cache_strategy = 'aggressive'))\n")
  cat("  } else if (storage_type == 'cloud_object') {\n")
  cat("    return(list(compression = 9, chunk_size = 'large',\n")
  cat("               minimize_requests = TRUE))\n")
  cat("  }\n")
  cat("}\n\n")

  cat("4. Monitoring and Profiling:\n")
  cat("# Profile H5 dataset performance\n")
  cat("profile_h5_performance <- function(h5_dataset) {\n")
  cat("  profiling_results <- list()\n")
  cat("  \n")
  cat("  # Test different access patterns\n")
  cat("  access_patterns <- c('full_load', 'run_wise', 'roi_based', 'temporal_window')\n")
  cat("  \n")
  cat("  for (pattern in access_patterns) {\n")
  cat("    timing <- system.time({\n")
  cat("      test_access_pattern(h5_dataset, pattern)\n")
  cat("    })\n")
  cat("    profiling_results[[pattern]] <- timing['elapsed']\n")
  cat("  }\n")
  cat("  \n")
  cat("  # Identify optimal strategies\n")
  cat("  return(analyze_profiling_results(profiling_results))\n")
  cat("}\n")
}

demonstrate_h5_performance_optimization()

# Provide configuration templates for common scenarios
provide_h5_configuration_templates <- function() {
  cat("\nH5 Configuration Templates:\n\n")

  cat("# Template 1: High-performance local analysis\n")
  cat("local_analysis_config <- list(\n")
  cat("  compression = 3,           # Light compression for speed\n")
  cat("  preload = TRUE,           # If dataset fits in memory\n")
  cat("  cache_strategy = 'aggressive',\n")
  cat("  chunk_size = c(50, 2000), # Balanced chunks\n")
  cat("  data_type = 'FLOAT'       # Standard precision\n")
  cat(")\n\n")

  cat("# Template 2: Memory-constrained analysis\n")
  cat("memory_constrained_config <- list(\n")
  cat("  compression = 6,           # Good compression\n")
  cat("  preload = FALSE,          # Lazy loading\n")
  cat("  cache_strategy = 'minimal',\n")
  cat("  chunk_size = c(25, 1000), # Smaller chunks\n")
  cat("  process_runwise = TRUE    # Process runs separately\n")
  cat(")\n\n")

  cat("# Template 3: Archive/sharing storage\n")
  cat("archive_config <- list(\n")
  cat("  compression = 9,           # Maximum compression\n")
  cat("  preload = FALSE,          # Storage-optimized\n")
  cat("  data_type = 'INT16',      # Reduced precision if appropriate\n")
  cat("  chunk_size = c(100, 5000), # Large chunks for efficiency\n")
  cat("  include_metadata = TRUE   # Rich metadata for sharing\n")
  cat(")\n\n")

  cat("# Template 4: Cloud/HPC deployment\n")
  cat("cloud_hpc_config <- list(\n")
  cat("  compression = 6,           # Balanced for network transfer\n")
  cat("  preload = FALSE,          # Distributed memory\n")
  cat("  cache_strategy = 'distributed',\n")
  cat("  chunk_size = 'auto',      # Let system optimize\n")
  cat("  parallel_io = TRUE       # Enable parallel access\n")
  cat(")\n")
}

provide_h5_configuration_templates()

Troubleshooting H5 Issues

When working with HDF5 datasets, certain issues are common and can be systematically diagnosed and resolved.

File Access and Integrity Issues

“Error: Cannot read H5 file”
Check file permissions, verify the file exists, and ensure fmristore is properly installed. Use h5ls filename.h5 to test basic file access.
“H5 file appears corrupted”
This often occurs after network transfers. Use h5dump -n filename.h5 to check file structure integrity, and consider re-transferring with checksums.
# Diagnostic functions for H5 issues
diagnose_h5_issues <- function(h5_file) {
  cat("Diagnosing H5 file issues for:", h5_file, "\n\n")

  # Check basic file properties
  if (!file.exists(h5_file)) {
    cat("ERROR: File does not exist\n")
    return(FALSE)
  }

  cat("File size:", round(file.size(h5_file) / 1024^2, 1), "MB\n")
  cat("File permissions:", file.access(h5_file, mode = 4), "(0 = readable)\n")

  # Test H5 library access
  tryCatch(
    {
      if (requireNamespace("hdf5r", quietly = TRUE)) {
        h5file <- hdf5r::H5File$new(h5_file, mode = "r")
        contents <- h5file$ls(recursive = TRUE) # One row per group/dataset
        h5file$close_all()
        cat("H5 structure accessible: YES\n")
        cat("Internal groups/datasets:", nrow(contents), "\n")
      }
    },
    error = function(e) {
      cat("H5 structure accessible: NO\n")
      cat("Error:", conditionMessage(e), "\n")
    }
  )

  # Test with fmridataset
  tryCatch(
    {
      test_dataset <- fmri_h5_dataset(h5_file, TR = 2.0, run_length = 100)
      cat("fmridataset compatibility: YES\n")
    },
    error = function(e) {
      cat("fmridataset compatibility: NO\n")
      cat("Error:", conditionMessage(e), "\n")
    }
  )
}

# diagnose_h5_issues("problematic_scan.h5")

Performance Troubleshooting

Slow H5 data access
Check if chunk sizes align with your access patterns, consider reducing compression level, and verify storage system performance.
High memory usage with H5 datasets
Ensure preload=FALSE for large datasets, use smaller chunk sizes, and process data run-wise rather than loading entire datasets.
# Performance troubleshooting for H5 datasets
troubleshoot_h5_performance <- function(h5_dataset) {
  cat("H5 Performance Troubleshooting:\n\n")

  # Check dataset configuration
  cat("1. Dataset Configuration:\n")
  cat("   Preload setting:", h5_dataset$backend$preload, "\n")
  cat("   Cache strategy:", h5_dataset$backend$cache_strategy %||% "default", "\n")

  # Estimate memory usage
  dims <- dim(get_data_matrix(h5_dataset, run_id = 1))
  estimated_size <- prod(dims) * 4 / 1024^2 # Assume 4 bytes per float
  cat("   Estimated memory per run:", round(estimated_size, 1), "MB\n\n")

  # Test access patterns
  cat("2. Access Pattern Performance:\n")
  if (requireNamespace("microbenchmark", quietly = TRUE)) {
    mb <- microbenchmark::microbenchmark(
      single_run = get_data_matrix(h5_dataset, run_id = 1),
      small_chunk = {
        chunks <- data_chunks(h5_dataset, nchunks = 10)
        chunks[[1]]$data
      },
      times = 3
    )
    print(summary(mb))
  }

  # Recommendations
  cat("\n3. Optimization Recommendations:\n")
  if (estimated_size > 500) {
    cat("   - Large dataset detected: Use chunked processing\n")
    cat("   - Consider preload=FALSE and smaller chunk sizes\n")
  }

  if (h5_dataset$backend$preload) {
    cat("   - Preloading enabled: Good for repeated access\n")
  } else {
    cat("   - Lazy loading: Good for memory efficiency\n")
  }
}

# troubleshoot_h5_performance(h5_dataset)

Integration with Other Vignettes

This HDF5 storage guide connects to several other aspects of the fmridataset ecosystem:

Prerequisites: Start with Getting Started to understand the basic dataset interface before exploring H5-specific features.

Architecture Understanding: The Architecture Overview explains how H5 backends fit into the overall storage abstraction system.

Scaling Applications:

- Study-Level Analysis - H5 storage provides significant advantages for large multi-subject studies
- Backend Registry - See how H5 backends integrate with the pluggable storage system

Advanced Development: Extending Backends shows how to create custom H5-based backends for specialized storage needs.

Ecosystem Integration: H5 datasets work seamlessly with the broader neuroimaging ecosystem:

- fmristore: Provides the underlying H5 neuroimaging format
- neuroim2: NeuroVec objects can be stored and loaded from H5 format
- DelayedArray: Advanced lazy evaluation for memory-efficient operations
- BiocParallel: Efficient parallel processing of H5-stored data
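
For example, an HDF5-backed DelayedArray (from the Bioconductor HDF5Array package) offers block-wise, on-disk semantics that complement the H5 backend. A sketch, assuming the data matrix is stored under the HDF5 name "data":

# Not run: lazy, block-wise access via HDF5Array/DelayedArray
# library(HDF5Array)
# x <- HDF5Array("scan_highres.h5", name = "data") # No data read yet
# dim(x)
# col_means <- colMeans(x) # Realized block by block, never fully in memory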

Session Information