Introduction to fmridataset
Bradley Buchsbaum
2025-06-08
Source: vignettes/fmridataset-intro.Rmd
Introduction
The fmridataset package provides a unified S3 class system for representing functional magnetic resonance imaging (fMRI) data from different sources. The package supports data from raw matrices, NIfTI files, and pre-loaded NeuroVec objects, with a consistent interface for data access and manipulation.
Key Features
- Unified Interface: Work with data from matrices, NIfTI files, or NeuroVec objects
- Lazy Loading: File-based datasets load data only when accessed
- Flexible Data Access: Get data for entire datasets or specific runs
- Temporal Structure: Rich sampling frame representation with TR and run information
- Data Chunking: Built-in support for parallel processing workflows
- Event Integration: Support for experimental event tables
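Because the interface is unified, the same generic accessors apply regardless of how the data are stored. A minimal sketch, using only constructors and accessors demonstrated later in this vignette:
library(fmridataset)
# The same accessors work on a matrix-, file-, or memory-backed dataset
ds <- matrix_dataset(matrix(rnorm(200), 20, 10), TR = 2, run_length = 20)
dim(get_data_matrix(ds)) # 20 timepoints x 10 voxels
n_timepoints(ds$sampling_frame) # 20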
Dataset Types
The package provides four dataset types:
- matrix_dataset: For data already loaded as R matrices
- fmri_mem_dataset: For pre-loaded NeuroVec objects
- fmri_file_dataset: For NIfTI files (lazy loading)
- latent_dataset: For dimension-reduced data (requires the fmristore package)
Creating fMRI Dataset Objects
Matrix Dataset
The simplest way to create an fmri_dataset is from a matrix:
library(fmridataset)
# Create synthetic fMRI data
set.seed(123)
n_timepoints <- 200
n_voxels <- 1000
fmri_matrix <- matrix(rnorm(n_timepoints * n_voxels),
  nrow = n_timepoints,
  ncol = n_voxels
)
# Create dataset with temporal structure
dataset <- matrix_dataset(
  datamat = fmri_matrix,
  TR = 2.0, # 2-second repetition time
  run_length = c(100, 100) # Two runs of 100 timepoints each
)
print(dataset)
From NIfTI Files
For real fMRI data stored as NIfTI files:
# Paths to your NIfTI files (one per run)
file_paths <- c(
  "/path/to/run1.nii.gz",
  "/path/to/run2.nii.gz"
)
# Path to brain mask
mask_path <- "/path/to/mask.nii.gz"
# Create dataset from files
dataset <- fmri_dataset(
  scans = file_paths,
  mask = mask_path,
  TR = 2.5,
  run_length = c(180, 180) # 180 timepoints per run
)
# Data is loaded only when accessed
print(dataset)
From NeuroVec Objects
If you have pre-loaded NeuroVec objects:
# Create example NeuroVec objects
d <- c(10, 10, 10, 100) # 10x10x10 voxels, 100 timepoints
nvec1 <- neuroim2::NeuroVec(array(rnorm(prod(d)), d),
  space = neuroim2::NeuroSpace(d)
)
nvec2 <- neuroim2::NeuroVec(array(rnorm(prod(d)), d),
  space = neuroim2::NeuroSpace(d)
)
# Create a mask
mask_dim <- d[1:3]
mask <- neuroim2::NeuroVol(array(1, mask_dim),
  space = neuroim2::NeuroSpace(mask_dim)
)
# Create dataset from NeuroVec objects
dataset <- fmri_mem_dataset(
  scans = list(nvec1, nvec2),
  mask = mask,
  TR = 2.0,
  run_length = c(100, 100)
)
print(dataset)
With Event Tables
Add experimental design information:
# Create event table
events <- data.frame(
  onset = c(10, 30, 50, 70, 110, 130, 150, 170),
  duration = rep(2, 8),
  trial_type = rep(c("stimulus_A", "stimulus_B"), 4),
  run = rep(1:2, each = 4)
)
# Create dataset with events
dataset <- matrix_dataset(
  datamat = fmri_matrix,
  TR = 2.0,
  run_length = c(100, 100),
  event_table = events
)
print(dataset)
Accessing Data and Metadata
Basic Data Access
# Get the complete data matrix
data_matrix <- get_data_matrix(dataset)
cat("Data dimensions:", dim(data_matrix), "\n")
# Get data for specific runs
run1_data <- get_data_matrix(dataset, run_id = 1)
cat("Run 1 dimensions:", dim(run1_data), "\n")
run12_data <- get_data_matrix(dataset, run_id = c(1, 2))
cat("Runs 1-2 dimensions:", dim(run12_data), "\n")
Alternative Data Access
# Using get_data() method (returns different format depending on dataset type)
data_obj <- get_data(dataset)
# For matrix datasets, this is equivalent to get_data_matrix
# For file datasets, this returns NeuroVec objects
# For mem datasets, this returns the original NeuroVec objects
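For the matrix dataset created above, this equivalence can be checked directly; the following should report TRUE:
# Sanity check: for matrix datasets the two accessors agree
all.equal(get_data(dataset), get_data_matrix(dataset))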
Temporal Structure
# Access sampling frame
sf <- dataset$sampling_frame
print(sf)
# Get temporal properties using sampling frame methods
cat("TR:", get_TR(sf), "seconds\n")
cat("Number of runs:", n_runs(sf), "\n")
cat("Total timepoints:", n_timepoints(sf), "\n")
cat("Run lengths:", get_run_lengths(sf), "\n")
# Get block information
cat("Block IDs:", head(blockids(sf)), "...\n")
cat("Block lengths:", blocklens(sf), "\n")
Data Chunking for Parallel Processing
The data_chunks() function enables efficient processing of large datasets:
# Create chunks for parallel processing
chunks <- data_chunks(dataset, nchunks = 4)
# Process each chunk
for (chunk in chunks) {
  cat(
    "Processing chunk", chunk$chunk_num,
    "with", ncol(chunk$data), "voxels and",
    nrow(chunk$data), "timepoints\n"
  )
  # Your analysis here...
  # result <- your_analysis_function(chunk$data)
}
Chunking Strategies
# Chunk by runs (one chunk per run)
run_chunks <- data_chunks(dataset, runwise = TRUE)
cat("Number of run chunks:", run_chunks$nchunks, "\n")
# Chunk by voxels (default)
voxel_chunks <- data_chunks(dataset, nchunks = 10)
cat("Number of voxel chunks:", voxel_chunks$nchunks, "\n")
# Single chunk (all data)
single_chunk <- data_chunks(dataset, nchunks = 1)
cat("Single chunk with all data\n")
Using with foreach
# Parallel processing with foreach
if (requireNamespace("foreach", quietly = TRUE)) {
  library(foreach)
  # Create chunks
  chunks <- data_chunks(dataset, nchunks = 4)
  # Process in parallel (or sequentially with %do%)
  results <- foreach(chunk = chunks) %do% {
    # Compute mean time series for this chunk
    colMeans(chunk$data)
  }
  cat("Processed", length(results), "chunks\n")
  cat("Each result has", length(results[[1]]), "values\n")
}
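The same pattern scales to true parallel execution once a backend is registered. A sketch, assuming the doParallel package is installed:
# Register a parallel backend, then swap %do% for %dopar%
if (requireNamespace("doParallel", quietly = TRUE)) {
  doParallel::registerDoParallel(cores = 2)
  chunks <- data_chunks(dataset, nchunks = 4)
  results <- foreach(chunk = chunks) %dopar% colMeans(chunk$data)
  doParallel::stopImplicitCluster()
}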
Type Conversions
Converting Between Dataset Types
# Convert any dataset to matrix format
matrix_version <- as.matrix_dataset(dataset)
cat(
  "Converted to matrix dataset with",
  ncol(matrix_version$datamat), "voxels\n"
)
# This loads all data into memory for file-based datasets
# Use with caution for large datasets
Advanced Examples
Working with Sampling Frames
# Create custom sampling frame
sf <- sampling_frame(run_lengths = c(150, 120, 180), TR = 1.5)
# Explore sampling frame properties
cat("Total duration:", get_total_duration(sf), "seconds\n")
cat("Run durations:", get_run_duration(sf), "seconds\n")
cat("Global onsets:", global_onsets(sf), "\n")
# Get sample indices for each run
sample_indices <- samples(sf)
cat("Samples for run 1:", head(sample_indices[[1]]), "...\n")
Custom Analysis Pipeline
# Define a simple analysis function
analyze_chunk <- function(chunk_data) {
  # Compute voxel-wise variance
  voxel_var <- apply(chunk_data, 2, var)
  # Compute mean pairwise correlation between voxel time series
  if (ncol(chunk_data) > 1) {
    corr_matrix <- cor(chunk_data)
    mean_corr <- mean(corr_matrix[upper.tri(corr_matrix)])
  } else {
    mean_corr <- NA
  }
  list(
    n_voxels = ncol(chunk_data),
    n_timepoints = nrow(chunk_data),
    mean_variance = mean(voxel_var),
    mean_correlation = mean_corr
  )
}
# Apply to chunks
chunks <- data_chunks(dataset, nchunks = 5)
results <- list()
for (chunk in chunks) {
  results[[chunk$chunk_num]] <- analyze_chunk(chunk$data)
}
# Summarize results
for (i in seq_along(results)) {
  r <- results[[i]]
  cat(
    "Chunk", i, ":", r$n_voxels, "voxels, mean var =",
    round(r$mean_variance, 3), "\n"
  )
}
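The per-chunk summaries can also be collected into a single data frame for easier inspection:
# Bind the per-chunk summary lists into one data frame
summary_df <- do.call(rbind, lapply(results, as.data.frame))
print(summary_df)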
Memory Management for Large Datasets
# For large file-based datasets, use chunking to manage memory
large_dataset_analysis <- function(dataset, analysis_func, nchunks = 10) {
  chunks <- data_chunks(dataset, nchunks = nchunks)
  results <- list()
  for (chunk in chunks) {
    # Process chunk
    chunk_result <- analysis_func(chunk$data)
    results[[chunk$chunk_num]] <- chunk_result
    # Optionally save intermediate results
    # saveRDS(chunk_result, paste0("chunk_", chunk$chunk_num, ".rds"))
    cat("Completed chunk", chunk$chunk_num, "/", chunks$nchunks, "\n")
  }
  return(results)
}
# Use the function
# results <- large_dataset_analysis(dataset, analyze_chunk)
Best Practices
Performance Tips
- Use appropriate chunk sizes: Balance memory usage and processing overhead
- Lazy loading: Keep file-based datasets as files until data is needed
- Run-wise processing: Use runwise = TRUE for run-specific analyses
- Memory monitoring: Monitor memory usage with large datasets (see the sketch after the examples below)
# Good: Process runs separately for run-specific analyses
run_means <- list()
run_chunks <- data_chunks(dataset, runwise = TRUE)
for (chunk in run_chunks) {
  run_means[[chunk$chunk_num]] <- colMeans(chunk$data)
}
# Good: Use appropriate chunk size for memory constraints
reasonable_chunks <- data_chunks(dataset, nchunks = 20)
# Caution: Converting large file datasets to matrix loads all data
# matrix_version <- as.matrix_dataset(very_large_file_dataset) # May use lots of RAM
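For the memory-monitoring tip, base R provides a rough view of object sizes and memory in use:
# Rough memory monitoring with base R
print(object.size(get_data_matrix(dataset)), units = "MB") # size of loaded data
gc() # report (and trigger) garbage collection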
Data Validation
# Check dataset integrity
validate_basic <- function(dataset) {
  # Check that run lengths match data
  sf <- dataset$sampling_frame
  total_timepoints <- n_timepoints(sf)
  if ("datamat" %in% names(dataset)) {
    actual_timepoints <- nrow(dataset$datamat)
    if (total_timepoints != actual_timepoints) {
      warning("Mismatch between sampling frame and data dimensions")
    }
  }
  # Check event table
  if (nrow(dataset$event_table) > 0) {
    max_onset <- max(dataset$event_table$onset)
    total_duration <- get_total_duration(sf)
    if (max_onset > total_duration) {
      warning("Events extend beyond scan duration")
    }
  }
  cat("Dataset validation completed\n")
}
# validate_basic(dataset)
Reproducibility
# Document dataset creation parameters
dataset_info <- list(
  creation_date = Sys.time(),
  r_version = R.version.string,
  package_version = packageVersion("fmridataset"),
  dataset_type = class(dataset)[1],
  n_runs = n_runs(dataset$sampling_frame),
  TR = get_TR(dataset$sampling_frame),
  n_timepoints = n_timepoints(dataset$sampling_frame)
)
# Store with dataset or save separately
# saveRDS(dataset_info, "dataset_metadata.rds")
cat("Dataset info recorded for reproducibility\n")
Integration with Other Packages
NeuroVec Compatibility
# fmridataset works seamlessly with neuroim2
# File datasets return NeuroVec objects from get_data()
# Memory datasets store NeuroVec objects directly
# Example: Convert matrix dataset to NeuroVec
if (inherits(dataset, "matrix_dataset")) {
  # Reshape matrix data to a 4D array (x, y, z, time)
  # This is a simplified example - real use would preserve actual geometry
  n_voxels <- ncol(dataset$datamat)
  n_timepoints <- nrow(dataset$datamat)
  if (n_voxels == 1000) { # Our example case: 10 x 10 x 10 voxels
    arr_4d <- array(t(dataset$datamat), c(10, 10, 10, n_timepoints))
    space_4d <- neuroim2::NeuroSpace(c(10, 10, 10, n_timepoints))
    nvec <- neuroim2::NeuroVec(arr_4d, space_4d)
    cat("Created NeuroVec with dimensions:", dim(nvec), "\n")
  }
}
Analysis Workflows
# Example: Simple GLM analysis
simple_glm_analysis <- function(dataset) {
  # Get design matrix from events
  events <- dataset$event_table
  if (nrow(events) == 0) {
    stop("No events found for GLM analysis")
  }
  # Create simple design matrix (stimulus A vs B)
  sf <- dataset$sampling_frame
  n_timepoints <- n_timepoints(sf)
  TR <- get_TR(sf)
  design_matrix <- matrix(0, nrow = n_timepoints, ncol = 2)
  colnames(design_matrix) <- c("stimulus_A", "stimulus_B")
  # Fill design matrix based on events
  for (i in seq_len(nrow(events))) {
    onset_tr <- round(events$onset[i] / TR) + 1
    if (onset_tr <= n_timepoints) {
      if (events$trial_type[i] == "stimulus_A") {
        design_matrix[onset_tr, 1] <- 1
      } else if (events$trial_type[i] == "stimulus_B") {
        design_matrix[onset_tr, 2] <- 1
      }
    }
  }
  # Fit GLM using chunking for memory efficiency
  chunks <- data_chunks(dataset, nchunks = 5)
  all_betas <- list()
  for (chunk in chunks) {
    data_chunk <- chunk$data
    # Fit GLM for each voxel in chunk
    n_voxels <- ncol(data_chunk)
    betas <- matrix(NA, nrow = 2, ncol = n_voxels)
    for (v in seq_len(n_voxels)) {
      y <- data_chunk[, v]
      fit <- lm(y ~ design_matrix - 1) # No intercept; use design matrix directly
      betas[, v] <- coef(fit)
    }
    all_betas[[chunk$chunk_num]] <- betas
  }
  # Combine results
  final_betas <- do.call(cbind, all_betas)
  rownames(final_betas) <- colnames(design_matrix)
  return(final_betas)
}
# Apply analysis (if dataset has events)
# if (nrow(dataset$event_table) > 0) {
#   betas <- simple_glm_analysis(dataset)
#   cat("GLM analysis completed. Beta dimensions:", dim(betas), "\n")
# }
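As a design note, the inner per-voxel loop is written for clarity. Since lm() accepts a matrix response, each chunk could instead be fit in a single call:
# Equivalent vectorized fit: one lm() call per chunk instead of per voxel
# betas <- coef(lm(data_chunk ~ design_matrix - 1))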
Conclusion
The fmridataset package provides a clean, consistent interface for working with fMRI data in R. Key advantages include:
- Unified API across different data sources (matrices, files, NeuroVec objects)
- Efficient memory usage through lazy loading and chunking
- Rich temporal modeling with sampling frames
- Flexible data access patterns
- Integration with the neuroim2 ecosystem
The package is designed to be simple yet powerful, providing essential functionality for fMRI data management without unnecessary complexity. It serves as a solid foundation for building more sophisticated analysis pipelines.
For more detailed information about specific functions, see the package documentation with help(package = "fmridataset").