Introduction to fmridataset
Bradley Buchsbaum
2025-06-08
Source: vignettes/fmridataset-intro.Rmd
Introduction
The fmridataset package provides a unified S3 class system for representing functional magnetic resonance imaging (fMRI) data from different sources. The package supports data from raw matrices, NIfTI files, and pre-loaded NeuroVec objects, with a consistent interface for data access and manipulation.
Key Features
- Unified Interface: Work with data from matrices, NIfTI files, or NeuroVec objects
- Lazy Loading: File-based datasets load data only when accessed
- Flexible Data Access: Get data for entire datasets or specific runs
- Temporal Structure: Rich sampling frame representation with TR and run information
- Data Chunking: Built-in support for parallel processing workflows
- Event Integration: Support for experimental event tables
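Because the interface is unified, the same generic accessors apply regardless of how the data are stored. A minimal sketch, using only constructors and accessors demonstrated later in this vignette:
library(fmridataset)
# The same accessors work on a matrix-, file-, or memory-backed dataset
ds <- matrix_dataset(matrix(rnorm(200), 20, 10), TR = 2, run_length = 20)
dim(get_data_matrix(ds)) # 20 timepoints x 10 voxels
n_timepoints(ds$sampling_frame) # 20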
Dataset Types
The package provides four dataset types:
- matrix_dataset: For data already loaded as R matrices
- fmri_mem_dataset: For pre-loaded NeuroVec objects
- fmri_file_dataset: For NIfTI files (lazy loading)
- latent_dataset: For dimension-reduced data (requires the fmristore package)
Creating fMRI Dataset Objects
Matrix Dataset
The simplest way to create an fmri_dataset is from a matrix:
library(fmridataset)
# Create synthetic fMRI data
set.seed(123)
n_timepoints <- 200
n_voxels <- 1000
fmri_matrix <- matrix(rnorm(n_timepoints * n_voxels),
  nrow = n_timepoints,
  ncol = n_voxels
)
# Create dataset with temporal structure
dataset <- matrix_dataset(
  datamat = fmri_matrix,
  TR = 2.0, # 2-second repetition time
  run_length = c(100, 100) # Two runs of 100 timepoints each
)
print(dataset)
From NIfTI Files
For real fMRI data stored as NIfTI files:
# Paths to your NIfTI files (one per run)
file_paths <- c(
  "/path/to/run1.nii.gz",
  "/path/to/run2.nii.gz"
)
# Path to brain mask
mask_path <- "/path/to/mask.nii.gz"
# Create dataset from files
dataset <- fmri_dataset(
  scans = file_paths,
  mask = mask_path,
  TR = 2.5,
  run_length = c(180, 180) # 180 timepoints per run
)
# Data is loaded only when accessed
print(dataset)
From NeuroVec Objects
If you have pre-loaded NeuroVec objects:
# Create example NeuroVec objects
d <- c(10, 10, 10, 100) # 10x10x10 voxels, 100 timepoints
nvec1 <- neuroim2::NeuroVec(array(rnorm(prod(d)), d),
  space = neuroim2::NeuroSpace(d)
)
nvec2 <- neuroim2::NeuroVec(array(rnorm(prod(d)), d),
  space = neuroim2::NeuroSpace(d)
)
# Create a mask
mask_dim <- d[1:3]
mask <- neuroim2::NeuroVol(array(1, mask_dim),
  space = neuroim2::NeuroSpace(mask_dim)
)
# Create dataset from NeuroVec objects
dataset <- fmri_mem_dataset(
  scans = list(nvec1, nvec2),
  mask = mask,
  TR = 2.0,
  run_length = c(100, 100)
)
print(dataset)
With Event Tables
Add experimental design information:
# Create event table
events <- data.frame(
  onset = c(10, 30, 50, 70, 110, 130, 150, 170),
  duration = rep(2, 8),
  trial_type = rep(c("stimulus_A", "stimulus_B"), 4),
  run = rep(1:2, each = 4)
)
# Create dataset with events
dataset <- matrix_dataset(
  datamat = fmri_matrix,
  TR = 2.0,
  run_length = c(100, 100),
  event_table = events
)
print(dataset)
Accessing Data and Metadata
Basic Data Access
# Get the complete data matrix
data_matrix <- get_data_matrix(dataset)
cat("Data dimensions:", dim(data_matrix), "\n")
# Get data for specific runs
run1_data <- get_data_matrix(dataset, run_id = 1)
cat("Run 1 dimensions:", dim(run1_data), "\n")
run12_data <- get_data_matrix(dataset, run_id = c(1, 2))
cat("Runs 1-2 dimensions:", dim(run12_data), "\n")
Alternative Data Access
# Using get_data() method (returns different format depending on dataset type)
data_obj <- get_data(dataset)
# For matrix datasets, this is equivalent to get_data_matrix
# For file datasets, this returns NeuroVec objects
# For mem datasets, this returns the original NeuroVec objects
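For the matrix dataset created above, this equivalence can be checked directly; the following should report TRUE:
# Sanity check: for matrix datasets the two accessors agree
all.equal(get_data(dataset), get_data_matrix(dataset))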
Temporal Structure
# Access sampling frame
sf <- dataset$sampling_frame
print(sf)
# Get temporal properties using sampling frame methods
cat("TR:", get_TR(sf), "seconds\n")
cat("Number of runs:", n_runs(sf), "\n")
cat("Total timepoints:", n_timepoints(sf), "\n")
cat("Run lengths:", get_run_lengths(sf), "\n")
# Get block information
cat("Block IDs:", head(blockids(sf)), "...\n")
cat("Block lengths:", blocklens(sf), "\n")
Data Chunking for Parallel Processing
The data_chunks() function enables efficient processing of large datasets:
# Create chunks for parallel processing
chunks <- data_chunks(dataset, nchunks = 4)
# Process each chunk
for (chunk in chunks) {
  cat(
    "Processing chunk", chunk$chunk_num,
    "with", ncol(chunk$data), "voxels and",
    nrow(chunk$data), "timepoints\n"
  )
  # Your analysis here...
  # result <- your_analysis_function(chunk$data)
}
Chunking Strategies
# Chunk by runs (one chunk per run)
run_chunks <- data_chunks(dataset, runwise = TRUE)
cat("Number of run chunks:", run_chunks$nchunks, "\n")
# Chunk by voxels (default)
voxel_chunks <- data_chunks(dataset, nchunks = 10)
cat("Number of voxel chunks:", voxel_chunks$nchunks, "\n")
# Single chunk (all data)
single_chunk <- data_chunks(dataset, nchunks = 1)
cat("Single chunk with all data\n")
Using with foreach
# Parallel processing with foreach
if (requireNamespace("foreach", quietly = TRUE)) {
  library(foreach)
  # Create chunks
  chunks <- data_chunks(dataset, nchunks = 4)
  # Process in parallel (or sequentially with %do%)
  results <- foreach(chunk = chunks) %do% {
    # Compute mean time series for this chunk
    colMeans(chunk$data)
  }
  cat("Processed", length(results), "chunks\n")
  cat("Each result has", length(results[[1]]), "values\n")
}
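The same pattern scales to true parallel execution once a backend is registered. A sketch, assuming the doParallel package is installed:
# Register a parallel backend, then swap %do% for %dopar%
if (requireNamespace("doParallel", quietly = TRUE)) {
  doParallel::registerDoParallel(cores = 2)
  chunks <- data_chunks(dataset, nchunks = 4)
  results <- foreach(chunk = chunks) %dopar% colMeans(chunk$data)
  doParallel::stopImplicitCluster()
}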
Type Conversions
Converting Between Dataset Types
# Convert any dataset to matrix format
matrix_version <- as.matrix_dataset(dataset)
cat(
  "Converted to matrix dataset with",
  ncol(matrix_version$datamat), "voxels\n"
)
# This loads all data into memory for file-based datasets
# Use with caution for large datasets
Advanced Examples
Working with Sampling Frames
# Create custom sampling frame
sf <- sampling_frame(run_lengths = c(150, 120, 180), TR = 1.5)
# Explore sampling frame properties
cat("Total duration:", get_total_duration(sf), "seconds\n")
cat("Run durations:", get_run_duration(sf), "seconds\n")
cat("Global onsets:", global_onsets(sf), "\n")
# Get sample indices for each run
sample_indices <- samples(sf)
cat("Samples for run 1:", head(sample_indices[[1]]), "...\n")
Custom Analysis Pipeline
# Define a simple analysis function
analyze_chunk <- function(chunk_data) {
  # Compute voxel-wise variance
  voxel_var <- apply(chunk_data, 2, var)
  # Compute mean pairwise correlation between voxel time series
  if (ncol(chunk_data) > 1) {
    corr_matrix <- cor(chunk_data)
    mean_corr <- mean(corr_matrix[upper.tri(corr_matrix)])
  } else {
    mean_corr <- NA
  }
  list(
    n_voxels = ncol(chunk_data),
    n_timepoints = nrow(chunk_data),
    mean_variance = mean(voxel_var),
    mean_correlation = mean_corr
  )
}
# Apply to chunks
chunks <- data_chunks(dataset, nchunks = 5)
results <- list()
for (chunk in chunks) {
  results[[chunk$chunk_num]] <- analyze_chunk(chunk$data)
}
# Summarize results
for (i in seq_along(results)) {
  r <- results[[i]]
  cat(
    "Chunk", i, ":", r$n_voxels, "voxels, mean var =",
    round(r$mean_variance, 3), "\n"
  )
}
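The per-chunk summaries can also be collected into a single data frame for easier inspection:
# Bind the per-chunk summary lists into one data frame
summary_df <- do.call(rbind, lapply(results, as.data.frame))
print(summary_df)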
Memory Management for Large Datasets
# For large file-based datasets, use chunking to manage memory
large_dataset_analysis <- function(dataset, analysis_func, nchunks = 10) {
  chunks <- data_chunks(dataset, nchunks = nchunks)
  results <- list()
  for (chunk in chunks) {
    # Process chunk
    chunk_result <- analysis_func(chunk$data)
    results[[chunk$chunk_num]] <- chunk_result
    # Optionally save intermediate results
    # saveRDS(chunk_result, paste0("chunk_", chunk$chunk_num, ".rds"))
    cat("Completed chunk", chunk$chunk_num, "/", chunks$nchunks, "\n")
  }
  return(results)
}
# Use the function
# results <- large_dataset_analysis(dataset, analyze_chunk)
Best Practices
Performance Tips
- Use appropriate chunk sizes: Balance memory usage and processing overhead
- Lazy loading: Keep file-based datasets as files until data is needed
- Run-wise processing: Use runwise = TRUE for run-specific analyses
- Memory monitoring: Monitor memory usage with large datasets (see the sketch after the examples below)
# Good: Process runs separately for run-specific analyses
run_means <- list()
run_chunks <- data_chunks(dataset, runwise = TRUE)
for (chunk in run_chunks) {
  run_means[[chunk$chunk_num]] <- colMeans(chunk$data)
}
# Good: Use appropriate chunk size for memory constraints
reasonable_chunks <- data_chunks(dataset, nchunks = 20)
# Caution: Converting large file datasets to matrix loads all data
# matrix_version <- as.matrix_dataset(very_large_file_dataset) # May use lots of RAM
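For the memory-monitoring tip, base R provides a rough view of object sizes and memory in use:
# Rough memory monitoring with base R
print(object.size(get_data_matrix(dataset)), units = "MB") # size of loaded data
gc() # report (and trigger) garbage collection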
Data Validation
# Check dataset integrity
validate_basic <- function(dataset) {
  # Check that run lengths match data
  sf <- dataset$sampling_frame
  total_timepoints <- n_timepoints(sf)
  if ("datamat" %in% names(dataset)) {
    actual_timepoints <- nrow(dataset$datamat)
    if (total_timepoints != actual_timepoints) {
      warning("Mismatch between sampling frame and data dimensions")
    }
  }
  # Check event table
  if (nrow(dataset$event_table) > 0) {
    max_onset <- max(dataset$event_table$onset)
    total_duration <- get_total_duration(sf)
    if (max_onset > total_duration) {
      warning("Events extend beyond scan duration")
    }
  }
  cat("Dataset validation completed\n")
}
# validate_basic(dataset)
Reproducibility
# Document dataset creation parameters
dataset_info <- list(
  creation_date = Sys.time(),
  r_version = R.version.string,
  package_version = packageVersion("fmridataset"),
  dataset_type = class(dataset)[1],
  n_runs = n_runs(dataset$sampling_frame),
  TR = get_TR(dataset$sampling_frame),
  n_timepoints = n_timepoints(dataset$sampling_frame)
)
# Store with dataset or save separately
# saveRDS(dataset_info, "dataset_metadata.rds")
cat("Dataset info recorded for reproducibility\n")
Integration with Other Packages
NeuroVec Compatibility
# fmridataset works seamlessly with neuroim2
# File datasets return NeuroVec objects from get_data()
# Memory datasets store NeuroVec objects directly
# Example: Convert matrix dataset to NeuroVec
if (inherits(dataset, "matrix_dataset")) {
  # Reshape matrix data to a 4D array (x, y, z, time)
  # This is a simplified example - real use would preserve actual geometry
  n_voxels <- ncol(dataset$datamat)
  n_timepoints <- nrow(dataset$datamat)
  if (n_voxels == 1000) { # Our example case: 10 x 10 x 10 voxels
    arr_4d <- array(t(dataset$datamat), c(10, 10, 10, n_timepoints))
    space_4d <- neuroim2::NeuroSpace(c(10, 10, 10, n_timepoints))
    nvec <- neuroim2::NeuroVec(arr_4d, space_4d)
    cat("Created NeuroVec with dimensions:", dim(nvec), "\n")
  }
}
Analysis Workflows
# Example: Simple GLM analysis
simple_glm_analysis <- function(dataset) {
  # Get design matrix from events
  events <- dataset$event_table
  if (nrow(events) == 0) {
    stop("No events found for GLM analysis")
  }
  # Create simple design matrix (stimulus A vs B)
  sf <- dataset$sampling_frame
  n_timepoints <- n_timepoints(sf)
  TR <- get_TR(sf)
  design_matrix <- matrix(0, nrow = n_timepoints, ncol = 2)
  colnames(design_matrix) <- c("stimulus_A", "stimulus_B")
  # Fill design matrix based on events
  for (i in seq_len(nrow(events))) {
    onset_tr <- round(events$onset[i] / TR) + 1
    if (onset_tr <= n_timepoints) {
      if (events$trial_type[i] == "stimulus_A") {
        design_matrix[onset_tr, 1] <- 1
      } else if (events$trial_type[i] == "stimulus_B") {
        design_matrix[onset_tr, 2] <- 1
      }
    }
  }
  # Fit GLM using chunking for memory efficiency
  chunks <- data_chunks(dataset, nchunks = 5)
  all_betas <- list()
  for (chunk in chunks) {
    data_chunk <- chunk$data
    # Fit GLM for each voxel in chunk
    n_voxels <- ncol(data_chunk)
    betas <- matrix(NA, nrow = 2, ncol = n_voxels)
    for (v in seq_len(n_voxels)) {
      y <- data_chunk[, v]
      fit <- lm(y ~ design_matrix - 1) # No intercept; use design matrix directly
      betas[, v] <- coef(fit)
    }
    all_betas[[chunk$chunk_num]] <- betas
  }
  # Combine results
  final_betas <- do.call(cbind, all_betas)
  rownames(final_betas) <- colnames(design_matrix)
  return(final_betas)
}
# Apply analysis (if dataset has events)
# if (nrow(dataset$event_table) > 0) {
#   betas <- simple_glm_analysis(dataset)
#   cat("GLM analysis completed. Beta dimensions:", dim(betas), "\n")
# }
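As a design note, the inner per-voxel loop is written for clarity. Since lm() accepts a matrix response, each chunk could instead be fit in a single call:
# Equivalent vectorized fit: one lm() call per chunk instead of per voxel
# betas <- coef(lm(data_chunk ~ design_matrix - 1))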
Conclusion
The fmridataset package provides a clean, consistent interface for working with fMRI data in R. Key advantages include:
- Unified API across different data sources (matrices, files, NeuroVec objects)
- Efficient memory usage through lazy loading and chunking
- Rich temporal modeling with sampling frames
- Flexible data access patterns
- Integration with the neuroim2 ecosystem
The package is designed to be simple yet powerful, providing essential functionality for fMRI data management without unnecessary complexity. It serves as a solid foundation for building more sophisticated analysis pipelines.
For more detailed information about specific functions, see the package documentation with help(package = "fmridataset").