Practical 3: Virtual stack visualization and exploratory analysis#
When doing exploratory analysis on large datasets, it is often not an option to load the full dataset into memory. Still, one would like to browse the images and try out processing workflows interactively.
In this notebook, we’ll build a simple lazy data viewer and interactively explore a large dataset.
We’ll create the “large” example dataset synthetically, but another option would be to download e.g. the following dataset: http://data.celltrackingchallenge.net/training-datasets/Fluo-N3DL-TRIC.zip. In that case, adapt the file paths below to point to the downloaded files.
# imports
import os, sys
import numpy as np
import tifffile
from scipy import ndimage
from tqdm import tqdm
import dask
import dask.array as da
from dask_image import ndfilters
from dask import delayed
%matplotlib notebook
# For improving the usability of this notebook,
# let's create an example 3D+T dataset instead of downloading one.
# create first timepoint
N_xy, N_z, N_t, N_rs, dx = 1000, 100, 10, [30, 5], 10
np.random.seed(0)
img = np.prod([
    ndimage.zoom(np.random.random([N_r] * 3),
                 zoom=[N_z / N_r, N_xy / N_r, (N_xy + 2 * N_t * dx) / N_r],
                 order=1)
    for N_r in N_rs], axis=0,
)
# convert into uint16
img = (img * 10000).astype(np.uint16)
# save it as a timelapse
file_pattern = 'data/large_3d_dataset_tp%03d.tif'
os.makedirs('data', exist_ok=True)
N_t = 20  # save 20 timepoints; the padded volume is wide enough for 20 shifted crops
for t in tqdm(range(N_t)):
    curr_tp = img[:, :, dx * t: dx * t + N_xy]
    tifffile.imwrite(file_pattern % t, curr_tp)
print('Total dataset size: %.2f GB' % (N_t * N_xy ** 2 * N_z * 2 / 10 ** 9))
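As an optional sanity check, we can read back a single timepoint and confirm that it has the expected shape (assuming the files were written as above):
# optional check: read back the first timepoint
# expected shape: (N_z, N_xy, N_xy), i.e. (100, 1000, 1000)
print(tifffile.imread(file_pattern % 0).shape)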
Loading the dataset into a dask array#
In order to access the images lazily, we construct a dask array whose chunks are computed on demand by reading the corresponding image data from disk.
from glob import glob
import zarr
file_pattern = 'data/large_3d_dataset_tp*.tif'
files = sorted(glob(file_pattern))
# determine the shape and dtype of the data
zarr_arr = zarr.open(tifffile.imread(file_pattern, aszarr=True))
N_t, N_z, N_x, N_y = zarr_arr.shape
dtype = zarr_arr.dtype
print('Total dataset size is %s GB'
      % (np.prod([N_t, N_z, N_x, N_y, dtype.itemsize]) / 1e9))
# define a custom reader function
# which loads a single 2D frame from a 3D tif file
def load_2d(t, z):
    return tifffile.TiffFile(files[t]).pages[z].asarray()
# loading should be lazy
load_2d = delayed(load_2d)
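Calling the wrapped function now builds a task instead of reading data; nothing touches the disk until the result is explicitly computed. A minimal check:
# calling the delayed function returns a Delayed object, not an array
frame = load_2d(0, 0)
print(type(frame))  # dask.delayed.Delayed; frame.compute() would read the frame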
# manually compose a dask array from the individual lazily loaded frames
# `da.from_delayed` converts a delayed object into a dask array, given
# information about the shape and dtype of the delayed result
ims = da.stack([
    da.stack([
        da.from_delayed(load_2d(t, z),
                        shape=(N_x, N_y),
                        dtype=dtype)
        for z in range(N_z)])
    for t in range(N_t)])
ims
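The array repr above shows one chunk per 2D frame. As a quick check, computing a single plane should be fast, since only one page of one file is read:
# only one page of one file is read here, not the whole dataset
plane = ims[0, 50].compute()
print(plane.shape, plane.dtype)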
Visualize the dataset#
Since dask arrays essentially behave like numpy arrays, many viewers support the visualization of the previously constructed dask array.
Using tifffile#
# tifffile contains a multidimensional image viewer based on matplotlib
tifffile.imshow(ims)
Using napari#
Napari supports dask arrays.
import napari
viewer = napari.Viewer()
viewer.add_image(ims)
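Note that when running this as a plain script rather than in an interactive IPython/Jupyter session, napari's event loop needs to be started explicitly:
# only needed outside interactive sessions
napari.run()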
Using ipywidgets to interact with matplotlib plots#
%matplotlib notebook
import matplotlib.pyplot as plt
from ipywidgets import interact
# a simple multi-dimensional image viewer
def browse_images(ims, show_colorbar=False):
    plt.figure()
    # determine the shape of the non-spatial dimensions
    scroll_shape = ims.shape[:-2]

    def view_image(**kwargs):
        # assemble the current index from the slider positions
        pos = tuple(kwargs[dim] for dim in sorted(kwargs))
        plt.imshow(ims[pos].T, cmap=plt.cm.gray_r, interpolation='nearest')
        if show_colorbar:
            plt.colorbar()

    # interact with the viewer using one slider per non-spatial dimension
    interact(view_image,
             **{'dim %s' % dim: (0, s - 1) for dim, s in enumerate(scroll_shape)})
    plt.show()
browse_images(ims)
Process the image on the fly#
In addition to viewing the raw images, we can perform operations on the array before viewing it.
# max projection as a simple example
browse_images(ims.max(-3))
# another example: local background subtraction
ims_mod = ims.astype(np.int32) - ndfilters.minimum_filter(ims, size=(1, 1, 30, 30))
ims_mod = np.clip(ims_mod, 0, 2**16 - 1)
browse_images(ims_mod, show_colorbar=False)
# another workflow: max projection, smoothing, local background
# subtraction, local normalization and thresholding
ims_max = ims.max(1)
# larger chunks reduce the overlap overhead of the neighborhood filters below
ims_max = ims_max.rechunk((1, 600, 600))
ims_proc = ndfilters.gaussian_filter(ims_max, (0, 2, 2))
ims_proc = ims_proc.astype(float) - ndfilters.minimum_filter(ims_proc, (1, 50, 50))
ims_proc = ims_proc / ndfilters.maximum_filter(ims_proc.rechunk((1, 600, 600)), (1, 50, 50))
ims_proc = ims_proc > 0.5
browse_images(ims_proc)
# once we're happy with the workflow, we can compute the result
# and stream it into a file
from dask import diagnostics
with diagnostics.ProgressBar():
    da.to_zarr(ims_proc, 'data/processed.zarr', overwrite=True)
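The stored result can then be read back lazily for further inspection:
# read the processed result back as a lazy dask array
ims_proc_stored = da.from_zarr('data/processed.zarr')
ims_proc_stored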
Obtain properties from objects#
from scipy import ndimage
from skimage import measure
import pandas as pd
def get_object_properties(im_binary, im_intensities, t):
    # label connected components and measure their properties
    labels, _ = ndimage.label(im_binary)
    props = measure.regionprops_table(
        labels,
        intensity_image=im_intensities,
        properties=['label', 'centroid', 'area', 'mean_intensity'])
    props = pd.DataFrame(props)
    props['t'] = t
    return props
# lazily compute the object properties for the first timepoints
dfs = []
for t, im in enumerate(ims_proc[:3]):
    df = delayed(get_object_properties)(im, ims_max[t], t)
    dfs.append(df)
with dask.diagnostics.ProgressBar():
    df = pd.concat(dask.compute(dfs)[0], ignore_index=True)
df
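As a quick summary of the resulting table, one could e.g. count the detected objects per timepoint:
# number of detected objects per timepoint
df.groupby('t').size()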