Practical 3: Virtual stack visualization and explorative analysis#

When doing exploratory analysis on large datasets, loading the full dataset into memory is often not an option. Still, one wants to browse the images and try out processing workflows interactively.

In this notebook, we’ll build a simple lazy data viewer and interactively explore a large dataset.

We’ll create the “large” example dataset synthetically, but another option would be to download e.g. the following dataset: http://data.celltrackingchallenge.net/training-datasets/Fluo-N3DL-TRIC.zip. In that case, the file pattern and reader code below would need to be adapted to the downloaded files.
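As a rough sketch of that download (assuming direct HTTP access works, and keeping in mind that the archive is large), it could be scripted like this:

# hypothetical download helper for the alternative dataset
import os
import urllib.request
import zipfile

url = 'http://data.celltrackingchallenge.net/training-datasets/Fluo-N3DL-TRIC.zip'
zip_path = 'data/Fluo-N3DL-TRIC.zip'

os.makedirs('data', exist_ok=True)
urllib.request.urlretrieve(url, zip_path)  # may take a while
with zipfile.ZipFile(zip_path) as zf:
    zf.extractall('data')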

# imports

import os, sys
import numpy as np
import tifffile
from scipy import ndimage
from tqdm import tqdm

import dask
import dask.array as da
from dask_image import ndfilters
from dask import delayed

%matplotlib notebook
# For improving the usability of this notebook,
# let's create an example 3D+T dataset instead of downloading one.

# create a padded 3D volume from which the individual
# timepoints will be cut out as a sliding window
N_xy, N_z, N_t, N_rs, dx = 1000, 100, 10, [30, 5], 10

np.random.seed(0)
img = np.prod([
    ndimage.zoom(np.random.random([N_r] * 3),
                 zoom=[N_z/N_r, N_xy/N_r, (N_xy + 2 * N_t * dx)/N_r], order=1)
    for N_r in N_rs], axis=0,
)

# convert into uint16
img = (img * 10000).astype(np.uint16)
# save it as a timelapse

file_pattern = 'data/large_3d_dataset_tp%03d.tif'

os.makedirs('data', exist_ok=True)

N_t = 20  # number of timepoints to write; each window is shifted by dx along the last axis
for t in tqdm(range(N_t)):
    curr_tp = img[:, :, dx * t: dx * t + N_xy]
    tifffile.imwrite(file_pattern % t, curr_tp)

print('Total dataset size: %.2f GB' % (N_t * N_xy ** 2 * N_z * 2 / 10**9))

Loading the dataset into a dask array#

To access the images lazily, we construct a dask array whose chunks are computed on demand by reading the corresponding image data from disk.

from glob import glob
import zarr

file_pattern = 'data/large_3d_dataset_tp*.tif'
files = sorted(glob(file_pattern))

# determine the shape and dtype of the data
zarr_arr = zarr.open(tifffile.imread(file_pattern, aszarr=True))

N_t, N_z, N_x, N_y = zarr_arr.shape
dtype = zarr_arr.dtype

print('Total dataset size is %.2f GB'
      % (np.prod([N_t, N_z, N_x, N_y, dtype.itemsize]) / 1e9))
# define a custom reader function
# which loads a single 2D frame from a 3d tif file

def load_2d(t, z):
    # open the file only as long as needed to read a single z-plane
    with tifffile.TiffFile(files[t]) as tif:
        return tif.pages[z].asarray()

# loading should be lazy
load_2d = delayed(load_2d)

# manually compose a dask array from the individual lazily loaded frames
# `da.from_delayed` converts a delayed object into a dask array, given
# information about the shape and dtype of the delayed result
ims = da.stack([
        da.stack([
            da.from_delayed(load_2d(t, z),
                            shape=(N_x, N_y),
                            dtype=dtype)
            for z in range(N_z)])
        for t in range(N_t)])

ims
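The array above is lazy: constructing it reads no pixel data. As a quick sanity check (a minimal sketch), we can inspect its chunk layout and compute a single plane, which should read only one 2D frame from disk:

# inspect the chunk layout without reading any pixel data
print(ims.shape, ims.chunksize, ims.dtype)

# computing a single plane reads only one 2D frame from disk
plane = ims[0, 0].compute()
print(plane.shape)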

Visualize the dataset#

Since dask arrays largely behave like numpy arrays, many viewers can directly visualize the dask array constructed above.

Using tifffile#

# tifffile contains a multidimensional image viewer based on matplotlib

tifffile.imshow(ims)

Using napari#

Napari supports dask arrays.

import napari

viewer = napari.Viewer()

viewer.add_image(ims)
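Napari computes only the slices it displays. If autoscaling the contrast triggers unwanted computation, the limits can also be passed explicitly (a hedged variant; `contrast_limits` and `name` are standard `add_image` keywords):

# optionally set contrast limits up front to avoid napari
# deriving intensity ranges from the lazy data
viewer.add_image(ims, contrast_limits=[0, 10000], name='lazy stack')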

Using ipywidgets to interact with matplotlib plots#

%matplotlib notebook
import matplotlib.pyplot as plt
from ipywidgets import interact

# a simple multi-dimensional image viewer
def browse_images(ims, show_colorbar=False):

    plt.figure()
    
    # determine the shape of the non-spatial dimensions
    scroll_shape = ims.shape[:-2]

    def view_image(**kwargs):
        pos = tuple(kwargs[dim] for dim in sorted(kwargs))
        plt.imshow(ims[pos].T, cmap=plt.cm.gray_r, interpolation='nearest')
        if show_colorbar:
            plt.colorbar()

    # create one slider per non-spatial dimension
    interact(view_image,
             **{'dim %s' % dim: (0, s - 1) for dim, s in enumerate(scroll_shape)})

    plt.show()
    
browse_images(ims)

Process the image on the fly#

In addition to viewing the raw images, we can perform operations on the array before viewing it.

# max projection as a simple example

browse_images(ims.max(-3))
# another example: local background subtraction
# (cast to a signed dtype first so the subtraction cannot underflow uint16)

ims_mod = ims.astype(np.int32) - ndfilters.minimum_filter(ims, size=(1, 1, 30, 30))
ims_mod = np.clip(ims_mod, 0, 2**16 - 1)

browse_images(ims_mod, show_colorbar=False)
# another workflow: max project along z, smooth,
# subtract the local background and threshold
# the locally normalized result

ims_max = ims.max(1)
ims_max = ims_max.rechunk((1, 600, 600))
ims_proc = ndfilters.gaussian_filter(ims_max, (0, 2, 2))
ims_proc = ims_proc.astype(float) - ndfilters.minimum_filter(ims_proc, (1, 50, 50))
ims_proc = ims_proc / ndfilters.maximum_filter(ims_proc.rechunk((1, 600, 600)), (1, 50, 50))
ims_proc = ims_proc > 0.5

browse_images(ims_proc)
# once we're happy with the workflow, we can compute the result
# and stream it into a file

from dask import diagnostics

with diagnostics.ProgressBar():
    da.to_zarr(ims_proc, 'data/processed.zarr', overwrite=True)
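The result on disk can itself be opened lazily again, e.g. to double-check what was written; a minimal sketch using `da.from_zarr`:

# re-open the processed result as a lazy dask array
ims_proc_from_disk = da.from_zarr('data/processed.zarr')
browse_images(ims_proc_from_disk)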

Obtain properties from objects#

from scipy import ndimage
from skimage import measure
import pandas as pd

def get_object_properties(im_binary, im_intensities, t):
    labels, _ = ndimage.label(im_binary)
    props = measure.regionprops_table(
        labels,
        intensity_image=im_intensities,
        properties=['label', 'centroid', 'area', 'mean_intensity'])
    props = pd.DataFrame(props)
    props['t'] = t
    return props

# lazily collect the object properties of the first three timepoints
dfs = []
for t, im in enumerate(ims_proc[:3]):
    df = delayed(get_object_properties)(im, ims_max[t], t)
    dfs.append(df)

with dask.diagnostics.ProgressBar():
    df = pd.concat(dask.compute(dfs)[0], ignore_index=True)

df
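Since all measurements now live in a single pandas dataframe, standard dataframe operations apply; for example (a simple illustration), counting the detected objects per timepoint:

# number of detected objects per timepoint
print(df.groupby('t')['label'].count())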