Source code for dival.datasets.standard

# -*- coding: utf-8 -*-
"""Provides standard datasets for benchmarking.
"""
from warnings import warn
from functools import partial
import numpy as np
import odl
from dival.datasets.ellipses_dataset import EllipsesDataset
from dival.datasets.lodopab_dataset import LoDoPaBDataset
from dival.datasets.angle_subset_dataset import get_angle_subset_dataset
from dival.util.odl_utility import ResizeOperator


STANDARD_DATASET_NAMES = ['ellipses', 'lodopab']



[docs]
def get_standard_dataset(name, **kwargs):
    """
    Return a standard dataset by name.

    The standard datasets are (currently):

        ``'ellipses'``
            A typical synthetical CT dataset with ellipse phantoms.

            `EllipsesDataset` is used as ground truth dataset, a ray
            transform with parallel beam geometry using 30 angles is applied,
            and white gaussian noise with a standard deviation of 2.5% (i.e.
            ``0.025 * mean(abs(observation))``) is added.

            In order to avoid the inverse crime, the ground truth images of
            shape (128, 128) are upscaled by bilinear interpolation to a
            resolution of (400, 400) before the ray transform is applied (whose
            discretization is different from the one of :attr:`ray_trafo`).
            
            Attributes of the returned dataset:
                `ray_trafo` : :class:`odl.tomo.RayTransform`
                    Ray transform corresponding to the noiseless forward
                    operator.
                ``get_ray_trafo(**kwargs)`` : function
                    Function that returns a ray transform corresponding to the
                    noiseless forward operator. Keyword arguments (e.g. `impl`)
                    are forwarded to the :class:`RayTransform` constructor.

        ``'lodopab'``
            The LoDoPaB-CT dataset, which is documented in the Data Descriptor
            article `<https://www.nature.com/articles/s41597-021-00893-z>`_ and
            hosted on `<https://zenodo.org/record/3384092>`_.
            It is a simulated low dose CT dataset based on real reconstructions
            from the `LIDC-IDRI
            <https://wiki.cancerimagingarchive.net/display/Public/LIDC-IDRI>`_
            dataset.

            The dataset contains 42895 pairs of images and projection data.
            For simulation, a ray transform with parallel beam geometry using
            1000 angles and 513 detector pixels is used. Poisson noise
            corresponding to 4096 incident photons per pixel before attenuation
            is applied to the projection data.
            
            Attributes of the returned dataset:
                `ray_trafo` : :class:`odl.tomo.RayTransform`
                    Ray transform corresponding to the noiseless forward
                    operator.
            Methods of the returned dataset:
                ``get_ray_trafo(**kwargs)``
                    Function that returns a ray transform corresponding to the
                    noiseless forward operator. Keyword arguments (e.g. `impl`)
                    are forwarded to the :class:`RayTransform` constructor.

    Parameters
    ----------
    name : str
        Name of the dataset.
    kwargs : dict
        Keyword arguments.
        Supported parameters for the datasets are:

            ``'ellipses'``
                impl : {``'skimage'``, ``'astra_cpu'``, ``'astra_cuda'``},\
                        optional
                    Implementation passed to :class:`odl.tomo.RayTransform`
                    Default: ``'astra_cuda'``.
                fixed_seeds : dict or bool, optional
                    Seeds to use for random ellipse generation, passed to
                    :meth:`.EllipsesDataset.__init__`.
                    Default: ``False``.
                fixed_noise_seeds : dict or bool, optional
                    Seeds to use for noise generation, passed as `noise_seeds`
                    to :meth:`.GroundTruthDataset.create_pair_dataset`.
                    If ``True`` is passed (the default), the seeds
                    ``{'train': 1, 'validation': 2, 'test': 3}`` are used.
            ``'lodopab'``
                num_angles : int, optional
                    Number of angles to use from the full 1000 angles.
                    Must be a divisor of 1000.
                observation_model : {``'post-log'``, ``'pre-log'``}, optional
                    The observation model to use. Default is ``'post-log'``.
                min_photon_count : float, optional
                    Replacement value for a simulated photon count of zero.
                    If ``observation_model == 'post-log'``, a value greater
                    than zero is required in order to avoid undefined values.
                    The default is 0.1, both for ``'post-log'`` and
                    ``'pre-log'`` model.
                sorted_by_patient : bool, optional
                    Whether to sort the samples by patient id.
                    Useful to resplit the dataset.
                    Default: ``False``.
                impl : {``'skimage'``, ``'astra_cpu'``, ``'astra_cuda'``},\
                        optional
                    Implementation passed to :class:`odl.tomo.RayTransform`
                    Default: ``'astra_cuda'``.

    Returns
    -------
    dataset : :class:`.Dataset`
        The standard dataset.
        It has an attribute `standard_dataset_name` that stores its name.
    """
    name = name.lower()
    if name == 'ellipses':
        fixed_seeds = kwargs.pop('fixed_seeds', False)
        ellipses_dataset = EllipsesDataset(image_size=128,
                                           fixed_seeds=fixed_seeds)

        NUM_ANGLES = 30
        # image shape for simulation
        IM_SHAPE = (400, 400)  # images will be scaled up from (128, 128)

        reco_space = ellipses_dataset.space
        space = odl.uniform_discr(min_pt=reco_space.min_pt,
                                  max_pt=reco_space.max_pt,
                                  shape=IM_SHAPE, dtype=np.float32)

        reco_geometry = odl.tomo.parallel_beam_geometry(
            reco_space, num_angles=NUM_ANGLES)
        geometry = odl.tomo.parallel_beam_geometry(
            space, num_angles=NUM_ANGLES,
            det_shape=reco_geometry.detector.shape)

        impl = kwargs.pop('impl', 'astra_cuda')
        ray_trafo = odl.tomo.RayTransform(space, geometry, impl=impl)

        reco_ray_trafo = odl.tomo.RayTransform(reco_space, reco_geometry,
                                               impl=impl)

        # forward operator
        resize_op = ResizeOperator(reco_space, space)
        forward_op = ray_trafo * resize_op

        noise_seeds = kwargs.pop('fixed_noise_seeds', True)
        if isinstance(noise_seeds, bool):
            noise_seeds = ({'train': 1, 'validation': 2, 'test': 3}
                           if noise_seeds else None)

        dataset = ellipses_dataset.create_pair_dataset(
            forward_op=forward_op, noise_type='white',
            noise_kwargs={'relative_stddev': True,
                          'stddev': 0.025},
            noise_seeds=noise_seeds)

        dataset.get_ray_trafo = partial(odl.tomo.RayTransform,
                                        reco_space, reco_geometry)
        dataset.ray_trafo = reco_ray_trafo

    elif name == 'lodopab':

        num_angles = kwargs.pop('num_angles', None)
        lodopab_kwargs = {}
        for k in ['observation_model', 'min_photon_count', 'sorted_by_patient',
                  'impl']:
            if k in kwargs:
                lodopab_kwargs[k] = kwargs.pop(k)

        dataset = LoDoPaBDataset(**lodopab_kwargs)

        if num_angles is not None:
            dataset = get_angle_subset_dataset(
                dataset, num_angles, impl=kwargs.get('impl', 'astra_cuda'))

    else:
        raise ValueError("unknown dataset '{}'. Known standard datasets are {}"
                         .format(name, STANDARD_DATASET_NAMES))

    if kwargs:
        warn('unused keyword arguments: {}'
             .format(', '.join(kwargs.keys())))

    dataset.standard_dataset_name = name
    return dataset