"""Extractors for reading data from a pymia dataset (pymia.data.extraction.extractor)."""

import abc
import pickle
import typing
import os

import numpy as np
import SimpleITK as sitk

import pymia.data.conversion as conv
import pymia.data.definition as defs
import pymia.data.extraction.byte_converter as byte_converter
import pymia.data.indexexpression as expr
from . import reader as rd


class Extractor(abc.ABC):
    """Abstract base class defining the common interface for all data extractors."""

    @abc.abstractmethod
    def extract(self, reader: rd.Reader, params: dict, extracted: dict) -> None:
        """Extract data from the dataset and add it to :obj:`extracted`.

        Args:
            reader (.Reader): Reader instance that can read from dataset.
            params (dict): Extraction parameters containing information such as subject index
                and index expression.
            extracted (dict): The dictionary to put the extracted data in.
        """
        pass
class ComposeExtractor(Extractor):

    def __init__(self, extractors: list) -> None:
        """Bundles several :class:`.Extractor` instances so they act as a single one.

        Args:
            extractors (list): A list of :class:`.Extractor` instances.
        """
        super().__init__()
        self.extractors = extractors

    def extract(self, reader: rd.Reader, params: dict, extracted: dict) -> None:
        """see :meth:`.Extractor.extract`"""
        # Delegate to every wrapped extractor in the order they were given.
        for extractor in self.extractors:
            extractor.extract(reader, params, extracted)
class NamesExtractor(Extractor):

    def __init__(self, cache: bool = True, categories=(defs.KEY_IMAGES, defs.KEY_LABELS)) -> None:
        """Extracts the names of the entries within a category (e.g. "Flair", "T1" for the category "images").

        Added key to :obj:`extracted`:

        - :const:`pymia.data.definition.KEY_PLACEHOLDER_NAMES` with :obj:`str` content

        Args:
            cache (bool): Whether to cache the results. If :code:`True`, the dataset is only accessed once.
                :code:`True` is often preferred since the name entries are typically unique in the dataset.
            categories (tuple): Categories for which to extract the names.
        """
        super().__init__()
        self.cache = cache
        self.cached_result = None  # holds the dict from the first read when caching is enabled
        self.categories = categories

    def extract(self, reader: rd.Reader, params: dict, extracted: dict) -> None:
        """see :meth:`.Extractor.extract`"""
        if self.cache and self.cached_result is not None:
            names_by_key = self.cached_result
        else:
            names_by_key = self._extract(reader)
            self.cached_result = names_by_key

        for key, raw_names in names_by_key.items():
            extracted[key] = byte_converter.convert_to_string(raw_names)

    def _extract(self, reader: rd.Reader):
        # Read the raw name entries of every configured category from the dataset.
        return {defs.KEY_PLACEHOLDER_NAMES.format(category): reader.read(defs.LOC_NAMES_PLACEHOLDER.format(category))
                for category in self.categories}
class SubjectExtractor(Extractor):
    """Extracts the subject's identification.

    Added key to :obj:`extracted`:

    - :const:`pymia.data.definition.KEY_SUBJECT_INDEX` with :obj:`int` content
    - :const:`pymia.data.definition.KEY_SUBJECT` with :obj:`str` content
    """

    def extract(self, reader: rd.Reader, params: dict, extracted: dict) -> None:
        """see :meth:`.Extractor.extract`"""
        subject_index = params[defs.KEY_SUBJECT_INDEX]
        extracted[defs.KEY_SUBJECT_INDEX] = subject_index

        # Read the subject name at the subject's position and decode it to str.
        subject_index_expr = expr.IndexExpression(subject_index)
        subject_name = reader.read(defs.LOC_SUBJECT, subject_index_expr)
        extracted[defs.KEY_SUBJECT] = byte_converter.convert_to_string(subject_name)
class IndexingExtractor(Extractor):

    def __init__(self, do_pickle: bool = False) -> None:
        """Extracts the index expression.

        Added key to :obj:`extracted`:

        - :const:`pymia.data.definition.KEY_SUBJECT_INDEX` with :obj:`int` content
        - :const:`pymia.data.definition.KEY_INDEX_EXPR` with :class:`.IndexExpression` content

        Args:
            do_pickle (bool): whether to pickle the extracted :class:`.IndexExpression` instance.
                This is useful when applied with PyTorch DataLoader since it prevents from automatic
                translation to torch.Tensor.
        """
        super().__init__()
        self.do_pickle = do_pickle

    def extract(self, reader: rd.Reader, params: dict, extracted: dict) -> None:
        """see :meth:`.Extractor.extract`"""
        extracted[defs.KEY_SUBJECT_INDEX] = params[defs.KEY_SUBJECT_INDEX]

        index_expression = params[defs.KEY_INDEX_EXPR]
        # Pickling avoids problems with the custom IndexExpression class (e.g. in DataLoader workers).
        extracted[defs.KEY_INDEX_EXPR] = pickle.dumps(index_expression) if self.do_pickle else index_expression
class ImagePropertiesExtractor(Extractor):

    def __init__(self, do_pickle: bool = False) -> None:
        """Extracts the image properties.

        Added key to :obj:`extracted`:

        - :const:`pymia.data.definition.KEY_PROPERTIES` with :class:`.ImageProperties` content
          (or byte if :code:`do_pickle`)

        Args:
            do_pickle (bool): whether to pickle the extracted :class:`.ImageProperties` instance.
                This allows usage in multiprocessing environment.
        """
        super().__init__()
        self.do_pickle = do_pickle

    def extract(self, reader: rd.Reader, params: dict, extracted: dict) -> None:
        """see :meth:`.Extractor.extract`"""
        subject_index_expr = expr.IndexExpression(params[defs.KEY_SUBJECT_INDEX])

        def read_property(location):
            # Reads one image property of the current subject as a plain Python list.
            return reader.read(location, subject_index_expr).tolist()

        # Rebuild a dummy SimpleITK image carrying the stored properties.
        # TODO: everything in memory?
        image = sitk.Image(read_property(defs.LOC_IMGPROP_SHAPE), sitk.sitkUInt8)
        image.SetDirection(read_property(defs.LOC_IMGPROP_DIRECTION))
        image.SetSpacing(read_property(defs.LOC_IMGPROP_SPACING))
        image.SetOrigin(read_property(defs.LOC_IMGPROP_ORIGIN))
        # TODO: number_of_components_per_pixel and pixel_id are not restored

        image_properties = conv.ImageProperties(image)
        if self.do_pickle:
            # pickle to prevent problems since ImageProperties is a custom class
            image_properties = pickle.dumps(image_properties)
        extracted[defs.KEY_PROPERTIES] = image_properties
class FilesExtractor(Extractor):

    def __init__(self, cache: bool = True, categories=(defs.KEY_IMAGES, defs.KEY_LABELS)) -> None:
        """Extracts the file paths.

        Added key to :obj:`extracted`:

        - :const:`pymia.data.definition.KEY_FILE_ROOT` with :obj:`str` content
        - :const:`pymia.data.definition.KEY_PLACEHOLDER_FILES` with :obj:`str` content

        Args:
            cache (bool): Whether to cache the results. If :code:`True`, the dataset is only accessed once.
                :code:`True` is often preferred since the file name entries are typically unique in the
                dataset (i.e. independent of data chunks).
            categories (tuple): Categories for which to extract the file names.
        """
        super().__init__()
        self.cache = cache
        self.cached_file_root = None  # the root path is subject-independent, so one read suffices
        self.categories = categories

    def extract(self, reader: rd.Reader, params: dict, extracted: dict) -> None:
        """see :meth:`.Extractor.extract`"""
        subject_index_expr = expr.IndexExpression(params[defs.KEY_SUBJECT_INDEX])

        if self.cache and self.cached_file_root is not None:
            file_root = self.cached_file_root
        else:
            file_root = reader.read(defs.LOC_FILES_ROOT)
            self.cached_file_root = file_root

        extracted[defs.KEY_FILE_ROOT] = byte_converter.convert_to_string(file_root)

        for category in self.categories:
            relative_paths = reader.read(defs.LOC_FILES_PLACEHOLDER.format(category), subject_index_expr)
            extracted[defs.KEY_PLACEHOLDER_FILES.format(category)] = byte_converter.convert_to_string(relative_paths)
class SelectiveDataExtractor(Extractor):

    def __init__(self, selection=None, category: str = defs.KEY_LABELS) -> None:
        """Extracts data of a given category selectively.

        Adds :obj:`category` as key to :obj:`extracted`, as well as

        - :const:`pymia.data.definition.KEY_PLACEHOLDER_NAMES_SELECTED` with :obj:`selection` content

        Args:
            selection (str, tuple): Entries (e.g., "T1", "T2") within the category to select.
                If selection is None, the class has the same behaviour as the DataExtractor and
                selects all entries.
            category (str): The category (e.g. "images") to extract data from.
        """
        super().__init__()
        # lazily initialized on first extract() call (requires a reader)
        self.subject_entries = None

        # normalize a single entry name to a one-element tuple
        if isinstance(selection, str):
            selection = (selection,)
        self.selection = selection
        self.category = category
        self.names_extractor = None  # used in case that the names of the entries of the category are not extracted

    def extract(self, reader: rd.Reader, params: dict, extracted: dict) -> None:
        """see :meth:`.Extractor.extract`"""
        # Ensure the entry names of the category are available; extract them on demand if a
        # NamesExtractor was not part of the composed extraction chain.
        if defs.KEY_PLACEHOLDER_NAMES.format(self.category) not in extracted:
            if self.names_extractor is None:
                self.names_extractor = NamesExtractor(cache=True, categories=(self.category, ))
            self.names_extractor.extract(reader, {}, extracted)

        if self.subject_entries is None:
            self.subject_entries = reader.get_subject_entries()

        if not reader.has(defs.LOC_DATA_PLACEHOLDER.format(self.category)):
            raise ValueError(f'SelectiveDataExtractor requires {self.category} to exist')

        subject_index = params[defs.KEY_SUBJECT_INDEX]
        index_expr = params[defs.KEY_INDEX_EXPR]

        # read the subject's data chunk of this category
        index_str = self.subject_entries[subject_index]
        data = reader.read('{}/{}'.format(defs.LOC_DATA_PLACEHOLDER.format(self.category), index_str), index_expr)
        entry_names = extracted[defs.KEY_PLACEHOLDER_NAMES.format(self.category)]  # type: list

        if self.selection is None:
            # no selection: behave like DataExtractor and pass everything through
            extracted[self.category] = byte_converter.convert_to_string(data)
        else:
            # select only the requested entries along the last (channel) dimension
            selection_indices = np.array([entry_names.index(s) for s in self.selection])
            extracted[self.category] = np.take(data, selection_indices, axis=-1)
            if isinstance(data, list):
                # convert back to list
                extracted[self.category] = byte_converter.convert_to_string(extracted[self.category].tolist())
            extracted[defs.KEY_PLACEHOLDER_NAMES_SELECTED.format(self.category)] = list(self.selection)
class RandomDataExtractor(Extractor):

    def __init__(self, selection=None, category: str = defs.KEY_LABELS) -> None:
        """Extracts data of a given category randomly.

        Adds :obj:`category` as key to :obj:`extracted`, as well as

        - :const:`pymia.data.definition.KEY_PLACEHOLDER_NAMES_SELECTED` with :obj:`selection` content

        Args:
            selection (str, tuple): Entries (e.g., "T1", "T2") within the category to select
                an entry randomly from. If selection is None, an entry from all entries is
                randomly selected.
            category (str): The category (e.g. "images") to extract data from.
        """
        super().__init__()
        # lazily initialized on first extract() call (requires a reader)
        self.subject_entries = None

        # normalize a single entry name to a one-element tuple
        if isinstance(selection, str):
            selection = (selection,)
        self.selection = selection
        self.category = category
        self.names_extractor = None  # used in case that the names of the entries of the category are not extracted

    def extract(self, reader: rd.Reader, params: dict, extracted: dict) -> None:
        """see :meth:`.Extractor.extract`"""
        # Ensure the entry names of the category are available; extract them on demand if a
        # NamesExtractor was not part of the composed extraction chain.
        if defs.KEY_PLACEHOLDER_NAMES.format(self.category) not in extracted:
            if self.names_extractor is None:
                self.names_extractor = NamesExtractor(cache=True, categories=(self.category,))
            self.names_extractor.extract(reader, {}, extracted)

        if self.subject_entries is None:
            self.subject_entries = reader.get_subject_entries()

        if not reader.has(defs.LOC_DATA_PLACEHOLDER.format(self.category)):
            # bug fix: the message previously said 'SelectiveDataExtractor' (copy-paste from the
            # sibling class), which misattributed the failure
            raise ValueError(f'RandomDataExtractor requires {self.category} to exist')

        subject_index = params[defs.KEY_SUBJECT_INDEX]
        index_expr = params[defs.KEY_INDEX_EXPR]

        # read the subject's data chunk of this category
        index_str = self.subject_entries[subject_index]
        data = reader.read('{}/{}'.format(defs.LOC_DATA_PLACEHOLDER.format(self.category), index_str), index_expr)
        entry_names = extracted[defs.KEY_PLACEHOLDER_NAMES.format(self.category)]  # type: list

        # candidate indices to sample from: either the full range or the selected entries
        if self.selection is None:
            selection_indices = np.array(range(len(entry_names)))
        else:
            selection_indices = np.array([entry_names.index(s) for s in self.selection])

        random_index = [np.random.choice(selection_indices)]  # as list to keep the last dimension with np.take
        extracted[self.category] = np.take(data, random_index, axis=-1)
        if isinstance(data, list):
            # convert back to list
            extracted[self.category] = byte_converter.convert_to_string(extracted[self.category].tolist())
        extracted[defs.KEY_PLACEHOLDER_NAMES_SELECTED.format(self.category)] = [entry_names[random_index[0]]]
class ImagePropertyShapeExtractor(Extractor):

    def __init__(self, numpy_format: bool = True) -> None:
        """Extracts the shape image property of an image.

        Added key to :obj:`extracted`:

        - :const:`pymia.data.definition.KEY_SHAPE` with :obj:`tuple` content

        Args:
            numpy_format (bool): Whether the shape is numpy or ITK format
                (first and last dimension are swapped).
        """
        super().__init__()
        self.numpy_format = numpy_format

    def extract(self, reader: rd.Reader, params: dict, extracted: dict) -> None:
        """see :meth:`.Extractor.extract`"""
        subject_index_expr = expr.IndexExpression(params[defs.KEY_SUBJECT_INDEX])

        shape = reader.read(defs.LOC_IMGPROP_SHAPE, subject_index_expr)
        if self.numpy_format:
            # numpy and ITK order the axes oppositely; swap first and last dimension
            shape[0], shape[-1] = shape[-1], shape[0]

        extracted[defs.KEY_SHAPE] = tuple(shape.tolist())
class DataExtractor(Extractor):

    def __init__(self, categories=(defs.KEY_IMAGES, ), ignore_indexing: bool = False) -> None:
        """Extracts data of a given category.

        Adds :obj:`category` as key to :obj:`extracted`.

        Args:
            categories (tuple): Categories for which to extract the names.
            ignore_indexing (bool): Whether to ignore the indexing in :obj:`params`.
                This is useful when extracting entire images.
        """
        super().__init__()
        self.categories = categories
        self.ignore_indexing = ignore_indexing
        self.subject_entries = None  # lazily initialized on first extract() call

    def extract(self, reader: rd.Reader, params: dict, extracted: dict) -> None:
        """see :meth:`.Extractor.extract`"""
        if self.subject_entries is None:
            self.subject_entries = reader.get_subject_entries()

        index_str = self.subject_entries[params[defs.KEY_SUBJECT_INDEX]]
        index_expr = params[defs.KEY_INDEX_EXPR]

        for category in self.categories:
            location = '{}/{}'.format(defs.LOC_DATA_PLACEHOLDER.format(category), index_str)
            if self.ignore_indexing:
                # read the whole entry (e.g. the entire image)
                data = reader.read(location)
            else:
                data = reader.read(location, index_expr)
            extracted[category] = byte_converter.convert_to_string(data)
class PadDataExtractor(Extractor):

    def __init__(self, padding: typing.Union[tuple, typing.List[tuple]], extractor: Extractor, pad_fn=None):
        """Pads the data extracted by :obj:`extractor`

        Args:
            padding (tuple, list): Lengths of the tuple or the list must be equal to the number of
                dimensions of the extracted data. If tuple, values are considered as symmetric padding in
                each dimension. If list, the each entry must consist of a tuple indicating (left, right)
                padding for one dimension.
            extractor (.Extractor): The extractor performing the extraction of the data to be padded.
            pad_fn (callable, optional): Optional function performing the padding.
                Default is :meth:`PadDataExtractor.zero_pad`.
        """
        super().__init__()
        if not (hasattr(extractor, 'categories') or hasattr(extractor, 'category')):
            raise ValueError('argument extractor needs to have the property "categories" or "category"')

        self.extractor = extractor
        self.pad_fn = pad_fn
        if self.pad_fn is None:
            self.pad_fn = PadDataExtractor.zero_pad

        # symmetric padding: expand each int into a (left, right) pair
        if isinstance(padding[0], int):
            padding = [(pad, pad) for pad in padding]
        # per-dimension offsets to widen an index range: negative on the left bound,
        # positive on the right bound
        index_diffs = np.asarray(padding)
        index_diffs[:, 0] = -index_diffs[:, 0]
        self.index_diffs = index_diffs

    def extract(self, reader: rd.Reader, params: dict, extracted: dict) -> None:
        """see :meth:`.Extractor.extract`"""
        index_expr = params[defs.KEY_INDEX_EXPR]  # type: expr.IndexExpression

        # Make sure all indexing is done with slices (Example: (16,) will be changed to
        # (slice(16, 17, None),) which is equivalent), otherwise the following steps will be wrong.
        # NOTE(review): this mutates the IndexExpression held in params in place — callers
        # observing the same object will see the normalized form afterwards.
        if any(isinstance(s, int) for s in index_expr.expression):
            index_expr.set_indexing([slice(s, s + 1) if isinstance(s, int) else s for s in index_expr.expression])

        # widen the requested index range by the configured padding
        padded_indexing = np.asarray(index_expr.get_indexing()) + self.index_diffs
        padded_shape = tuple((padded_indexing[:, 1] - padded_indexing[:, 0]).tolist())

        # sub_indexing holds, per dimension, how far the padded range reaches outside the data
        # on the left side (i.e. the offset where the actually-read data must be placed)
        sub_indexing = padded_indexing.copy()
        sub_indexing[padded_indexing > 0] = 0
        sub_indexing = -sub_indexing

        padded_indexing[padded_indexing < 0] = 0  # cannot slice outside the boundary in negative (but positive works!)
        padded_index_expr = expr.IndexExpression(padded_indexing.tolist())

        # delegate the (clipped) padded extraction to the wrapped extractor
        padded_params = params.copy()
        padded_params[defs.KEY_INDEX_EXPR] = padded_index_expr
        self.extractor.extract(reader, padded_params, extracted)

        categories = self.extractor.categories if hasattr(self.extractor, 'categories') else [self.extractor.category]
        for category in categories:
            data = extracted[category]
            full_pad_shape = padded_shape + data.shape[len(padded_shape):]
            if full_pad_shape != data.shape:
                # we could not fully extract the padded shape (range clipped at a boundary),
                # use pad_fn to pad data
                extracted[category] = self.pad_fn(data, full_pad_shape, sub_indexing)

    @staticmethod
    def zero_pad(data: np.ndarray, pad_shape, sub_indexing):
        """Places :obj:`data` at offset :obj:`sub_indexing` inside a zero array of shape :obj:`pad_shape`."""
        pad_data = np.zeros(pad_shape, dtype=data.dtype)

        # turn the per-dimension start offsets into (start, end) ranges matching data's extent
        sub_indexing[:, 1] = sub_indexing[:, 0] + data.shape[:sub_indexing.shape[0]]
        sub_index_expr = expr.IndexExpression(sub_indexing.tolist())

        pad_data[sub_index_expr.expression] = data
        return pad_data
class FilesystemDataExtractor(Extractor):

    @staticmethod
    def _load_stik(file_path: str, category: str):
        # Default loader: read an image from disk via SimpleITK and return it as numpy array.
        # (name keeps the historical "stik" typo for backward compatibility)
        return sitk.GetArrayFromImage(sitk.ReadImage(file_path))

    def __init__(self, categories=(defs.KEY_IMAGES, ), load_fn=None, ignore_indexing: bool = False,
                 override_file_root=None) -> None:
        """Extracts data of a given category directly from the file system.

        Adds :obj:`category` as key to :obj:`extracted`.

        Args:
            categories (tuple): Categories for which to extract the names.
            load_fn (callable): Callable that loads a file given the file path and the category,
                and returns a numpy.ndarray.
            ignore_indexing (bool): Whether to ignore the indexing in :obj:`params`. This is useful
                when extracting entire images.
            override_file_root (str, optional): If given, used as file root instead of the one
                stored in the dataset.
        """
        super().__init__()
        self.categories = categories
        self.load_fn = self._load_stik if load_fn is None else load_fn
        self.ignore_indexing = ignore_indexing
        self.cached_file_root = override_file_root  # read lazily from the dataset if not overridden

    def extract(self, reader: rd.Reader, params: dict, extracted: dict) -> None:
        """see :meth:`.Extractor.extract`"""
        index_expr = params[defs.KEY_INDEX_EXPR]  # type: expr.IndexExpression
        subject_index_expr = expr.IndexExpression(params[defs.KEY_SUBJECT_INDEX])

        if self.cached_file_root is None:
            self.cached_file_root = byte_converter.convert_to_string(reader.read(defs.LOC_FILES_ROOT))
        file_root = self.cached_file_root

        for category in self.categories:
            relative_paths = byte_converter.convert_to_string(
                reader.read(defs.LOC_FILES_PLACEHOLDER.format(category), subject_index_expr))

            # load every file of the category and stack them along a new last (channel) dimension
            loaded = [self.load_fn(os.path.join(file_root, relative_path), category)
                      for relative_path in relative_paths]
            data = np.stack(loaded, axis=-1)

            if not self.ignore_indexing:
                data = data[index_expr.expression]
            extracted[category] = data