Source code for nanshe.learner

"""
The module ``learner`` provides the ability to run the segmentation algorithm.

===============================================================================
Overview
===============================================================================
The ``main`` function starts the algorithm and can
be called externally. The module allows for running multiple jobs through
|subprocess|_ or |drmaa|_. Configuration files for the learner are
provided in the examples_ and are entitled ``learner``.

.. |subprocess| replace:: ``subprocess``
.. _subprocess: http://docs.python.org/2/library/subprocess.html
.. |drmaa| replace:: ``drmaa``
.. _drmaa: http://github.com/pygridtools/drmaa-python
.. _examples: http://github.com/nanshe-org/nanshe/tree/master/examples
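
For example, ``main`` can be invoked directly with ``sys.argv``-style
arguments (the file names below are hypothetical)::

    from nanshe.learner import main
    main("learner", "learner.json", "data.h5/images", "output.h5/")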

===============================================================================
API
===============================================================================
"""


__author__ = "John Kirkham <kirkhamj@janelia.hhmi.org>"
__date__ = "$Apr 09, 2014 16:00:40 EDT$"


import os
import json
import itertools
import multiprocessing
import subprocess
import time

# Generally useful and fast to import so done immediately.
import numpy

import h5py

import nanshe

# Needed in order to have logging information no matter what.
from nanshe.util import prof

from nanshe.util import iters, xnumpy, wrappers

from nanshe.io import hdf5

# Functions to process image data.
from nanshe.imp import segment

# For IO. Right now, just includes read_parameters for reading a config file.
from nanshe.io import xjson


# Get the loggers
trace_logger = prof.getTraceLogger(__name__)
logger = prof.logging.getLogger(__name__)



@prof.log_call(trace_logger)
def generate_neurons_io_handler(input_filename,
                                output_filename,
                                parameters_filename):
    """
        Uses generate_neurons to process an input_filename (HDF5 dataset) and
        outputs results to an output_filename (HDF5 dataset).

        Args:
            input_filename          HDF5 filename to read from (should be a
                                    path to an h5py.Dataset)

            output_filename         HDF5 filename to write to (should be a
                                    path to an h5py.Group)

            parameters_filename     JSON filename with parameters.
    """

    # Extract and validate file extensions.

    # Parse parameter filename and validate that the name is acceptable.
    parameters_filename_ext = os.path.splitext(parameters_filename)[1]
    # Clean up the extension so it fits the standard.
    parameters_filename_ext = parameters_filename_ext.lower().lstrip(os.extsep)

    if parameters_filename_ext not in ["json"]:
        raise Exception(
            "Parameter file with filename: \"" + parameters_filename + "\"" +
            " provided with an unknown file extension: \"" +
            parameters_filename_ext + "\". If it is a " +
            "supported format, please run the given file through " +
            "nanshe_converter first before proceeding."
        )

    # Parse the parameters from the json file.
    parameters = xjson.read_parameters(parameters_filename)

    if (len(parameters) == 1) and ("generate_neurons_blocks" in parameters):
        generate_neurons_blocks(
            input_filename,
            output_filename,
            **parameters["generate_neurons_blocks"]
        )
    else:
        generate_neurons_a_block(input_filename, output_filename, **parameters)
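
# A sketch of using `generate_neurons_io_handler` above directly from Python
# (the file names are hypothetical). The parameters file must end in ".json";
# if the config consists solely of a "generate_neurons_blocks" key, the
# blocked code path is taken, otherwise a single block is processed.
#
#     generate_neurons_io_handler(
#         "data.h5/images",   # path to an h5py.Dataset
#         "results.h5/",      # path to an h5py.Group
#         "learner.json"      # JSON parameter file
#     )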


@prof.log_call(trace_logger)
def generate_neurons_a_block(input_filename,
                             output_filename,
                             debug=False,
                             **parameters):
    """
        Uses generate_neurons to process an input_filename (HDF5 dataset) and
        outputs results to an output_filename (HDF5 dataset).

        Args:
            input_filename          HDF5 filename to read from (should be a
                                    path to an h5py.Dataset)

            output_filename         HDF5 filename to write to (should be a
                                    path to an h5py.Group)

            parameters              how the run should be configured.
    """

    # Extract and validate file extensions.

    # Parse input filename and validate that the name is acceptable.
    input_filename_ext, input_dataset_name = \
        hdf5.serializers.split_hdf5_path(input_filename)

    # Parse output filename and validate that the name is acceptable.
    output_filename_ext, output_group_name = \
        hdf5.serializers.split_hdf5_path(output_filename)

    # Read the input data.
    original_images = None
    with h5py.File(input_filename_ext, "r") as input_file_handle:
        original_images = \
            hdf5.serializers.read_numpy_structured_array_from_HDF5(
                input_file_handle, input_dataset_name
            )
        original_images = original_images.astype(numpy.float32)

    # Write out the output.
    with h5py.File(output_filename_ext, "a") as output_file_handle:
        # Create a new output group if it doesn't exist.
        output_file_handle.require_group(output_group_name)

        # Group where all data will be stored.
        output_group = output_file_handle[output_group_name]

        # Create a soft link to the original images. But use the appropriate
        # type of soft link depending on whether the input and output file
        # are the same.
        if "original_images" not in output_group:
            if input_filename_ext == output_filename_ext:
                output_group["original_images"] = h5py.SoftLink(
                    input_dataset_name
                )
            else:
                output_group["original_images"] = h5py.ExternalLink(
                    input_filename_ext,
                    input_dataset_name
                )

        # Get a debug logger for the HDF5 file (if needed).
        array_debug_recorder = hdf5.record.generate_HDF5_array_recorder(
            output_group,
            group_name="debug",
            enable=debug,
            overwrite_group=False,
            recorder_constructor=hdf5.record.HDF5EnumeratedArrayRecorder
        )

        # Saves intermediate results to make resuming easier.
        resume_logger = hdf5.record.generate_HDF5_array_recorder(
            output_group,
            recorder_constructor=hdf5.record.HDF5ArrayRecorder,
            overwrite=True
        )

        # Generate the neurons and attempt to resume if possible.
        generate_neurons.resume_logger = resume_logger
        generate_neurons.recorders.array_debug_recorder = array_debug_recorder
        generate_neurons(
            original_images=original_images,
            **parameters["generate_neurons"]
        )

        # Save the configuration parameters in the attributes as a string.
        if "parameters" not in output_group.attrs:
            output_group.attrs["parameters"] = repr(
                dict(list(parameters.items()) + [("debug", debug)])
            )
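
# Note on the path convention used by `split_hdf5_path` above: a single
# string names both the external HDF5 file and the internal dataset or
# group. A minimal sketch of the expected split (hypothetical path; the
# exact internal-path normalization is determined by
# `hdf5.serializers.split_hdf5_path`):
#
#     "data.h5/images/raw"  ->  ("data.h5", "/images/raw")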


@prof.log_call(trace_logger)
def generate_neurons_blocks(input_filename,
                            output_filename,
                            num_processes=multiprocessing.cpu_count(),
                            block_shape=None,
                            num_blocks=None,
                            half_window_shape=None,
                            half_border_shape=None,
                            use_drmaa=False,
                            num_drmaa_cores=16,
                            debug=False,
                            **parameters):
    # TODO: Move function into new module with its own command line interface.
    # TODO: Heavy refactoring required on this function.

    # Extract and validate file extensions.

    # Parse input filename and validate that the name is acceptable.
    input_filename_ext, input_dataset_name = \
        hdf5.serializers.split_hdf5_path(input_filename)

    # Parse output filename and validate that the name is acceptable.
    output_filename_ext, output_group_name = \
        hdf5.serializers.split_hdf5_path(output_filename)

    # Directory where individual block runs will be stored.
    intermediate_output_dir = output_filename_ext.rsplit(
        os.path.splitext(output_filename_ext)[1], 1
    )[0] + "_blocks"

    # Read the shape of the input data.
    original_images_shape_array = None
    with h5py.File(input_filename_ext, "r") as input_file_handle:
        original_images_shape_array = numpy.array(
            input_file_handle[input_dataset_name].shape
        )

    # Get the amount of the border to slice.
    half_border_shape_array = None
    if half_border_shape is None:
        half_border_shape_array = numpy.zeros(
            len(original_images_shape_array), dtype=int
        )
    else:
        assert (len(half_border_shape) == len(original_images_shape_array))

        half_border_shape_array = numpy.array(half_border_shape)

        # Should be of type integer.
        assert (issubclass(half_border_shape_array.dtype.type, numpy.integer))

        # Should not cut along temporal portion.
        # Maybe replace with a warning.
        assert (half_border_shape[0] == 0)

    # TODO: Refactor to expanded_numpy.
    # Cuts boundaries from original_images_shape.
    original_images_pared_shape_array = original_images_shape_array - \
        2 * half_border_shape_array

    # At least one of them must be specified. If not, some mixture of both.
    assert ((block_shape is not None) or (num_blocks is not None))

    # Size of the block to use (in pixels).
    block_shape_array = None
    block_shape_array_undefined = None
    if block_shape is None:
        block_shape_array = -numpy.ones(
            original_images_pared_shape_array.shape, dtype=int
        )
        block_shape_array_undefined = numpy.ones(
            original_images_pared_shape_array.shape, dtype=bool
        )
    else:
        # Should have the same number of values in each.
        assert (len(original_images_pared_shape_array) == len(block_shape))

        block_shape_array = numpy.array(block_shape, dtype=int)

        # Should be of type integer.
        assert issubclass(block_shape_array.dtype.type, numpy.integer)

        block_shape_array_undefined = (block_shape_array == -1)

    # Number of blocks to use along each dimension.
    num_blocks_array = None
    num_blocks_array_undefined = None
    if num_blocks is None:
        num_blocks_array = -numpy.ones(
            original_images_pared_shape_array.shape, dtype=int
        )
        num_blocks_array_undefined = numpy.ones(
            original_images_pared_shape_array.shape, dtype=bool
        )
    else:
        # Should have the same number of values in each.
        assert (len(original_images_pared_shape_array) == len(num_blocks))

        num_blocks_array = numpy.array(num_blocks, dtype=int)

        # Should be of type integer.
        assert issubclass(num_blocks_array.dtype.type, numpy.integer)

        num_blocks_array_undefined = (num_blocks_array == -1)
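
    # Illustration of the resolution performed below (hypothetical sizes):
    # for a pared stack of shape (1000, 512, 512) with num_blocks=(1, 2, 2)
    # and block_shape unspecified, divmod fills in
    # block_shape_array = (1000, 256, 256). Conversely, giving only
    # block_shape=(1000, 256, 256) yields num_blocks_array = (1, 2, 2);
    # when a block shape does not divide the stack evenly, the block count
    # is rounded up and some blocks end up smaller.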

    # Want to ensure that both aren't fully defined.
    assert ~(~block_shape_array_undefined & ~num_blocks_array_undefined).all()

    # If both are undefined, then the block should span that dimension.
    missing_both = (block_shape_array_undefined & num_blocks_array_undefined)
    block_shape_array[missing_both] = \
        original_images_pared_shape_array[missing_both]
    num_blocks_array[missing_both] = 1

    # Thus, we have resolved these values and can continue.
    block_shape_array_undefined[missing_both] = False
    num_blocks_array_undefined[missing_both] = False

    # Replace undefined values in block_shape_array.
    missing_block_shape_array, block_shape_array_remainder = divmod(
        original_images_pared_shape_array[block_shape_array_undefined],
        num_blocks_array[block_shape_array_undefined]
    )
    # Block shape must be well defined.
    assert (block_shape_array_remainder == 0).all()
    missing_block_shape_array = missing_block_shape_array.astype(int)
    block_shape_array[block_shape_array_undefined] = missing_block_shape_array

    # Replace undefined values in num_blocks_array.
    missing_num_blocks_array, num_blocks_array_remainder = divmod(
        original_images_pared_shape_array[num_blocks_array_undefined],
        block_shape_array[num_blocks_array_undefined]
    )
    # Allow some blocks to be smaller.
    missing_num_blocks_array += (num_blocks_array_remainder != 0).astype(int)
    num_blocks_array[num_blocks_array_undefined] = missing_num_blocks_array

    # Get the overlap window.
    half_window_shape_array = None
    if half_window_shape is None:
        half_window_shape_array = block_shape_array / 2.0
    else:
        assert (len(half_window_shape) ==
                len(original_images_pared_shape_array))

        half_window_shape_array = numpy.array(half_window_shape)

        assert issubclass(half_window_shape_array.dtype.type, numpy.integer)

    # Want to make sure our window size is at least as large as the one used
    # for the f0 calculation.
    if "extract_f0" in parameters["generate_neurons"]["preprocess_data"]:
        #assert (parameters["generate_neurons"]["preprocess_data"]["extract_f0"]["half_window_size"] == half_window_shape_array[0])
        assert (parameters["generate_neurons"]["preprocess_data"]["extract_f0"]["half_window_size"] <=
                half_window_shape_array[0])

    # Estimate bounds for each slice. Uses typical Python [begin, end) for
    # the indices.
    estimated_bounds = numpy.zeros(
        tuple(num_blocks_array),
        dtype=(int, original_images_pared_shape_array.shape + (2,))
    )

    for each_block_indices in iters.index_generator(*num_blocks_array):
        for each_dim, each_block_dim_index in enumerate(each_block_indices):
            estimated_lower_bound = \
                each_block_dim_index * block_shape_array[each_dim]
            estimated_upper_bound = \
                (each_block_dim_index + 1) * block_shape_array[each_dim]

            estimated_bounds[each_block_indices][each_dim] = numpy.array([
                estimated_lower_bound, estimated_upper_bound
            ])

    original_images_pared_slices = numpy.zeros(
        estimated_bounds.shape[:-2],
        dtype=[
            ("actual", int, estimated_bounds.shape[-2:]),
            ("windowed", int, estimated_bounds.shape[-2:]),
            ("windowed_stack_selection", int, estimated_bounds.shape[-2:]),
            ("windowed_block_selection", int, estimated_bounds.shape[-2:])
        ]
    )

    # Get the slice that is within bounds.
    original_images_pared_slices["actual"] = estimated_bounds
    original_images_pared_slices["actual"][..., 0] = numpy.where(
        0 < original_images_pared_slices["actual"][..., 0],
        original_images_pared_slices["actual"][..., 0],
        0
    )
    original_images_pared_slices["actual"][..., 1] = numpy.where(
        original_images_pared_slices["actual"][..., 1] <
        original_images_pared_shape_array,
        original_images_pared_slices["actual"][..., 1],
        original_images_pared_shape_array
    )

    # Gets the defined half_window_size.
    window_addition = numpy.zeros(estimated_bounds.shape, dtype=int)
    window_addition[..., 0] = -half_window_shape_array
    window_addition[..., 1] = half_window_shape_array

    # Get the slice with a window added.
    original_images_pared_slices["windowed"] = \
        estimated_bounds + window_addition
    original_images_pared_slices["windowed"][..., 0] = numpy.where(
        0 < original_images_pared_slices["windowed"][..., 0],
        original_images_pared_slices["windowed"][..., 0],
        0
    )
    original_images_pared_slices["windowed"][..., 1] = numpy.where(
        original_images_pared_slices["windowed"][..., 1] <
        original_images_pared_shape_array,
        original_images_pared_slices["windowed"][..., 1],
        original_images_pared_shape_array
    )

    # Get the slice information to get the windowed block from the original
    # image stack.
    original_images_pared_slices["windowed_stack_selection"] = \
        original_images_pared_slices["windowed"]
    original_images_pared_slices["windowed_stack_selection"] += \
        xnumpy.expand_view(half_border_shape_array, reps_after=2)

    # Get slice information for the portion within
    # `original_images_pared_slices["windowed"]`, which corresponds to
    # `original_images_pared_slices["actual"]`.
    #original_images_pared_slices["windowed_block_selection"][..., 0] = 0
    original_images_pared_slices["windowed_block_selection"][..., 1] = (
        original_images_pared_slices["actual"][..., 1] -
        original_images_pared_slices["actual"][..., 0]
    )
    original_images_pared_slices["windowed_block_selection"][:] += \
        xnumpy.expand_view(
            original_images_pared_slices["actual"][..., 0] -
            original_images_pared_slices["windowed"][..., 0],
            reps_after=2
        )

    # Get a directory for intermediate results.
    try:
        os.mkdir(intermediate_output_dir)
    except OSError:
        # If it already exists, that is fine.
        pass

    intermediate_config = intermediate_output_dir + "/" + "config.json"

    # Always overwrite the config file.
    with open(intermediate_config, "w") as fid:
        json.dump(
            dict(list(parameters.items()) + list({"debug": debug}.items())),
            fid,
            indent=4,
            separators=(",", " : ")
        )
        fid.write("\n")

    # Construct an HDF5 file for each block.
    input_filename_block = []
    output_filename_block = []
    stdout_filename_block = []
    stderr_filename_block = []
    with h5py.File(output_filename_ext, "a") as output_file_handle:
        # Create a new output group if it doesn't exist.
        output_file_handle.require_group(output_group_name)

        output_group = output_file_handle[output_group_name]

        if "original_images" not in output_group:
            if input_filename_ext == output_filename_ext:
                output_group["original_images"] = h5py.SoftLink(
                    input_dataset_name
                )
            else:
                output_group["original_images"] = h5py.ExternalLink(
                    input_filename_ext,
                    "/" + input_dataset_name
                )

        output_group.require_group("blocks")
        output_group_blocks = output_group["blocks"]

        input_file_handle = None
        try:
            # Skipping using region refs.
            input_file_handle = h5py.File(input_filename_ext, "r")
        except IOError:
            # File is already open.
            input_file_handle = output_file_handle

        for i, i_str, sequential_block_i in iters.filled_stringify_enumerate(
                original_images_pared_slices.flat):
            intermediate_basename_i = intermediate_output_dir + "/" + i_str

            # Hold redirected stdout and stderr for each subprocess.
            stdout_filename_block.append(
                intermediate_basename_i + os.extsep + "out"
            )
            stderr_filename_block.append(
                intermediate_basename_i + os.extsep + "err"
            )

            # Ensure that the blocks are corrected to deal with trimming of
            # the image stack. Must be done after the calculation of
            # original_images_pared_slices["windowed_block_selection"].
            sequential_block_i_windowed = \
                sequential_block_i["windowed_stack_selection"]
            slice_i = tuple(
                slice(_1, _2, 1) for _1, _2 in sequential_block_i_windowed
            )

            if i_str not in output_group_blocks:
                output_group_blocks[i_str] = []
                output_group_blocks[i_str].attrs["filename"] = \
                    input_file_handle.filename
                output_group_blocks[i_str].attrs["dataset"] = \
                    input_dataset_name
                output_group_blocks[i_str].attrs["slice"] = str(slice_i)

            block_i = output_group_blocks[i_str]

            with h5py.File(intermediate_basename_i + os.extsep + "h5", "a") \
                    as each_block_file_handle:
                # Create a soft link to the original images. But use the
                # appropriate type of soft link depending on whether the
                # input and output file are the same.
                if "original_images" not in each_block_file_handle:
                    each_block_file_handle["original_images"] = \
                        h5py.ExternalLink(
                            os.path.relpath(
                                block_i.file.filename,
                                intermediate_output_dir
                            ),
                            block_i.name
                        )

                input_filename_block.append(
                    each_block_file_handle.filename + "/" + "original_images"
                )
                output_filename_block.append(
                    each_block_file_handle.filename + "/"
                )

        if input_file_handle != output_file_handle:
            input_file_handle.close()

    cur_module_dirpath = os.path.dirname(os.path.dirname(nanshe.__file__))
    cur_module_filepath = os.path.splitext(os.path.abspath(__file__))[0]
    cur_module_name = os.path.relpath(cur_module_filepath, cur_module_dirpath)
    cur_module_name = cur_module_name.replace(os.path.sep, ".")
    cur_module_filepath += os.extsep + "py"

    import sys

    python = sys.executable

    executable_run = ""
    executable_run += "from sys import argv, path, exit; "

    executable_run += "path[:] = [\"%s\"] + [_ for _ in path if _ != \"%s\"]; " % \
        (cur_module_dirpath, cur_module_dirpath,)
    executable_run += "from %s import main; exit(main(*argv))" % \
        (cur_module_name,)

    block_process_args_gen = iters.izip(
        itertools.repeat(python),
        itertools.repeat("-c"),
        itertools.repeat(executable_run),
        itertools.repeat(intermediate_config),
        input_filename_block,
        output_filename_block,
        stdout_filename_block,
        stderr_filename_block
    )

    if use_drmaa:
        # Attempt to import drmaa.
        # If it fails to import, either the user has no intent in using it or
        # forgot to install it. If it imports, but fails to find symbols,
        # then the user has not set DRMAA_LIBRARY_PATH or does not have
        # libdrmaa.so.
        try:
            import drmaa
        except ImportError:
            # python-drmaa is not installed.
            logger.error(
                "Was not able to import drmaa. " +
                "If this is meant to be run using the OpenGrid submission " +
                "system, then drmaa needs to be installed via pip or " +
                "easy_install."
            )
            raise
        except RuntimeError:
            # The drmaa library was not specified, but python-drmaa is
            # installed.
            logger.error(
                "Was able to import drmaa. " +
                "However, the drmaa library could not be found. Please " +
                "either specify the location of libdrmaa.so using the " +
                "DRMAA_LIBRARY_PATH environment variable or disable/remove " +
                "use_drmaa from the config file."
            )
            raise

        s = drmaa.Session()
        s.initialize()

        ready_processes = []
        for each_arg_pack in block_process_args_gen:
            ready_processes.append((each_arg_pack, s.createJobTemplate()))
            ready_processes[-1][1].jobName = os.path.basename(
                os.path.splitext(cur_module_filepath)[0]
            ) + "-" + os.path.basename(
                os.path.dirname(each_arg_pack[3].split(".h5")[0])
            ) + "-" + os.path.basename(each_arg_pack[3].split(".h5")[0])
            ready_processes[-1][1].remoteCommand = each_arg_pack[0]
            ready_processes[-1][1].args = each_arg_pack[1:-2]
            ready_processes[-1][1].jobEnvironment = os.environ
            ready_processes[-1][1].inputPath = "localhost:" + os.devnull
            ready_processes[-1][1].outputPath = \
                "localhost:" + each_arg_pack[-2]
            ready_processes[-1][1].errorPath = \
                "localhost:" + each_arg_pack[-1]
            ready_processes[-1][1].workingDirectory = os.getcwd()
            ready_processes[-1][1].nativeSpecification = \
                "-pe batch " + str(num_drmaa_cores)

        running_processes = []
        for each_arg_pack, each_process_template in ready_processes:
            each_process_id = s.runJob(each_process_template)
            running_processes.append(
                (each_arg_pack, each_process_id, each_process_template)
            )
            logger.info(
                "Started new process ( \"" + " ".join(each_arg_pack) + "\" )."
            )

        start_queue_time = time.time()
        logger.info("Waiting for queued jobs to complete.")

        #finished_processes = []
        for each_arg_pack, each_process_id, each_process_template in \
                running_processes:
            each_process_status = s.wait(each_process_id)

            if not each_process_status.hasExited:
                raise RuntimeError(
                    "The process (\"" + " ".join(each_arg_pack) +
                    "\") has exited prematurely."
                )

            logger.info(
                "Finished process ( \"" + " ".join(each_arg_pack) + "\" )."
            )
            s.deleteJobTemplate(each_process_template)
            #finished_processes.append((each_arg_pack, each_process_id))

        s.exit()

        end_queue_time = time.time()
        diff_queue_time = end_queue_time - start_queue_time

        logger.info(
            "Run time for queued jobs to complete is \"" +
            str(diff_queue_time) + " s\"."
        )
    else:
        # TODO: Refactor into a separate class (have it return futures
        # somehow).
        #finished_processes = []
        running_processes = []
        pool_tasks_empty = False
        while (not pool_tasks_empty) or len(running_processes):
            while (not pool_tasks_empty) and \
                    (len(running_processes) < num_processes):
                try:
                    each_arg_pack = next(block_process_args_gen)
                    each_arg_pack, each_stdout_filename, each_stderr_filename = \
                        each_arg_pack[:-2], each_arg_pack[-2], each_arg_pack[-1]
                    each_process = subprocess.Popen(
                        each_arg_pack,
                        stdout=open(each_stdout_filename, "w"),
                        stderr=open(each_stderr_filename, "w")
                    )

                    running_processes.append((each_arg_pack, each_process,))

                    logger.info(
                        "Started new process ( \"" +
                        " ".join(each_arg_pack) + "\" )."
                    )
                except StopIteration:
                    pool_tasks_empty = True

            while ((not pool_tasks_empty) and
                   (len(running_processes) >= num_processes)) or \
                    (pool_tasks_empty and len(running_processes)):
                time.sleep(1)

                i = 0
                while i < len(running_processes):
                    if running_processes[i][1].poll() is not None:
                        logger.info(
                            "Finished process ( \"" +
                            " ".join(running_processes[i][0]) + "\" )."
                        )
                        #finished_processes.append(running_processes[i])
                        del running_processes[i]
                    else:
                        time.sleep(1)
                        i += 1

        # finished_processes = None

    start_time = time.time()
    logger.info("Starting merge over all blocks.")

    with h5py.File(output_filename_ext, "a") as output_file_handle:
        output_group = output_file_handle[output_group_name]

        new_neurons_set = segment.get_empty_neuron(
            shape=tuple(original_images_shape_array[1:]), dtype=float
        )

        for i, i_str, (output_filename_block_i, sequential_block_i) in \
                iters.filled_stringify_enumerate(
                    iters.izip(
                        output_filename_block,
                        original_images_pared_slices.flat
                    )
                ):
            windowed_slice_i = tuple(
                slice(_1, _2, 1) for _1, _2 in [(None, None)] +
                sequential_block_i["windowed_stack_selection"].tolist()[1:]
            )
            window_trimmed_i = tuple(
                slice(_1, _2, 1) for _1, _2 in
                sequential_block_i["windowed_block_selection"].tolist()
            )
            output_filename_block_i = output_filename_block_i.rstrip("/")

            with h5py.File(output_filename_block_i, "r") as \
                    each_block_file_handle:
                if "neurons" in each_block_file_handle:
                    neurons_block_i_smaller = \
                        hdf5.serializers.read_numpy_structured_array_from_HDF5(
                            each_block_file_handle, "/neurons"
                        )

                    neurons_block_i_windowed_count = numpy.squeeze(
                        numpy.apply_over_axes(
                            numpy.sum,
                            neurons_block_i_smaller["mask"].astype(float),
                            tuple(iters.irange(
                                1, neurons_block_i_smaller["mask"].ndim
                            ))
                        )
                    )
                    if neurons_block_i_windowed_count.shape == tuple():
                        neurons_block_i_windowed_count = numpy.array(
                            [neurons_block_i_windowed_count]
                        )

                    neurons_block_i_non_windowed_count = numpy.squeeze(
                        numpy.apply_over_axes(
                            numpy.sum,
                            neurons_block_i_smaller["mask"][window_trimmed_i].astype(float),
                            tuple(iters.irange(
                                1, neurons_block_i_smaller["mask"].ndim
                            ))
                        )
                    )
                    if neurons_block_i_non_windowed_count.shape == tuple():
                        neurons_block_i_non_windowed_count = numpy.array(
                            [neurons_block_i_non_windowed_count]
                        )

                    if len(neurons_block_i_non_windowed_count):
                        # Find ones that are inside the margins by more than
                        # half.
                        neurons_block_i_acceptance = (
                            (neurons_block_i_non_windowed_count /
                             neurons_block_i_windowed_count) > 0.5
                        )

                        logger.info(
                            "Accepted the following neurons %s from block %s." % (
                                str(neurons_block_i_acceptance.nonzero()[0].tolist()),
                                i_str
                            )
                        )

                        # Take a subset of our previous neurons that are
                        # within the margins by half.
                        neurons_block_i_accepted = \
                            neurons_block_i_smaller[neurons_block_i_acceptance]

                        neurons_block_i = numpy.zeros(
                            neurons_block_i_accepted.shape,
                            dtype=new_neurons_set.dtype
                        )
                        neurons_block_i["mask"][windowed_slice_i] = \
                            neurons_block_i_accepted["mask"]
                        neurons_block_i["contour"][windowed_slice_i] = \
                            neurons_block_i_accepted["contour"]
                        neurons_block_i["image"][windowed_slice_i] = \
                            neurons_block_i_accepted["image"]

                        # Copy other properties.
                        neurons_block_i["area"] = \
                            neurons_block_i_accepted["area"]
                        neurons_block_i["max_F"] = \
                            neurons_block_i_accepted["max_F"]
                        neurons_block_i["gaussian_mean"] = \
                            neurons_block_i_accepted["gaussian_mean"]
                        neurons_block_i["gaussian_cov"] = \
                            neurons_block_i_accepted["gaussian_cov"]
                        # TODO: Correct centroid to larger block position.
neurons_block_i["centroid"] = neurons_block_i_accepted["centroid"] neurons_block_i["centroid"] += sequential_block_i["windowed_stack_selection"][1:, 0] array_debug_recorder = hdf5.record.generate_HDF5_array_recorder( output_group, group_name="debug", enable=debug, overwrite_group=False, recorder_constructor=hdf5.record.HDF5EnumeratedArrayRecorder ) segment.merge_neuron_sets.recorders.array_debug_recorder = array_debug_recorder new_neurons_set = segment.merge_neuron_sets( new_neurons_set, neurons_block_i, **parameters["generate_neurons"]["postprocess_data"]["merge_neuron_sets"] ) else: logger.info( "Accepted the following neurons %s from block %s." % ( str([]), i_str ) ) else: logger.info( "No neurons accepted as none were found for block" " %s." % i_str ) hdf5.serializers.create_numpy_structured_array_in_HDF5( output_group, "neurons", new_neurons_set, overwrite=True) if "parameters" not in output_group["neurons"].attrs: output_group["neurons"].attrs["parameters"] = repr(dict( list(parameters.items()) + [("block_shape", block_shape), ("num_blocks", num_blocks), ("half_window_shape", half_window_shape), ("half_border_shape", half_border_shape), ("use_drmaa", use_drmaa), ("num_drmaa_cores", num_drmaa_cores), ("debug", debug)] )) logger.info("Finished merge over all blocks.") end_time = time.time() diff_time = end_time - start_time logger.info( "Run time for merge over all blocks is \"" + str(diff_time) + " s\"." )


@prof.log_call(trace_logger)
@hdf5.record.static_subgrouping_array_recorders(
    array_debug_recorder=hdf5.record.EmptyArrayRecorder()
)
@wrappers.static_variables(resume_logger=hdf5.record.EmptyArrayRecorder())
def generate_neurons(original_images, run_stage="all", **parameters):
    if "original_images_max_projection" not in \
            generate_neurons.recorders.array_debug_recorder:
        generate_neurons.recorders.array_debug_recorder["original_images_max_projection"] = \
            xnumpy.add_singleton_op(
                numpy.max,
                original_images,
                axis=0
            )

    if "original_images_mean_projection" not in \
            generate_neurons.recorders.array_debug_recorder:
        generate_neurons.recorders.array_debug_recorder["original_images_mean_projection"] = \
            xnumpy.add_singleton_op(
                numpy.mean,
                original_images,
                axis=0
            )

    # Preprocess images.
    new_preprocessed_images = generate_neurons.resume_logger.get(
        "preprocessed_images", None
    )
    if (new_preprocessed_images is None) or \
            (run_stage == "preprocessing") or \
            (run_stage == "all"):
        new_preprocessed_images = original_images.copy()
        segment.preprocess_data.recorders.array_debug_recorder = \
            generate_neurons.recorders.array_debug_recorder
        new_preprocessed_images = segment.preprocess_data(
            new_preprocessed_images,
            out=new_preprocessed_images,
            **parameters["preprocess_data"]
        )
        generate_neurons.resume_logger["preprocessed_images"] = \
            new_preprocessed_images

    if "preprocessed_images_max_projection" not in \
            generate_neurons.recorders.array_debug_recorder:
        generate_neurons.recorders.array_debug_recorder["preprocessed_images_max_projection"] = \
            xnumpy.add_singleton_op(
                numpy.max,
                new_preprocessed_images,
                axis=0
            )

    if run_stage == "preprocessing":
        return

    # Find the dictionary.
    new_dictionary = generate_neurons.resume_logger.get("dictionary", None)
    if (new_dictionary is None) or \
            (run_stage == "dictionary") or \
            (run_stage == "all"):
        segment.generate_dictionary.recorders.array_debug_recorder = \
            generate_neurons.recorders.array_debug_recorder
        new_dictionary = segment.generate_dictionary(
            new_preprocessed_images,
            **parameters["generate_dictionary"]
        )
        generate_neurons.resume_logger["dictionary"] = new_dictionary

    if "dictionary_max_projection" not in \
            generate_neurons.recorders.array_debug_recorder:
        generate_neurons.recorders.array_debug_recorder["dictionary_max_projection"] = \
            xnumpy.add_singleton_op(
                numpy.max,
                new_dictionary,
                axis=0
            )

    if run_stage == "dictionary":
        return

    # Find the neurons.
    new_neurons = generate_neurons.resume_logger.get("neurons", None)
    if (new_neurons is None) or \
            (run_stage == "postprocessing") or \
            (run_stage == "all"):
        segment.postprocess_data.recorders.array_debug_recorder = \
            generate_neurons.recorders.array_debug_recorder
        new_neurons = segment.postprocess_data(
            new_dictionary,
            **parameters["postprocess_data"]
        )

        if new_neurons.size:
            generate_neurons.resume_logger["neurons"] = new_neurons

    if new_neurons.size == 0:
        logger.warning("No neurons were found in the data.")
    else:
        logger.info(
            "Found \"" + str(len(new_neurons)) + "\" neurons in the data."
        )
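
# `generate_neurons` above can also be driven one stage at a time; completed
# stages are reloaded through the resume logger when one is attached (as in
# `generate_neurons_a_block`), and recomputed otherwise. A minimal sketch
# (assumes `images` is a float32 stack with time as the first axis and
# `config` is the "generate_neurons" section of the parameters):
#
#     generate_neurons(images, run_stage="preprocessing", **config)
#     generate_neurons(images, run_stage="dictionary", **config)
#     generate_neurons(images, run_stage="postprocessing", **config)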


@prof.log_call(trace_logger)
def main(*argv):
    """
        Simple main function (like in C). Takes all arguments (as from
        sys.argv) and returns an exit status.

        Args:
            argv(list):     arguments (includes command line call).

        Returns:
            int:            exit code (0 if success)
    """

    # Only necessary if running main (normally if calling command line).
    # No point in importing otherwise.
    import argparse
    import threading

    memory_profiler_thread = threading.Thread(
        target=prof.memory_profiler, args=(logger,)
    )
    memory_profiler_thread.daemon = True
    memory_profiler_thread.start()

    argv = list(argv)

    # Creates command line parser.
    parser = argparse.ArgumentParser(
        description="Parses input from the command line for a batch job."
    )

    # Takes a config file and then a series of one or more HDF5 files.
    parser.add_argument(
        "config_filename",
        metavar="CONFIG_FILE",
        type=str,
        help="JSON file that provides configuration options " +
             "for how to use dictionary learning on the input files."
    )
    parser.add_argument(
        "input_file",
        metavar="INPUT_FILE",
        type=str,
        nargs=1,
        help="HDF5 file with an array of images. " +
             "A dataset or video will be expected at the internal path. " +
             "Time must be the first dimension."
    )
    parser.add_argument(
        "output_file",
        metavar="OUTPUT_FILE",
        type=str,
        nargs=1,
        help="HDF5 file(s) to write output. " +
             "If a specific group is desired, " +
             "that should be included in the filename."
    )

    # Results of parsing arguments
    # (ignore the first one as it is the command line call).
    parsed_args = parser.parse_args(argv[1:])

    # Remove args from singleton lists.
    parsed_args.input_file = parsed_args.input_file[0]
    parsed_args.output_file = parsed_args.output_file[0]

    # Runs the dictionary learning algorithm on each file with
    # the given parameters and saves the results in the given files.
    generate_neurons_io_handler(
        parsed_args.input_file,
        parsed_args.output_file,
        parsed_args.config_filename
    )

    return 0
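
# The module is normally driven through `main` above. A hypothetical command
# line (file names for illustration), mirroring the interpreter invocation
# that `generate_neurons_blocks` constructs for its worker processes:
#
#     python -c "from nanshe.learner import main; from sys import argv, exit; exit(main(*argv))" \
#         learner.json data.h5/images output.h5/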