Source code for nanshe.io.hdf5.search

"""
The module ``search`` provides glob-like path matching to search for content in an HDF5 file.

===============================================================================
Overview
===============================================================================
The module implements a strategy similar to Python's |glob|_ module for HDF5
files. In short, it uses regex patterns to match as many possible paths as it
can.

.. |glob| replace:: ``glob``
.. _glob: http://docs.python.org/2/library/glob.html

===============================================================================
API
===============================================================================
"""


__author__ = "John Kirkham <kirkhamj@janelia.hhmi.org>"
__date__ = "$Jun 18, 2014 20:06:44 EDT$"


import re
import collections
import itertools

import h5py

from nanshe.util import iters


# Needed in order to have logging information no matter what.
from nanshe.util import prof


# Get the logger
trace_logger = prof.getTraceLogger(__name__)


@prof.log_call(trace_logger)
def get_matching_paths(a_filehandle, a_path_pattern):
    """
        Finds the existing paths under an HDF5 group that match a path pattern.

        The pattern is split on ``/`` into components. The first component is
        used as a regular expression that is matched against each member name
        of the group (with the name delimited by ``/`` on both sides). Every
        matching member is then searched recursively with the remainder of
        the pattern.

        Note:
            If the object reached is not an ``h5py.Group`` (e.g. a dataset),
            the search stops there and the path built so far is reported as a
            match, even when pattern components are left over.

        Args:
            a_filehandle(h5py.File):        an HDF5 file (or a group in it).
            a_path_pattern(str):            an internal path (with patterns for
                                            each group) for the HDF5 file.

        Returns:
            (list):                         a list of matching paths, each
                                            starting with ``/``. If the pattern
                                            is empty or ``a_filehandle`` is not
                                            a group, this is ``[""]``.
    """

    # Base case: nothing left to match or nowhere left to descend. The single
    # empty suffix lets the caller emit the path it has built so far.
    if not (isinstance(a_filehandle, h5py.Group) and a_path_pattern):
        return [""]

    # Peel off the first component; the rest is handled by the recursion.
    current_path, _, next_path = a_path_pattern.strip("/").partition("/")

    # Wrap the component in a non-capturing group so that a top-level
    # alternation (e.g. ``a|b``) stays confined between the two slashes
    # instead of splitting the whole expression into ``/a`` or ``b/``.
    current_pattern_group_regex = re.compile("/(?:" + current_path + ")/")

    current_pattern_group_matches = []
    for each_group in a_filehandle:
        if current_pattern_group_regex.match("/" + each_group + "/") is None:
            continue

        # Prefix every match found below this member with the member's name.
        for each_suffix in get_matching_paths(
                a_filehandle[each_group], next_path
        ):
            current_pattern_group_matches.append(
                "/" + each_group + each_suffix
            )

    return current_pattern_group_matches
@prof.log_call(trace_logger)
def get_matching_paths_groups(a_filehandle, a_path_pattern):
    """
        Collects, for each level of the path pattern, the member names that
        match that level's pattern component.

        The pattern is split on ``/`` into components. At each level, every
        member whose name matches the component is recorded and then searched
        with the remainder of the pattern. Names found at the same depth are
        merged into one list regardless of which parent they were found under.

        Note:
            This works best when a tree structure is created systematically in
            HDF5. Then, this will recreate what the tree structure could and
            may contain. Picking one name from each level does not necessarily
            give a path that exists in the file.

        Args:
            a_filehandle(h5py.File):        an HDF5 file (or a group in it).
            a_path_pattern(str):            an internal path (with patterns for
                                            each group) for the HDF5 file.

        Returns:
            (list):                         a list of lists; entry ``i`` holds
                                            the unique names matched at depth
                                            ``i`` in the order first seen. It
                                            is empty if the pattern is empty
                                            or ``a_filehandle`` is not a group.
    """

    def get_matching_paths_groups_recursive(a_filehandle, a_path_pattern):
        # Returns one OrderedDict per level, used as an insertion-ordered set
        # of names (all values are None).
        if not (isinstance(a_filehandle, h5py.Group) and a_path_pattern):
            return []

        # "\b" in this non-raw string is the backspace character; it is
        # stripped from both ends before the surrounding slashes are removed.
        a_path_pattern = a_path_pattern.strip("\b").strip("/")

        # Peel off the first component; the rest is handled by the recursion.
        current_path, _, next_path = a_path_pattern.partition("/")

        # Wrap the component in a non-capturing group so that a top-level
        # alternation (e.g. ``a|b``) stays confined between the two slashes
        # instead of splitting the whole expression into ``/a`` or ``b/``.
        current_pattern_group_regex = re.compile(
            "/(?:" + current_path + ")/"
        )

        # Level 0 always exists once we know we are searching a group.
        current_pattern_group_matches = [collections.OrderedDict()]

        for each_group in a_filehandle:
            if current_pattern_group_regex.match(
                    "/" + each_group + "/"
            ) is None:
                continue

            next_pattern_group_matches = get_matching_paths_groups_recursive(
                a_filehandle[each_group], next_path
            )

            current_pattern_group_matches[0][each_group] = None

            # Make room for every deeper level reported by this member.
            while (len(current_pattern_group_matches) - 1) < len(
                    next_pattern_group_matches
            ):
                current_pattern_group_matches.append(
                    collections.OrderedDict()
                )

            # Merge the member's deeper levels in, shifted down by one.
            for i, each_next_pattern_group_matches in enumerate(
                    next_pattern_group_matches, start=1
            ):
                current_pattern_group_matches[i].update(
                    each_next_pattern_group_matches
                )

        return current_pattern_group_matches

    groups = get_matching_paths_groups_recursive(a_filehandle, a_path_pattern)

    # Convert each ordered set of names into a plain list.
    return [list(each_level) for each_level in groups]
@prof.log_call(trace_logger)
def get_matching_grouped_paths(a_filehandle, a_path_pattern):
    """
        Builds every path that can be formed from the per-level matches of the
        path pattern.

        One name is taken from each level returned by
        ``get_matching_paths_groups`` and the names are joined with ``/``.
        The resulting paths are not checked for existence in the file here.

        Args:
            a_filehandle(h5py.File):        an HDF5 file.
            a_path_pattern(str):            an internal path (with patterns for
                                            each group) for the HDF5 file.

        Returns:
            (list):                         a list of possible paths that fit
                                            the pattern, each starting with
                                            ``/``, without duplicates and in
                                            the order they were generated.
    """

    path_groups = get_matching_paths_groups(a_filehandle, a_path_pattern)

    # An OrderedDict is used as an insertion-ordered set of paths.
    paths_found = collections.OrderedDict()
    for each_path_components in itertools.product(*path_groups):
        paths_found["/" + "/".join(each_path_components)] = None

    return list(paths_found)
@prof.log_call(trace_logger)
def get_matching_grouped_paths_found(a_filehandle, a_path_pattern):
    """
        Builds every path that can be formed from the per-level matches of the
        path pattern and records whether each one is present in the file.

        One name is taken from each level returned by
        ``get_matching_paths_groups`` and the names are joined with ``/``.
        Each resulting path is then tested with ``in`` against
        ``a_filehandle``.

        Args:
            a_filehandle(h5py.File):        an HDF5 file.
            a_path_pattern(str):            an internal path (with patterns for
                                            each group) for the HDF5 file.

        Returns:
            (collections.OrderedDict):      an ordered dictionary with possible
                                            paths that fit the pattern as keys
                                            and whether they are found as
                                            values.
    """

    name_groups = get_matching_paths_groups(a_filehandle, a_path_pattern)

    # Lazily assemble each candidate path from one name per level.
    candidate_paths = (
        "/" + "/".join(each_parts)
        for each_parts in itertools.product(*name_groups)
    )

    return collections.OrderedDict(
        (each_path, (each_path in a_filehandle))
        for each_path in candidate_paths
    )