# Source code for bmtk.utils.sonata.file

# Copyright 2017. Allen Institute. All rights reserved
#
# Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
# following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following
# disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote
# products derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
# INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
from . import utils
from .file_root import NodesRoot, EdgesRoot


class File(object):
    """Read-only view over one or more HDF5 data files and their csv "types" files.

    Every HDF5 file is opened with ``utils.load_h5`` and every types file with ``utils.load_csv``. The
    inputs are then sorted into node and edge collections: an HDF5 file counts as a nodes file when it
    contains ``/nodes`` and as an edges file when it contains ``/edges`` (a single file may be both); a
    types table counts as node-types when it has a ``node_type_id`` column and as edge-types when it has
    an ``edge_type_id`` column. The sorted collections are wrapped in ``NodesRoot`` / ``EdgesRoot``.

    :param data_files: a single HDF5 input or a list of them (normalized with ``utils.listify``).
    :param data_type_files: a single types-file input or a list of them (normalized with
        ``utils.listify``).
    :param mode: file mode; only ``'r'`` is accepted.
    :param gid_table: forwarded unchanged to ``NodesRoot``.
    :param require_magic: when true, ``utils.check_magic`` is called on every opened HDF5 file.
    :raises Exception: if ``mode`` is not ``'r'``, if a types file has both a ``node_type_id`` and an
        ``edge_type_id`` column, or if none of the HDF5 files contains ``/nodes`` or ``/edges``.
    """

    def __init__(self, data_files, data_type_files, mode='r', gid_table=None, require_magic=True):
        if mode != 'r':
            raise Exception('Currently only read mode is supported.')

        self._data_files = utils.listify(data_files)
        self._data_type_files = utils.listify(data_type_files)

        # Open and check HDF5 file(s)
        self._h5_file_handles = [utils.load_h5(f, mode) for f in self._data_files]
        if require_magic:
            # Check magic attribute in h5 files. This must be an explicit loop: on Python 3 map() returns
            # a lazy iterator, so a bare ``map(utils.check_magic, ...)`` statement never runs the check.
            for h5 in self._h5_file_handles:
                utils.check_magic(h5)

        # Check version number
        avail_versions = {utils.get_version(h5) for h5 in self._h5_file_handles}
        if len(avail_versions) == 1:
            self._version = next(iter(avail_versions))
        elif len(avail_versions) > 1:
            # TODO: log as warning
            print('Warning: Passing in multiple hdf5 files of different version')
            # Sets are unordered; sort so the reported version string is the same on every run.
            self._version = ','.join(sorted(avail_versions))
        else:
            # No HDF5 files were given at all.
            self._version = utils.VERSION_NA

        # Keep the original input next to each loaded table so messages can name the offending file.
        self._csv_file_handles = [(f, utils.load_csv(f)) for f in self._data_type_files]

        self._has_nodes = False
        self._nodes = None  # /nodes object
        self._nodes_groups = []  # list of all hdf5 /nodes group
        self._node_types_dataframes = []  # list of all csv node-types dataframe

        self._has_edges = False
        self._edges = None  # /edges object
        self._edges_groups = []  # list of all hdf5 /edges group
        self._edge_types_dataframes = []  # list of csv edge-types dataframes

        # for multiple inputs sort into edge files and node files
        self._sort_types_file()
        self._sort_h5_files()

        if not (self._has_nodes or self._has_edges):
            raise Exception('Could not find neither nodes nor edges for the given file(s).')

        if self._has_nodes:
            self._nodes = NodesRoot(nodes=self._nodes_groups, node_types=self._node_types_dataframes,
                                    gid_table=gid_table)

        if self._has_edges:
            self._edges = EdgesRoot(edges=self._edges_groups, edge_types=self._edge_types_dataframes)

    @property
    def nodes(self):
        """The ``NodesRoot`` built from the inputs, or ``None`` when no file contained ``/nodes``."""
        return self._nodes

    @property
    def has_nodes(self):
        """``True`` if at least one HDF5 file contained ``/nodes``."""
        return self._has_nodes

    @property
    def edges(self):
        """The ``EdgesRoot`` built from the inputs, or ``None`` when no file contained ``/edges``."""
        return self._edges

    @property
    def has_edges(self):
        """``True`` if at least one HDF5 file contained ``/edges``."""
        return self._has_edges

    @property
    def version(self):
        """Version reported by the HDF5 files.

        A single value when all files agree, a comma-separated list when they differ, and
        ``utils.VERSION_NA`` when no HDF5 file was given.
        """
        return self._version

    def _sort_types_file(self):
        """Sort the loaded types tables into node-types and edge-types lists.

        :raises Exception: if a table has both a ``node_type_id`` and an ``edge_type_id`` column.
        """
        # TODO: node/edge type_id column names should not be hardcoded
        for filename, df in self._csv_file_handles:
            has_node_type_id = 'node_type_id' in df.columns
            has_edge_type_id = 'edge_type_id' in df.columns
            if has_node_type_id and has_edge_type_id:
                # TODO: users may be creating their own dataframe and thus not have a filename
                raise Exception('types file {} has both node_types_id and edge_types_id column.'.format(filename))
            elif has_node_type_id:
                self._node_types_dataframes.append(df)
            elif has_edge_type_id:
                self._edge_types_dataframes.append(df)
            else:
                # TODO: if strict this should fail immediately
                print('Warning: Could not determine if file {} was an edge-types or node-types file. Ignoring'.format(filename))

    def _sort_h5_files(self):
        """Sort the opened HDF5 files into nodes and edges lists and set the ``has_*`` flags.

        A file containing both ``/nodes`` and ``/edges`` is added to both lists; a file containing
        neither is reported and skipped.
        """
        for h5 in self._h5_file_handles:
            has_nodes = '/nodes' in h5
            has_edges = '/edges' in h5
            if not (has_nodes or has_edges):
                print('File {} contains neither nodes nor edges. Ignoring'.format(h5.filename))
                continue

            if has_nodes:
                self._nodes_groups.append(h5)
                self._has_nodes = True
            if has_edges:
                self._edges_groups.append(h5)
                self._has_edges = True