Source code for bmtk.utils.io.spike_trains

import os
import sys
import csv

import h5py
import pandas as pd
import numpy as np
from bmtk.utils.sonata.utils import add_hdf5_magic, add_hdf5_version


class SpikeTrainWriter(object):
    class TmpFileMetadata(object):
        def __init__(self, file_name, sort_order=None):
            self.file_name = file_name
            self.sort_order = sort_order

    def __init__(self, tmp_dir, mpi_rank=0, mpi_size=1):
        # For NEST/NEURON based simulations it is preferable not to use mpi4py, so let the parent simulator
        # determine MPI rank and size
        self._mpi_rank = mpi_rank
        self._mpi_size = mpi_size

        # Spikes are temporarily saved to file, since for large simulations keeping all spikes in memory can
        # crash the system. Requires the user to create the directory.
        self._tmp_dir = tmp_dir
        if self._tmp_dir is None or not os.path.exists(self._tmp_dir):
            raise Exception('Directory path {} does not exist'.format(self._tmp_dir))

        self._all_tmp_files = [self.TmpFileMetadata(self._get_tmp_filename(r)) for r in range(mpi_size)]
        # TODO: Determine best buffer size.
        self._tmp_file_handle = open(self._all_tmp_files[mpi_rank].file_name, 'w')

        self._tmp_spikes_handles = []  # used when sorting multiple files
        self._spike_count = -1

        # NEST gid files use tab separators and a different column order, hence the explicit settings for the
        # tmp spike files.
        self.delimiter = ' '  # delimiter for temporary file
        self.time_col = 0
        self.gid_col = 1

    def _get_tmp_filename(self, rank):
        return os.path.join(self._tmp_dir, '_bmtk_tmp_spikes_{}.csv'.format(rank))

    def _count_spikes(self, recount=False):
        if self._mpi_rank == 0:
            if self._spike_count > -1 and not recount:
                return self._spike_count

            self._spike_count = 0
            for tmp_file in self._all_tmp_files:
                with open(tmp_file.file_name, 'r') as csvfile:
                    csv_reader = csv.reader(csvfile, delimiter=self.delimiter)
                    self._spike_count += sum(1 for _ in csv_reader)

    def _sort_tmp_file(self, filedata, sort_order):
        # For now load spikes into pandas, it's the fastest way but may be an issue with memory
        if sort_order is None or filedata.sort_order == sort_order:
            return

        file_name = filedata.file_name
        tmp_spikes_ds = pd.read_csv(file_name, sep=' ', names=['time', 'gid'])
        tmp_spikes_ds = tmp_spikes_ds.sort_values(by=[sort_order, 'time'])
        tmp_spikes_ds.to_csv(file_name, sep=' ', index=False, header=False)
        filedata.sort_order = sort_order

    def _next_spike(self, rank):
        try:
            val = next(self._tmp_spikes_handles[rank])
            return val[0], val[1], rank
        except StopIteration:
            return None

    def add_spike(self, time, gid):
        self._tmp_file_handle.write('{:.6f} {}\n'.format(time, gid))

    def add_spikes(self, times, gid):
        for t in times:
            self.add_spike(t, gid)

    def add_spikes_file(self, file_name, sort_order=None):
        self._all_tmp_files.append(self.TmpFileMetadata(file_name, sort_order))

    def _sort_files(self, sort_order, sort_column, file_write_fnc):
        self._tmp_spikes_handles = []
        for fdata in self._all_tmp_files:
            self._sort_tmp_file(fdata, sort_order)
            self._tmp_spikes_handles.append(csv.reader(open(fdata.file_name, 'r'), delimiter=self.delimiter))

        spikes = []
        for rank in range(len(self._tmp_spikes_handles)):  # range(self._mpi_size):
            spike = self._next_spike(rank)
            if spike is not None:
                spikes.append(spike)

        # Iterate through all the ranks and find the first spike. Write that spike/gid to the output, then
        # replace that data point with the next spike on the selected rank
        indx = 0
        while spikes:
            # find which rank has the first spike
            selected_index = 0
            selected_val = spikes[0][sort_column]
            for i, spike in enumerate(spikes[1:]):
                if spike[sort_column] < selected_val:
                    selected_index = i + 1
                    selected_val = spike[sort_column]

            # write the spike to the file
            row = spikes.pop(selected_index)
            file_write_fnc(float(row[self.time_col]), int(row[self.gid_col]), indx)
            indx += 1

            # get the next spike on that rank and replace in spikes table
            another_spike = self._next_spike(row[2])
            if another_spike is not None:
                spikes.append(another_spike)

    def _merge_files(self, file_write_fnc):
        indx = 0
        for fdata in self._all_tmp_files:
            if not os.path.exists(fdata.file_name):
                continue

            with open(fdata.file_name, 'r') as csv_file:
                csv_reader = csv.reader(csv_file, delimiter=self.delimiter)
                for row in csv_reader:
                    file_write_fnc(float(row[self.time_col]), int(row[self.gid_col]), indx)
                    indx += 1

    def _to_file(self, file_name, sort_order, file_write_fnc):
        if sort_order is None:
            sort_column = 0
        elif sort_order == 'time':
            sort_column = self.time_col
        elif sort_order == 'gid':
            sort_column = self.gid_col
        else:
            raise Exception('Unknown sort order {}'.format(sort_order))

        # TODO: Need to make sure an MPI_Barrier is called beforehand
        self._tmp_file_handle.close()
        if self._mpi_rank == 0:
            if sort_order is not None:
                self._sort_files(sort_order, sort_column, file_write_fnc)
            else:
                self._merge_files(file_write_fnc)

    def to_csv(self, csv_file, sort_order=None, gid_map=None):
        # TODO: Need to call flush and then barrier
        if self._mpi_rank == 0:
            # Even in the single-rank case don't just copy the tmp csv to the new name; doing so will fail if
            # the user calls to_hdf5 or to_nwb after calling to_csv.
            self._count_spikes()
            csv_handle = open(csv_file, 'w')
            csv_writer = csv.writer(csv_handle, delimiter=' ')

            def file_write_fnc_identity(time, gid, indx):
                csv_writer.writerow([time, gid])

            def file_write_fnc_transform(time, gid, indx):
                # For the case when NEURON/NEST ids don't match the user's gid table
                csv_writer.writerow([time, gid_map[gid]])

            file_write_fnc = file_write_fnc_identity if gid_map is None else file_write_fnc_transform
            self._to_file(csv_file, sort_order, file_write_fnc)
            csv_handle.close()

    # TODO: Let the user pass in a barrier and use it here
    def to_nwb(self, nwb_file):
        raise NotImplementedError

    def to_hdf5(self, hdf5_file, sort_order=None, gid_map=None):
        if self._mpi_rank == 0:
            with h5py.File(hdf5_file, 'w') as h5:
                add_hdf5_magic(h5)
                add_hdf5_version(h5)

                self._count_spikes(recount=True)
                spikes_grp = h5.create_group('/spikes')
                spikes_grp.attrs['sorting'] = 'none' if sort_order is None else sort_order
                time_ds = spikes_grp.create_dataset('timestamps', shape=(self._spike_count,), maxshape=(None,),
                                                    dtype=np.float64)
                gid_ds = spikes_grp.create_dataset('gids', shape=(self._spike_count,), maxshape=(None,),
                                                   dtype=np.uint64)

                def resize_data():
                    # There have been (unreproducible) MPI conditions where _count_spikes() is not correct, even
                    # with proper barriers and file flushing. Add a quick fix for the csv-to-hdf5 conversion.
                    self._count_spikes(recount=True)
                    time_ds.resize((self._spike_count, ))
                    gid_ds.resize((self._spike_count, ))

                def file_write_fnc_identity(time, gid, indx):
                    if indx >= self._spike_count:
                        resize_data()
                    time_ds[indx] = time
                    gid_ds[indx] = gid

                def file_write_fnc_transform(time, gid, indx):
                    if indx >= self._spike_count:
                        resize_data()
                    time_ds[indx] = time
                    gid_ds[indx] = gid_map[gid]

                file_write_fnc = file_write_fnc_identity if gid_map is None else file_write_fnc_transform
                self._to_file(hdf5_file, sort_order, file_write_fnc)

    # TODO: Need to make sure a barrier is used here (before close is called)
    def flush(self):
        self._tmp_file_handle.flush()

    def close_tmp_file(self):
        self._tmp_file_handle.close()

    def close(self):
        if self._mpi_rank == 0:
            for tmp_file in self._all_tmp_files:
                if os.path.exists(tmp_file.file_name):
                    os.remove(tmp_file.file_name)
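
# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal single-rank example of driving SpikeTrainWriter. The tmp directory and file names are
# placeholders; the constructor requires the tmp directory to already exist.
def _example_spike_train_writer():
    writer = SpikeTrainWriter(tmp_dir='./tmp_spikes')
    writer.add_spike(time=10.5, gid=0)                  # single spike for gid 0
    writer.add_spikes(times=[12.1, 35.7, 90.0], gid=1)  # several spikes for gid 1
    writer.flush()
    writer.to_hdf5('spikes.h5', sort_order='gid')       # merge/sort the tmp csv into /spikes/{timestamps,gids}
    writer.close()                                      # remove the temporary csv files (rank 0 only)
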
class PoissonSpikesGenerator(object):
    def __init__(self, gids, firing_rate, tstart=0.0, tstop=1000.0):
        self._gids = gids
        self._firing_rate = firing_rate / 1000.0
        self._tstart = tstart
        self._tstop = tstop

    def to_hdf5(self, file_name, sort_order='gid'):
        if sort_order == 'gid':
            gid_list = []
            times_list = []
            for gid in self._gids:
                c_time = self._tstart
                while c_time < self._tstop:
                    interval = -np.log(1.0 - np.random.uniform()) / self._firing_rate
                    c_time += interval
                    gid_list.append(gid)
                    times_list.append(c_time)

            with h5py.File(file_name, 'w') as h5:
                h5.create_dataset('/spikes/gids', data=gid_list, dtype=np.uint)
                h5.create_dataset('/spikes/timestamps', data=times_list, dtype=float)
                h5['/spikes'].attrs['sorting'] = 'by_gid'
        else:
            raise NotImplementedError
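
# --- Usage sketch (illustrative, not part of the original module) ---
# Generates homogeneous Poisson spike trains for a few cells and writes them to hdf5. Given the
# firing_rate / 1000.0 scaling and the default tstop=1000.0, the rate is presumably in Hz with
# timestamps in milliseconds; treat that unit reading as an assumption.
def _example_poisson_generator():
    psg = PoissonSpikesGenerator(gids=list(range(10)), firing_rate=15.0, tstart=0.0, tstop=2000.0)
    psg.to_hdf5('poisson_spikes.h5', sort_order='gid')  # writes /spikes/gids and /spikes/timestamps
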
class SpikesInput(object):
    def get_spikes(self, gid):
        raise NotImplementedError()

    @staticmethod
    def load(name, module, input_type, params):
        module_lc = module.lower()
        if module_lc == 'nwb':
            return SpikesInputNWBv1(name, module, input_type, params)
        elif module_lc == 'h5' or module_lc == 'hdf5':
            return SpikesInputH5(name, module, input_type, params)
        elif module_lc == 'csv':
            return SpikesInputCSV(name, module, input_type, params)
        else:
            raise Exception('Unable to load spikes for module type {}'.format(module))
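
# --- Usage sketch (illustrative, not part of the original module) ---
# The load() factory dispatches on the module string; the file paths and trial name below are
# placeholders only.
def _example_spikes_input():
    # hdf5-based spikes only need an 'input_file' entry in params
    h5_spikes = SpikesInput.load(name='spikes', module='h5', input_type='spikes',
                                 params={'input_file': 'inputs/spikes.h5'})
    # nwb (v1) spikes additionally need a 'trial' key pointing at /processing/<trial>/spike_train
    nwb_spikes = SpikesInput.load(name='spikes', module='nwb', input_type='spikes',
                                  params={'input_file': 'inputs/spikes.nwb', 'trial': 'trial_0'})
    return h5_spikes.get_spikes(gid=0), nwb_spikes.get_spikes(gid=0)
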
class SpikesInputNWBv1(SpikesInput):
    def __init__(self, name, module, input_type, params):
        self.input_file = params['input_file']
        self._h5_handle = h5py.File(self.input_file, 'r')
        self.trial = params['trial']
        self._trial_grp = self._h5_handle['processing'][self.trial]['spike_train']

    def get_spikes(self, gid):
        return self._trial_grp[str(gid)]['data']

class SONATAIndexer(object):
    def get_spikes(self, gid):
        raise NotImplementedError()

class DictIndexedGIDs(SONATAIndexer):
    def __init__(self, spikes_input_h5):
        self._gid_indicies = {}
        self._parent = spikes_input_h5

        indx_beg = 0
        c_gid = self._parent.gids[0]
        for indx, gid in enumerate(self._parent.gids):
            # go through the gids dataset, determine slices for each gid
            if gid != c_gid:
                self._gid_indicies[c_gid] = slice(indx_beg, indx)
                c_gid = gid
                indx_beg = indx
        self._gid_indicies[c_gid] = slice(indx_beg, indx+1)  # saves the last entry

    def get_spikes(self, gid):
        if gid in self._gid_indicies:
            return self._parent.timestamps[self._gid_indicies[gid]]
        else:
            return []
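
# --- Illustration (not part of the original module) ---
# DictIndexedGIDs assumes the parent's gids dataset is sorted by gid and builds one slice per gid.
# The stand-in parent below exists only to show the resulting index; it is not a real SpikesInputH5.
def _example_dict_indexed_gids():
    class _FakeParent(object):
        gids = [0, 0, 0, 1, 1, 3]                    # sorted by gid
        timestamps = [1.0, 5.0, 9.0, 2.5, 7.5, 4.0]  # parallel to gids

    index = DictIndexedGIDs(_FakeParent())
    # index._gid_indicies == {0: slice(0, 3), 1: slice(3, 5), 3: slice(5, 6)}
    return index.get_spikes(1)                       # -> [2.5, 7.5]; get_spikes(2) -> []
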
class DFIndexedGIDs(SONATAIndexer):
    def __init__(self, spikes_input_h5):
        self._parent = spikes_input_h5

        index_df = pd.DataFrame(data={'gids': self._parent.gids})
        index_df.drop_duplicates(inplace=True)
        index_df = index_df.stack().reset_index()
        index_df.columns = ['indx_beg', 'tmp', 'gid']
        index_df.set_index('gid', inplace=True)
        index_df = index_df.drop('tmp', axis=1)
        index_df['indx_end'] = index_df['indx_beg'].shift(-1).fillna(len(self._parent.gids)).astype(np.int64)
        self._gid_indicies = index_df  # index_df.to_dict(orient='index')

    def get_spikes(self, gid):
        if gid in self._gid_indicies.index:
            indx_beg = self._gid_indicies.loc[gid]['indx_beg']
            indx_end = self._gid_indicies.loc[gid]['indx_end']
            return self._parent.timestamps[indx_beg:indx_end]
        else:
            return []

class HDF5IndexedGIDs(SONATAIndexer):
    def __init__(self, spikes_input_h5):
        raise NotImplementedError()

class UnindexedGIDs(SONATAIndexer):
    def __init__(self, spikes_input_h5):
        raise NotImplementedError()

class SpikesInputH5(SpikesInput):
    def __init__(self, name, module, input_type, params):
        self._input_file = params['input_file']
        self._h5_handle = h5py.File(self._input_file, 'r')
        self._gid_ds = self._h5_handle['/spikes/gids']
        self._timestamps_ds = self._h5_handle['/spikes/timestamps']

        self._sort_order = self._h5_handle['/spikes'].attrs.get('sorting', None)
        if sys.version_info[0] >= 3 and isinstance(self._sort_order, bytes):
            # h5py attributes return str in py 2, bytes in py 3
            self._sort_order = self._sort_order.decode()

        # Create an index for fetching spike-trains based on gid
        if 'index' in self._h5_handle['/spikes'] and isinstance(self._h5_handle['/spikes/index'], h5py.Dataset):
            # In the case the index is built into the hdf5 file
            self._gid_index = HDF5IndexedGIDs(self)
        elif self._sort_order in ['gid', 'gids', 'by_gid']:
            # In the case when the sort_order == 'gid'
            # self._gid_index = DictIndexedGIDs(self)
            self._gid_index = DFIndexedGIDs(self)
        else:
            # In the case where the spike-trains are sorted by time or unsorted
            self._gid_index = UnindexedGIDs(self)

    @property
    def gids(self):
        return self._gid_ds

    @property
    def timestamps(self):
        return self._timestamps_ds

    def get_spikes(self, gid):
        return self._gid_index.get_spikes(gid)
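
# --- Usage sketch (illustrative, not part of the original module) ---
# SpikesInputH5 expects the layout written by SpikeTrainWriter.to_hdf5 / PoissonSpikesGenerator.to_hdf5:
# parallel /spikes/gids and /spikes/timestamps datasets plus a 'sorting' attribute. The file name and
# data below are placeholders for a hand-built test file.
def _example_build_and_read_h5(file_name='example_spikes.h5'):
    with h5py.File(file_name, 'w') as h5:
        h5.create_dataset('/spikes/gids', data=[0, 0, 1, 1, 1], dtype=np.uint64)
        h5.create_dataset('/spikes/timestamps', data=[1.0, 4.0, 0.5, 2.0, 9.0], dtype=np.float64)
        h5['/spikes'].attrs['sorting'] = 'gid'

    spikes = SpikesInputH5(name='spikes', module='h5', input_type='spikes',
                           params={'input_file': file_name})
    return spikes.get_spikes(gid=1)  # -> the three timestamps stored for gid 1
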
class SpikesInputCSV(SpikesInput):
    def __init__(self, name, module, input_type, params):
        self._spikes_df = pd.read_csv(params['input_file'], index_col='gid', sep=' ')

    def get_spikes(self, gid):
        spike_times_str = self._spikes_df.iloc[gid]['spike-times']
        return np.array(spike_times_str.split(','), dtype=float)
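
# --- Illustration (not part of the original module) ---
# SpikesInputCSV expects a space-separated file with a 'gid' column and a 'spike-times' column whose
# values are comma-separated timestamps, e.g.:
#
#     gid spike-times
#     0 1.5,20.0,38.2
#     1 5.0,11.1
#
# Note that get_spikes() looks rows up by position (iloc), so it implicitly assumes row order matches
# the gid values.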