MERFISH whole brain spatial transcriptomics (part 2a)#

In part 1, we explored two examples looking at the expression of canonical neurotransmitter transporter genes and gene Tac2 in the one coronal section. In this notebook, we will prepare data so that we can repeat the examples for all cells spanning the whole brain. This notebook takes ~10 seconds to run.

The results from this notebook has already been cached and saved. As such, if needed you can skip this notebook and continue with part 2b.

You need to be connected to the internet to run this notebook and that you have downloaded the log2 expression matrices associated with MERFISH-C57BL6J-638850-combined dataset.

import os
import pandas as pd
import numpy as np
import anndata
import time
import json
import requests

The prerequisite for running this notebook is that the data have been downloaded to local directory maintaining the organization from the manifest.json. Change the download_base variable to where you have downloaded the data in your system.

version = '20231215'
download_base = '../../abc_download_root'

use_local_cache = False
manifest_path = 'releases/%s/manifest.json' % version

if not use_local_cache :
    url = 'https://allen-brain-cell-atlas.s3-us-west-2.amazonaws.com/' + manifest_path
    manifest = json.loads(requests.get(url).text)
else :
    file = os.path.join(download_base,manifest_path)
    with open(file,'rb') as f:
        manifest = json.load(f)
    
metadata = manifest['file_listing']['MERFISH-C57BL6J-638850']['metadata']
view_directory = os.path.join( download_base, 
                               manifest['directory_listing']['MERFISH-C57BL6J-638850']['directories']['metadata']['relative_path'], 
                              'views')
cache_views = False
if cache_views :
    os.makedirs( view_directory, exist_ok=True )
rpath = metadata['cell_metadata']['files']['csv']['relative_path']
file = os.path.join( download_base, rpath)
cell = pd.read_csv(file, dtype={'cell_label':str})
cell.set_index('cell_label',inplace=True)
print(len(cell))
3938808
expression_matrices = manifest['file_listing']['MERFISH-C57BL6J-638850']['expression_matrices']
rpath = expression_matrices['C57BL6J-638850']['log2']['files']['h5ad']['relative_path']
file = os.path.join( download_base, rpath)
print(file)
../../abc_download_root/expression_matrices/MERFISH-C57BL6J-638850/20230830/C57BL6J-638850-log2.h5ad
adata = anndata.read_h5ad(file,backed='r')
gene = adata.var
ntgenes = ['Slc17a7','Slc17a6','Slc17a8','Slc32a1','Slc6a5','Slc18a3','Slc6a3','Slc6a4','Slc6a2']
exgenes = ['Tac2']
gnames = ntgenes + exgenes
pred = [x in gnames for x in gene.gene_symbol]
gene_filtered = gene[pred]
gene_filtered
gene_symbol transcript_identifier
gene_identifier
ENSMUSG00000030500 Slc17a6 ENSMUST00000032710
ENSMUSG00000037771 Slc32a1 ENSMUST00000045738
ENSMUSG00000025400 Tac2 ENSMUST00000026466
ENSMUSG00000039728 Slc6a5 ENSMUST00000056442
ENSMUSG00000070570 Slc17a7 ENSMUST00000085374
ENSMUSG00000019935 Slc17a8 ENSMUST00000020102
ENSMUSG00000021609 Slc6a3 ENSMUST00000022100
ENSMUSG00000020838 Slc6a4 ENSMUST00000021195
start = time.process_time()
gdata = adata[:,gene_filtered.index].to_df()
print("time taken: ", time.process_time() - start)
time taken:  6.662516814
# change columns from index to gene symbol
gdata.columns = gene_filtered.gene_symbol
pred = pd.notna(gdata[gdata.columns[0]])
gdata = gdata[pred].copy(deep=True)
print(len(gdata))
4334174
if cache_views :
    file = os.path.join( view_directory, 'example_genes_all_cells_expression.csv')
    gdata.to_csv( file )

Close h5ad file and clean up

adata.file.close()
del adata