Human-Mammalian Brain - Basal Ganglia 10X snRANSeq analysis: gene expression#

In this notebook we’ll explore some gene expressions and combine them with the cell metadata we showed in the previous clustering analysis tutorial.

You need to be connected to the internet to run this notebook or connected to a cache that has the HMBA-BG data downloaded already.

The notebook presented here shows quick visualizations from precomputed metadata in the atlas. For examples on accessing the expression matrices, specifically selecting genes from expression matrices, see the general Accessing expression data tutorial.

import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt

from abc_atlas_access.abc_atlas_cache.abc_project_cache import AbcProjectCache
from abc_atlas_access.abc_atlas_cache.anndata_utils import get_gene_data

We will interact with the data using the AbcProjectCache. This cache object tracks which data has been downloaded and serves the path to the requested data on disk. For metadata, the cache can also directly serve up a Pandas DataFrame. See the getting_started notebook for more details on using the cache including installing it if it has not already been.

Change the download_base variable to where you would like to download the data to your system.

download_base = Path('../../data/abc_atlas')
abc_cache = AbcProjectCache.from_cache_dir(download_base)

abc_cache.current_manifest
'releases/20250531/manifest.json'

Create the expanded cell metadata as was done previously in the cluster annotation tutorial tutorial.

# Load the cell metadata.
cell = abc_cache.get_metadata_dataframe(
    directory='HMBA-10xMultiome-BG-Aligned',
    file_name='cell_metadata',
    dtype={'cell_label': str}
).set_index('cell_label')
donor = abc_cache.get_metadata_dataframe(
    directory='HMBA-10xMultiome-BG-Aligned',
    file_name='donor'
).set_index('donor_label')
library = abc_cache.get_metadata_dataframe(
    directory='HMBA-10xMultiome-BG-Aligned',
    file_name='library'
).set_index('library_label')
value_sets = abc_cache.get_metadata_dataframe(
    directory='HMBA-10xMultiome-BG-Aligned',
    file_name='value_sets'
).set_index('label')

cell_extended = cell.join(donor, on='donor_label')
cell_extended = cell_extended.join(
    library, on='library_label',
    rsuffix='_library_table'
)

def extract_value_set(cell_metadata_df: pd.DataFrame, input_value_set: pd.DataFrame, input_value_set_label: str):
    """Add color and order columns to the cell metadata dataframe based on the input
    value set.

    Columns are added as {input_value_set_label}_color and {input_value_set_label}_order.

    Parameters
    ----------
    cell_metadata_df : pd.DataFrame
        DataFrame containing cell metadata.
    input_value_set : pd.DataFrame
        DataFrame containing the value set information.
    input_value_set_label : str
        The the column name to extract color and order information for. will be added to the cell metadata.
    """
    cell_metadata_df[f'{input_value_set_label}_color'] = input_value_set[
        input_value_set['field'] == input_value_set_label
    ].loc[cell_metadata_df[input_value_set_label]]['color_hex_triplet'].values
    cell_metadata_df[f'{input_value_set_label}_order'] = input_value_set[
        input_value_set['field'] == input_value_set_label
    ].loc[cell_metadata_df[input_value_set_label]]['order'].values

extract_value_set(cell_extended, value_sets, 'region_of_interest_label')
extract_value_set(cell_extended, value_sets, 'species_common_name')
extract_value_set(cell_extended, value_sets, 'species_scientific_name')
extract_value_set(cell_extended, value_sets, 'donor_sex')

# Load the cluster memembership metadata and combine the data with the cell data.
cell_2d_embedding_coordinates = abc_cache.get_metadata_dataframe(
    directory='HMBA-BG-taxonomy-CCN20250428',
    file_name='cell_2d_embedding_coordinates'
).set_index('cell_label')
cell_extended = cell_extended.join(cell_2d_embedding_coordinates)
cell_extended = cell_extended.sample(frac=1)

cell_to_cluster_membership = abc_cache.get_metadata_dataframe(
    directory='HMBA-BG-taxonomy-CCN20250428',
    file_name='cell_to_cluster_membership',
).set_index('cell_label')

cluster = abc_cache.get_metadata_dataframe(
    directory='HMBA-BG-taxonomy-CCN20250428',
    file_name='cluster',
    dtype={'number_of_cells': 'Int64'}
).rename(columns={'label': 'cluster_annotation_term_label'}).set_index('cluster_annotation_term_label')
cluster_annotation_term_set = abc_cache.get_metadata_dataframe(
    directory='HMBA-BG-taxonomy-CCN20250428',
    file_name='cluster_annotation_term_set'
).rename(columns={'label': 'cluster_annotation_term_label'})

cluster_annotation_term = abc_cache.get_metadata_dataframe(
    directory='HMBA-BG-taxonomy-CCN20250428',
    file_name='cluster_annotation_term',
).rename(columns={'label': 'cluster_annotation_term_label'}).set_index('cluster_annotation_term_label')
cluster_annotation_term_with_cells = cluster_annotation_term.join(cluster, how='left')

cell_to_cluster_membership = cell_to_cluster_membership[cell_to_cluster_membership['cluster_label'].isin(cluster_annotation_term.index)]

cluster_to_cluster_annotation_membership = abc_cache.get_metadata_dataframe(
    directory='HMBA-BG-taxonomy-CCN20250428',
    file_name='cluster_to_cluster_annotation_membership',
).set_index('cluster_annotation_term_label')
membership_with_cluster_info = cluster_to_cluster_annotation_membership.join(cluster_annotation_term_with_cells, rsuffix='_anno_term').reset_index()
membership_groupby = membership_with_cluster_info.groupby(
    ['cluster_alias', 'cluster_annotation_term_set_name']
)

cluster_annotation_term_set = abc_cache.get_metadata_dataframe(
    directory='HMBA-BG-taxonomy-CCN20250428',
    file_name='cluster_annotation_term_set'
).rename(columns={'label': 'cluster_annotation_term_label'})

# term_sets = abc_cache.get_metadata_dataframe(directory='WHB-taxonomy', file_name='cluster_annotation_term_set').set_index('label')
cluster_details = membership_groupby['cluster_annotation_term_name'].first().unstack()
cluster_order = membership_groupby['term_order'].first().unstack()
cluster_order.sort_values(['Neighborhood', 'Class', 'Subclass', 'Group', 'Cluster'], inplace=True)
cluster_order.rename(
    columns={'Neighborhood': 'Neighborhood_order',
             'Class': 'Class_order',
             'Subclass': 'Subclass_order',
             'Group': 'Group_order',
             'Cluster': 'Cluster_order'},
    inplace=True
)

cluster_colors = membership_groupby['color_hex_triplet'].first().unstack()
cluster_colors = cluster_colors[cluster_annotation_term_set['name']]
cluster_colors.sort_values(
    ['Neighborhood', 'Class', 'Subclass', 'Group', 'Cluster'],
    inplace=True
)

cell_extended = cell_extended.join(cell_to_cluster_membership, how='inner')
cell_extended = cell_extended[~pd.isna(cell_extended['cluster_alias'])]
cell_extended = cell_extended.join(cluster_details, on='cluster_alias')
cell_extended = cell_extended.join(cluster_colors, on='cluster_alias', rsuffix='_color')
cell_extended = cell_extended.join(cluster_order, on='cluster_alias')

cell_extended.head(5)
/Users/chris.morrison/src/miniconda3/envs/abc_atlas_access/lib/python3.12/site-packages/abc_atlas_access/abc_atlas_cache/abc_project_cache.py:429: DtypeWarning: Columns (5) have mixed types. Specify dtype option on import or set low_memory=False.
  return pd.read_csv(path, **kwargs)
cell_barcode donor_label barcoded_cell_sample_label library_label alignment_job_id doublet_score umi_count feature_matrix_label dataset_label abc_sample_id ... Neighborhood_color Class_color Subclass_color Group_color Cluster_color Class_order Cluster_order Group_order Neighborhood_order Subclass_order
cell_label
CACATACAGGGATGCG-936_C03 CACATACAGGGATGCG Q21.26.002 936_C03 L8XR_211118_02_C04 44c17a26ca9c6f204075ddc6f10e1029fdcea4c5 0.020000 12196.0 HMBA-10xMultiome-BG-Aligned HMBA-10xMultiome-BG-Aligned 981e6fb1-a376-4786-b87a-0d274cf603be ... #19613b #d0b83c #253c8c #aec7e8 #2fd6d6 10 1189 53 3 32
GTCTAATCAGCTTAGC-2054_A08 GTCTAATCAGCTTAGC QM23.50.003 2054_A08 L8XR_240208_01_B04 36c190e9e62974ee4454085dc4d3b27b83c7739d 0.011236 4245.0 HMBA-10xMultiome-BG-Aligned HMBA-10xMultiome-BG-Aligned d591e3d3-56f0-4286-8434-fef4250f1cd2 ... #f2ca7d #8605d4 #def476 #488edc #62620e 3 146 9 1 7
AGTGCGGAGGCTACTG-2540_A02 AGTGCGGAGGCTACTG H24.30.007 2540_A02 L8XR_241031_21_B02 98123d9e5e7a10180d2694f55c4f538179489b6e 0.035714 5658.0 HMBA-10xMultiome-BG-Aligned HMBA-10xMultiome-BG-Aligned 8ea6fa29-dbe9-4171-ab22-640b419fe430 ... #f2ca7d #8605d4 #def476 #488edc #a9de1a 3 142 9 1 7
AATCTCAAGCAAACCT-948_C03 AATCTCAAGCAAACCT QM21.26.003 948_C03 L8XR_211124_02_G10 86af4e5bdd31f92b8a1e2d7cc3d65c513b954d91 0.050000 26865.0 HMBA-10xMultiome-BG-Aligned HMBA-10xMultiome-BG-Aligned d3f278c5-803d-4224-8782-17f41fd70f03 ... #91f4bb #0433b4 #77f0ca #854098 #2e50ba 6 449 23 2 16
GAACTTATCCCTGACT-847_A04 GAACTTATCCCTGACT H18.30.001 847_A04 L8XR_211007_02_H02 237fca61a138a6c86f149b0271a293f75f8e4a0b 0.020000 26418.0 HMBA-10xMultiome-BG-Aligned HMBA-10xMultiome-BG-Aligned 85e3457b-8f6e-4868-907e-745c2a274ff4 ... #19613b #d0b83c #253c8c #aec7e8 #39c6ab 10 1185 53 3 32

5 rows × 53 columns

Single cell transcriptomes#

The ~2 million, 10X single cell dataset of HMBA-BG is available in two separate packages. The first contains a single aligned h5ad file containing ~16k genes that have been aligned across all species. We use this single aligned dataset in this notebook. The other package is split across each species with a separate h5ad file for each species. These individually containing roughly ~30k genes for each species.

Below we show some interactions with data from the 10X expression matrices in the HMBA-BG dataset. For a deeper dive into how to access specific gene data from the expression matrices, take a look at general accessing expression data tutorial.

First, we list the available metadata in the HMBA-BG 10X dataset again.

abc_cache.list_metadata_files('HMBA-10xMultiome-BG-Aligned')
['cell_metadata',
 'donor',
 'example_gene_expression',
 'gene',
 'library',
 'value_sets']

We first load the gene data for the Aligned dataset covering all species in the BG dataset.

aligned_gene = abc_cache.get_metadata_dataframe(
    directory='HMBA-10xMultiome-BG-Aligned',
    file_name='gene'
).set_index('gene_identifier')
print("Number of aligned genes = ", len(aligned_gene))
aligned_gene.head(5)
Number of aligned genes =  16630
gene_symbol description molecular_type
gene_identifier
NCBIGene:9380 GRHPR glyoxylate and hydroxypyruvate reductase protein-coding
NCBIGene:6603 SMARCD2 SWI/SNF related, matrix associated, actin depe... protein-coding
NCBIGene:148103 ZNF599 zinc finger protein 599 protein-coding
NCBIGene:92691 TMEM169 transmembrane protein 169 protein-coding
NCBIGene:3235 HOXD9 homeobox D9 protein-coding

Below we list the genes we load and the example method used to load the expression for these specific genes from the h5ad file. Note that we provide a the set of example gene expressions as a csv file for brevity in this tutorial. To process and extract the gene expressions for yourself, uncomment the code block below. More details on how to extract specific genes from the data see our accessing gene expression data tutorial

gene_names = ['SLC17A6', 'SLC32A1', 'PTPRC', 'PLP1', 'AQP4', 'DRD1', 'DRD2']

"""
aligned_gene_data = get_gene_data(
    abc_atlas_cache=abc_cache,
    all_cells=cell_extended,
    all_genes=aligned_gene,
    selected_genes=gene_names
)
"""
'\naligned_gene_data = get_gene_data(\n    abc_atlas_cache=abc_cache,\n    all_cells=cell_extended,\n    all_genes=aligned_gene,\n    selected_genes=gene_names\n)\n'

Instead of processing the gene expressions, we load a pre-processed file.

aligned_gene_data = abc_cache.get_metadata_dataframe(
    'HMBA-10xMultiome-BG-Aligned',
    'example_gene_expression'
).set_index('cell_label')

Next, we’ll concatenate the gene data together and merge them into our cell metadata.

cell_extended_with_genes = cell_extended.join(aligned_gene_data)

Example use cases#

Note these genes are examples. A final selection for the released notebook can be substituted later on.

In this section, we show a use case with the example genes SLC17A6, SLC32A1, PTPRC, PLP1, AQP4, DRD1, and DRD2. These genes were selected because they are marker genes for glutamatergic (SLC17A6) and GABAergic (SLC32A1) neurons, immune cells (PTPRC), oligodendrocytes (PLP1), astrocytes (AQP4), and D1/D2 medium spiny neurons (DRD1, DRD2). “Marker genes” have much higher expression in the specified cell type or anatomic structure when compared to all other cells, and in many cases are functionally relevant for those cell types.

We define a helper functions aggregate_by_metadata to compute the average expression for a given category and later plot_umap to plot cells in a UMAP colorized by metadata or expression values similar to what was used in part 1.

The helper function below creates a heatmap showing the relation between various parameters in the combined cell metadata and the genes we loaded.

def plot_species_heatmap(
    df,
    gnames,
    value,
    species_list=None,
    sort=False,
    fig_width=8,
    fig_height=4,
    vmax=None,
    cmap=plt.cm.magma_r
):
    if species_list is None:
        species_list = df['species_common_name'].unique()

    fig, ax = plt.subplots(1, len(species_list))
    fig.set_size_inches(fig_width, fig_height)

    all_unique_values = df[value].unique()
    try:
        order = df[f'{value}_order'].unique()
        all_unique_values = all_unique_values[np.argsort(order)]
    except KeyError:
        order = None

    for idx, species in enumerate(species_list):
        
        filtered = df[df['species_common_name'] == species]
        
        grouped = filtered.groupby(value)[gnames].mean()
        if sort:
            grouped = grouped.sort_values(by=gnames[0], ascending=False)

        missing_values = []
        indices = []
        for unique_value in all_unique_values:
            if unique_value not in grouped.index:        
                indices.append(unique_value)
                missing_values.append({key: np.nan for key in gnames})

        if missing_values:
            grouped = pd.concat([grouped, pd.DataFrame(data=missing_values, index=indices)])
        grouped = grouped.loc[all_unique_values]

        arr = grouped.to_numpy().astype('float')

        ax[idx].imshow(arr, cmap=cmap, aspect='auto', vmin=0, vmax=vmax)
        xlabs = grouped.columns.values
        ylabs = grouped.index.values

        if idx == 0:
            ax[idx].set_yticks(range(len(ylabs)))
            ax[idx].set_yticklabels(ylabs)
        else:
            ax[idx].set_yticks([])
            ax[idx].set_yticklabels([])
        ax[idx].set_xticks(range(len(xlabs)))
        ax[idx].set_xticklabels(xlabs)
        ax[idx].set_title(species)

    plt.subplots_adjust(wspace=0.00, hspace=0.00)
    
    return fig, ax

Expression of selected genes in the Basal Ganglia#

Below we use our heatmap function to plot the gene expression as a function of a feature in our data. For the first two plots, we show the expression for all species against the Neighborhood and Group levels of the taxonomy. We observe that the expression across the species are similar as we would expect if these cell types are conserved across species. Note that a band of white denotes a Group (or other feature) that is not present in for the given species.

fig, ax = plot_species_heatmap(
    df=cell_extended_with_genes,
    gnames=aligned_gene_data.columns,
    value='Neighborhood',
    species_list=['Human', 'Macaque', 'Marmoset'],
    fig_width=15,
    fig_height=5
)
fig.suptitle('Neighborhood')
plt.show()
../_images/a988e446901cec255fb09a5d8a377a427662f8c469c7a8482c2d13e344de839e.png
fig, ax = plot_species_heatmap(
    df=cell_extended_with_genes,
    gnames=aligned_gene_data.columns,
    value='Group',
    species_list=['Human', 'Macaque', 'Marmoset'],
    fig_width=15,
    fig_height=10
)
fig.suptitle('Group', size=20)
plt.show()
../_images/ab110b0479e82b446e2afa2a49d6735712601b6e96a563e609b6ce889751d3b4.png

Now we perform the same heatmap comparison against the region of interest in the brain the cell was extracted from. Note that currently the Marmoset has no fine grained identification of brain region.

fig, ax = plot_species_heatmap(
    df=cell_extended_with_genes,
    gnames=aligned_gene_data.columns,
    species_list=['Human', 'Macaque', 'Marmoset'],
    value='region_of_interest_label',
    fig_width=15,
    fig_height=10
)
fig.suptitle('Region of Interest')
plt.show()
../_images/aad32c19afa11219b3a0e283d7adb4b833fe919d43776adc7ac80a5064a00feb.png

Expression in the UMAP#

We can also visualize the relationship between these genes and their location in the UMAP. We’ll plot the two species side by side with the lightgrey cells in each plot representing those from the other species that do not overlap with the plotted one. Overall the expressions between species agree across the UMAP.

def plot_umap(df, feature, species_list, cmap=None, fig_width=21, fig_height=10) :
    
    fig, ax = plt.subplots(1, len(species_list))
    fig.set_size_inches(fig_width, fig_height)

    all_xx = df['x']
    all_yy = df['y']
    
    for idx, species in enumerate(species_list):
        
        filtered = df[df['species_common_name'] == species]
        xx = filtered['x']
        yy = filtered['y']
        vv = filtered[feature]
        ax[idx].scatter(all_xx, all_yy, s=1.0, color='#D3D3D3', marker='.')
        
        if cmap is not None :
            ax[idx].scatter(xx, yy, s=1.0, c=vv, marker='.', cmap=cmap)
        else :
            ax[idx].scatter(xx, yy, s=1.0, color=vv, marker=".")
            
        ax[idx].axis('equal')
        ax[idx].set_xticks([])
        ax[idx].set_yticks([])
        
        ax[idx].set_title(f"{species}")
        
    plt.subplots_adjust(wspace=0.01, hspace=0.01)
    return fig, ax
for gene_name in gene_names:
    fig, ax = plot_umap(
        cell_extended_with_genes,
        feature=gene_name,
        species_list=['Human', 'Macaque', 'Marmoset'],
        cmap=plt.cm.magma_r
    )
    fig.suptitle(f'Gene Expression: {gene_name}')
    plt.show()
../_images/82a944014a6c8c7f9af054b6767fa72e586d4b332c2326507fcd2a577445c3ee.png ../_images/e5023e77ef314e2472a6609bd67fbeb82ec1d4a1fdfa0ad69e8068217a6f6145.png ../_images/d6218b83e42f394e3053b18ad9b8680fc78c2a5f6ac0e832093ef952f770b5e6.png ../_images/7ff322edfdb41e29e5befe074944d9564b89354c33b910a616cb0266614ab23f.png ../_images/6b4920b3dd40762855b34b383b6e6edbc3b8b238269b584808d24e02e36183e9.png ../_images/fa88a34b99cb60c6975aef3c49a16348215df6a517de56ea8aa0cf54af2332ae.png ../_images/66507e8e53c48fdddf4d27b8f2e19480bbea6d27284c467374a13eb3cbacdc71.png