Tutorial 3. Use SEDR to do batch integration.

Here we use 3 sections from DLPFC data to show the ability of SEDR to integrate batches for the same tissue with the same techniques.

[1]:
import os
import torch
import numpy as np
import pandas as pd
import scanpy as sc

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from pathlib import Path
from tqdm import tqdm
[2]:
import SEDR
[3]:
data_root = Path('../data/DLPFC/')

proj_list = [
    '151673', '151674', '151675'
]

Combining datasets

Input of SEDR includes an AnnData object that contains the spatial transcriptomics data and a graph dictionary that contains the neighborhood graph. When combining two datasets, the AnnData objects are directly concatenated. For neighborhood graphs, we follow the following algorithm.
Let \(A^k\) and \(Z_f^k\) denote the adjacency matrix and deep gene representation of spatial omics k, we then create a block-diagonal adjacency matrix \(A^k\) and concatenate the deep gene representation in the spot dimension, as:
image0
where K is the number of spatial omics.
[4]:
for proj_name in tqdm(proj_list):
    adata_tmp = sc.read_visium(data_root / proj_name)
    adata_tmp.var_names_make_unique()

    adata_tmp.obs['batch_name'] = proj_name


    ##### Load layer_guess label, if have
    df_label = pd.read_csv(data_root / proj_name / 'metadata.tsv', sep='\t')
    adata_tmp.obs['layer_guess'] = df_label['layer_guess']
    adata_tmp= adata_tmp[~pd.isnull(adata_tmp.obs['layer_guess'])]

    graph_dict_tmp = SEDR.graph_construction(adata_tmp, 12)

    if proj_name == proj_list[0]:
        adata = adata_tmp
        graph_dict = graph_dict_tmp
        name = proj_name
        adata.obs['proj_name'] = proj_name
    else:
        var_names = adata.var_names.intersection(adata_tmp.var_names)
        adata = adata[:, var_names]
        adata_tmp = adata_tmp[:, var_names]
        adata_tmp.obs['proj_name'] = proj_name

        adata = adata.concatenate(adata_tmp)
        graph_dict = SEDR.combine_graph_dict(graph_dict, graph_dict_tmp)
        name = name + '_' + proj_name

100%|██████████| 3/3 [00:06<00:00,  2.10s/it]

Preprocessing

[5]:
adata.layers['count'] = adata.X.toarray()
sc.pp.filter_genes(adata, min_cells=50)
sc.pp.filter_genes(adata, min_counts=10)
sc.pp.normalize_total(adata, target_sum=1e6)
sc.pp.highly_variable_genes(adata, flavor="seurat_v3", layer='count', n_top_genes=2000)
adata = adata[:, adata.var['highly_variable'] == True]
sc.pp.scale(adata)

from sklearn.decomposition import PCA  # sklearn PCA is used because PCA in scanpy is not stable.
adata_X = PCA(n_components=200, random_state=42).fit_transform(adata.X)
adata.obsm['X_pca'] = adata_X

Training SEDR

[6]:
sedr_net = SEDR.Sedr(adata.obsm['X_pca'], graph_dict, mode='clustering', device='cuda:3')
using_dec = False
if using_dec:
    sedr_net.train_with_dec()
else:
    sedr_net.train_without_dec()
sedr_feat, _, _, _ = sedr_net.process()
adata.obsm['SEDR'] = sedr_feat
100%|██████████| 200/200 [00:09<00:00, 21.43it/s]

use harmony to calculate revised PCs

[7]:
import harmonypy as hm

meta_data = adata.obs[['batch']]

data_mat = adata.obsm['SEDR']
vars_use = ['batch']
ho = hm.run_harmony(data_mat, meta_data, vars_use)

res = pd.DataFrame(ho.Z_corr).T
res_df = pd.DataFrame(data=res.values, columns=['X{}'.format(i+1) for i in range(res.shape[1])], index=adata.obs.index)
adata.obsm[f'SEDR.Harmony'] = res_df
2024-11-06 14:34:39,484 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...
2024-11-06 14:34:41,350 - harmonypy - INFO - sklearn.KMeans initialization complete.
2024-11-06 14:34:41,386 - harmonypy - INFO - Iteration 1 of 10
2024-11-06 14:34:43,146 - harmonypy - INFO - Iteration 2 of 10
2024-11-06 14:34:44,927 - harmonypy - INFO - Iteration 3 of 10
2024-11-06 14:34:46,724 - harmonypy - INFO - Iteration 4 of 10
2024-11-06 14:34:48,567 - harmonypy - INFO - Iteration 5 of 10
2024-11-06 14:34:50,444 - harmonypy - INFO - Iteration 6 of 10
2024-11-06 14:34:51,926 - harmonypy - INFO - Converged after 6 iterations

Visualizing

UMAP

[8]:
sc.pp.neighbors(adata, use_rep='SEDR.Harmony', metric='cosine')
sc.tl.umap(adata)
[9]:
sc.pl.umap(adata, color=['layer_guess', 'batch_name'], show=False)
[9]:
[<Axes: title={'center': 'layer_guess'}, xlabel='UMAP1', ylabel='UMAP2'>,
 <Axes: title={'center': 'batch_name'}, xlabel='UMAP1', ylabel='UMAP2'>]
_images/Tutorial3_Batch_integration_15_1.png

LISI score

[10]:
ILISI = hm.compute_lisi(adata.obsm['SEDR.Harmony'], adata.obs[['batch']], label_colnames=['batch'])[:, 0]
CLISI = hm.compute_lisi(adata.obsm['SEDR.Harmony'], adata.obs[['layer_guess']], label_colnames=['layer_guess'])[:, 0]
[11]:
df_ILISI = pd.DataFrame({
    'method': 'SEDR',
    'value': ILISI,
    'type': ['ILISI']*len(ILISI)
})


df_CLISI = pd.DataFrame({
    'method': 'SEDR',
    'value': CLISI,
    'type': ['CLISI']*len(CLISI)
})

fig, axes = plt.subplots(1,2,figsize=(4, 5))
sns.boxplot(data=df_ILISI, x='method', y='value', ax=axes[0])
sns.boxplot(data=df_CLISI, x='method', y='value', ax=axes[1])
axes[0].set_ylim(1, 3)
axes[1].set_ylim(1, 7)
axes[0].set_title('iLISI')
axes[1].set_title('cLISI')

plt.tight_layout()
_images/Tutorial3_Batch_integration_18_0.png
[ ]:

[ ]:

[ ]:

[ ]: