Source code for besca.tl.rc._reclustering

from scanpy.preprocessing import filter_genes_dispersion, log1p, regress_out, neighbors
from scanpy.preprocessing import highly_variable_genes as sc_highly_variable_genes
from scanpy.preprocessing import scale as sc_scale
from scanpy.tools import umap, louvain, leiden
from scanpy.tools import pca as sc_pca
from scanpy.plotting import filter_genes_dispersion as plot_filter
from besca._helper import subset_adata as _subset_adata
import sys
import logging

[docs]def recluster( adata, celltype, celltype_label="leiden", min_mean=0.0125, max_mean=4, min_disp=0.5, resolution=1.0, regress_out_key=None, random_seed=0, show_plot_filter=False, method="leiden", batch_key=None, n_shared=2, ): """Perform subclustering on specific celltype to identify subclusters. Extract all cells that belong to the pre-labeled celltype into a new data subset. This datasubset is initialized with the raw data contained in adata.raw. New highly variable genes are selected and a new clustering is performed. The function returns the adata subset with the new clustering annotation. This can be performed on leiden clusters by setting celltype_label = 'leiden' and passing the clusters that are to be selected for reclustering as strings or tuple of strings to the parameter celltype. Parameters ---------- adata: the complete AnnData object of the Dataset. celltype: `str` or (`str`) string identifying the cluster which is to be filtered out, if more than one is to be selected please pass them as a tuple not as a list! celltype_label: `str` | default = 'leiden' string identifying which column in adata.obs will be matching with the celltype argument. min_mean: `float` | default = 0.0125 the minimum gene expression a gene must have to be considered highly variable max_mean: `float` | default = 4 the maximum gene expression a gene can have to be considered highly variable min_disp: `float` | default = 0.5 the minimum dispersion a gene must have to be considered highly variable regress_out_key: `list of str` | default = None A list of string identifiers of the adata.obs columns that should be regressed out before performing clustering. If None then no regress_out is calculated. random_seed: `int` | default = 0 the random seed that is used to produce reproducible PCA, clustering and UMAP results show_plot_filter: `bool` | default = False boolian value indicating if a plot showing the filtering results for highly variable gene detection should be displayed or not method: `str` | default = 'leiden' clustering method to use for the reclustering of the datasubset. Possible:louvain/leiden batch_key: `str` | default = None Specify a batch key if the HVG calculation should be done per batch n_share: `int` | default = 3 Divide the nr. of batched by this nr. to get the shared HVGs considered (e.g. >=1/3 of samples) Returns ------- AnnData object containing the subcluster annotated with PCA, nearest neighbors, louvain cluster, and UMAP coordinates. Examples -------- For a more detailed example of the entire reclustering process please refer to the code examples. >>> import besca as bc >>> import scanpy as sc >>> adata = bc.datasets.simulated_pbmc3k_processed() >>> adata_subset = bc.tl.rc.recluster(adata, celltype=('0', '1', '3', '6'), resolution = 1.3) >>> sc.pl.umap(adata_subset, color = ['leiden', 'Gene_4', 'Gene_5', 'Gene_6', 'Gene_10', 'Gene_12', 'Gene_20']) """ if not method in ["leiden", "louvain"]: raise ValueError("method argument should be leiden or louvain") if type(celltype) == str: cluster_subset = _subset_adata(adata, adata.obs.get(celltype_label) == celltype) elif type(celltype) == tuple: filter = adata.obs.get(celltype_label) == "NONE" for i in range(len(celltype)): filter = filter | (adata.obs.get(celltype_label) == celltype[i]) cluster_subset = _subset_adata(adata, filter) else: sys.exit("specify cluster input as a string or tuple") cluster_subset.raw = cluster_subset # identify highly variable genes sc_highly_variable_genes( cluster_subset, min_mean=min_mean, max_mean=max_mean, min_disp=min_disp, inplace=True, batch_key=batch_key, ) if batch_key != None: hvglist = cluster_subset.var["highly_variable"].copy() hvglist.loc[ cluster_subset.var["highly_variable_nbatches"] >= len(set(cluster_subset.obs[batch_key])) / n_shared, ] = True cluster_subset.var["highly_variable"] = hvglist.copy() if show_plot_filter: pl_highly_variable_genes(cluster_subset, show=True) logging.info( "In total", str(sum(cluster_subset.var.highly_variable)), "highly variable genes selected within cluster", ) # apply filter cluster_subset = _subset_adata( cluster_subset, cluster_subset.var.highly_variable, axis=1, raw=False ) # perform further processing # log1p(cluster_subset) # data already logged if regress_out_key is not None: regress_out(cluster_subset, keys=regress_out_key) sc_scale(cluster_subset, max_value=10) sc_pca( cluster_subset, random_state=random_seed, svd_solver="arpack" ) # using `svd_solver='arpack' ensures that the PCA leads to reproducible results neighbors(cluster_subset, n_neighbors=10, random_state=random_seed) umap(cluster_subset, random_state=random_seed) if method == "louvain": louvain(cluster_subset, resolution=resolution, random_state=random_seed) if method == "leiden": leiden(cluster_subset, resolution=resolution, random_state=random_seed) return cluster_subset
[docs]def annotate_new_cellnames( adata, cluster_subset, names, new_label="celltype", method="leiden" ): """annotate new cellnames to each of the subclusters identified by running recluster. Give each subcluster a new celltype identifier. Can only be run on an AnnData subset that has already been reclustered, e.g. using the recluster function also available in this package. The list of provided names must be of equal length to the number of subclusters in the AnnData subset. The order must be the same as the order of the clusters identifiers (integer expected). Parameters ---------- adata: complete AnnData object from which the subset was extracted. Must contain adata.obs.celltype. cluster_subset: AnnData subset on which a reclustering has been run. Must contain adata.obs.celltype. names: `[str]` list of strings containing the new celltype annotation. Order and Length must mach the lexicographic order of the louvain clusters in the AnnData susbet. celltype_label: new_label: `str` | default = 'celltype' string specifying under which label in adata.obs the new annotation should be saved (it will overwrite existing annotations under this name) method: `str` | default = 'leiden' string indicating the method used for clustering. This string will be used to retrieve the cluster numbers in adata.obs and in the cluster_subset. (assuming the same obs column name) returns ------- Complete AnnData object with the new celltype annotations in the adata.obs.new_label column for the subclusters. Examples -------- Please refer to the code examples section of the documentation for a complete example of the reclustering process. """ clusters = [ int(x) for x in cluster_subset.obs.get(method).value_counts().index.tolist() ] clusters.sort() clusters = [str(x) for x in clusters] if len(clusters) != len(names): logging.info("need to supply as many cluster names as there are clusters") logging.info("these are the clusters that need to be annotated:") logging.info(str(clusters)) sys.exit( "Specified " + str(len(names)) + " new labels for a total of " + str(len(clusters)) + " clusters. Numbers should match! No changes were made.\n" ) else: if adata.obs.get(new_label) is None: # if the column 'new_label' did not previously exist then create it adata.obs[new_label] = "not_labeled" else: logging.info( "NOTE: overwriting labels for the selected cells saved in adata.obs." + new_label + " with the new labels" ) new_annotation = cluster_subset.obs.get(method).to_frame(name=new_label).copy() for i in range(0, len(clusters)): new_annotation[new_label].replace(clusters[i], names[i], inplace=True) # update anndata object with the new annotation adata.obs.update(new_annotation) return None