Source code for besca.pp._filtering

import logging
import sys
import anndata
from pandas import read_csv
from numpy import sum
from besca.datasets._mito import get_mito_genes

# import helper functions from besca
from besca._helper import get_raw

[docs]def filter( adata, max_genes=None, min_genes=None, max_counts=None, min_counts=None, min_cells=None, max_mito=None, annotation_type=None, species="human", ): """Filter cell outliers based on counts, numbers of genes expressed, number of cells expressing a gene and mitochondrial gene content. Filtering is performed iteratively in the order: max_genes, min_genes, max_counts, min_counts, min_cells, max_mito. The Thresholds are defined as follows: max_genes >= n_genes min_genes <= n_genes max_counts >= n_counts min_counts <= n_counts min_cells <= n_cells max_mito > percent mito parameters ---------- adata: :class:`~anndata.AnnData` The annotated data matrix. max_genes: `int` | default = None integer value specifying the threshold for the maximum number of genes that a cell needs to express min_genes: `int` | default = None integer value specifying the threshold for the minimum number of genes that a cell needs to express max_counts: `int` | default = None integer value specifying the maximum number of counts that a cell needs to contain min_counts: `int` | default = None integer value specifying the minimum number of counts that a cell needs to contain min_cells: `int` | default = None integer value specifying the minimum number of cells that need to express a gene for it to be included max_mito: `float` | default = None decimalvalue specifying the threshold for the maximum percentage of mitochondrial genes in a cell annotation_type: `SYMBOL` or `ENSEMBLE` or `None` | default = None string identifying the type of gene ids contained in adata.var_names, necessary for identifying mitochondrial genes in case percent_mito is not already included in adata.obs returns ------- AnnData filters the :class:`~anndata.AnnData` object and potentially adds either `n_genes` or `n_counts` to `adata.obs`. Example ------- >>> import besca as bc >>> adata = bc.datasets.simulated_pbmc3k_raw() >>> adata = bc.pp.filter(adata, max_counts=6500, max_genes=1900, max_mito=0.05, min_genes=600, min_counts=600, min_cells=2, annotation_type='SYMBOL') """ if isinstance(adata, anndata.AnnData): # get original number of cells ncells = adata.shape[0] ngenes = adata.shape[1] logging.info( "started with %d total cells and %d total genes", ncells, ngenes ) # calculate values if necessary if max_counts is not None or min_counts is not None: if adata.obs.get("n_counts") is None: adata.obs["n_counts"] = adata.X.sum(axis=1) if min_genes is not None or max_genes is not None: if adata.obs.get("n_genes") is None: adata.obs["n_genes"] = sum(adata.X > 0, axis=1) if min_cells is not None: if adata.var.get("n_cells") is None: adata.var["n_cells"] = sum(adata.X > 0, axis=0).T if max_mito is not None: if adata.obs.get("percent_mito") is None: mito_list = get_mito_genes(species, annotation_type) mito_genes = [ name for name in adata.var_names if name in mito_list ] # ensembl # for each cell compute fraction of counts in mito genes vs. all genes n_counts = sum(adata.X, axis=1).A1 n_counts[n_counts == 0] = float("inf") adata.obs["percent_mito"] = ( sum(adata[:, mito_genes].X, axis=1).A1 / n_counts ) # perform actual filtering for all given parameters if max_genes is not None: curr_cells = adata.shape[0] adata = adata[adata.obs.get("n_genes") <= max_genes, :].copy() new_cells = adata.shape[0] logging.info( "removed %d cells that expressed more than %d genes", curr_cells - new_cells, max_genes ) if min_genes is not None: curr_cells = adata.shape[0] adata = adata[adata.obs.get("n_genes") >= min_genes, :].copy() new_cells = adata.shape[0] logging.info( "removed %d cells that did not express at least %d genes", curr_cells - new_cells, min_genes ) if max_counts is not None: curr_cells = adata.shape[0] adata = adata[adata.obs.get("n_counts") <= max_counts, :].copy() new_cells = adata.shape[0] logging.info( "removed %d cells that had more than %d counts", curr_cells - new_cells, max_counts ) if min_counts is not None: curr_cells = adata.shape[0] adata = adata[adata.obs.get("n_counts") >= min_counts, :].copy() new_cells = adata.shape[0] logging.info( "removed %d cells that did not have at least %d counts", curr_cells - new_cells, min_counts ) if min_cells is not None: curr_genes = adata.shape[1] adata = adata[:, adata.var.get("n_cells") >= min_cells].copy() new_genes = adata.shape[1] logging.info( "removed %d genes that were not expressed" "in at least %d cells", curr_genes - new_genes, min_cells, ) if max_mito is not None: curr_cells = adata.shape[0] adata = adata[adata.obs.get("percent_mito") < max_mito, :].copy() new_cells = adata.shape[0] logging.info( "removed %d cells that expressed" "%d%% mitochondrial genes or more", curr_cells - new_cells, max_mito * 100 ) ncells_final = adata.shape[0] ngenes_final = adata.shape[1] logging.info( "finished with %d total cells and total %d genes", ncells_final, ngenes_final ) return adata else: logging.error("please pass an AnnData object as data") sys.exit(1)
[docs]def filter_gene_list(adata, filepath, use_raw=True, use_genes="SYMBOL"): """Function to remove all genes specified in a gene list read from file. This function removes all genes in the dataset from a given list of genes (for example mito genes or ribosomal genes). Note that the input file consists of two columns (ENSEMBL gene id and gene symbol). Parameters ---------- adata: `AnnData` AnnData object filepath: `str` File path as string point to gene list use_raw: `bool` | default = True Boolian indicator if the function should return the filtered raw object or not. use_genes: `SYMBOL` or `ENSEMBL` | default = SYMBOL String defining whether ENSEMBL id's or gene symbols are used in the adata.var_names (defines which column of input gene list is read) Returns ------- AnnData Returns the filtered AnnData object. Example ------- >>> import besca as bc >>> import os >>> adata = bc.datasets.simulated_pbmc3k_raw() >>> # TO COMPLETE reference_mito_file = 'path2file.tsv' >>> # adata = bc.pp.filter_gene_list(adata, file_path=reference_mito_file, use_genes='SYMBOL') >>> # adata.n_vars """ # generate copy of adata object if use_raw: if adata.raw is None: logging.warning( "adata does not contain .raw filtering on regular adata object" ) adata = adata.copy() else: adata = get_raw(adata) else: adata = adata.copy() if use_genes == "SYMBOL": gene_list = list( read_csv(filepath, header=None, sep="\t")[1] ) # ENS_GENE_ID GENE_SYMBOL (2 cols) elif use_genes == "ENSEMBL": gene_list = list( read_csv(filepath, header=None, sep="\t")[0] ) # ENS_GENE_ID GENE_SYMBOL (2 cols) else: sys.exit("Please supply either SYMBOL or ENSEMBL ids") genes = [i for i in adata.var_names if i not in gene_list] # remove all the genes from the list if len(genes) > 0: gene_indexes = [adata.var_names.tolist().index(x) for x in genes] adata = adata[:, gene_indexes] else: logging.warning( "None of the genes from input list found in data set. Please ensure you have correctly specified use_genes to match the type of genes saved in adata.var_names." ) return adata