Source code for besca.pp._filtering

import logging
import sys
import anndata
from pandas import read_csv
from numpy import sum
from besca.datasets._mito import get_mito_genes

# import helper functions from besca
from besca._helper import get_raw

[docs]def filter(
    adata,
    max_genes=None,
    min_genes=None,
    max_counts=None,
    min_counts=None,
    min_cells=None,
    max_mito=None,
    annotation_type=None,
    species="human",
):
    """Filter cell outliers based on counts, numbers of genes expressed, number of cells expressing a gene and mitochondrial gene content.

    Filtering is performed iteratively in the order: max_genes, min_genes, max_counts, min_counts,
    min_cells, max_mito.

    The Thresholds are defined as follows:
    max_genes >= n_genes
    min_genes <= n_genes
    max_counts >= n_counts
    min_counts <= n_counts
    min_cells <= n_cells
    max_mito > percent mito

    parameters
    ----------
    adata: :class:`~anndata.AnnData`
        The annotated data matrix.
    max_genes: `int` | default = None
        integer value specifying the threshold for the maximum number of genes that a cell needs to express
    min_genes: `int` | default = None
        integer value specifying the threshold for the minimum number of genes that a cell needs to express
    max_counts: `int` | default = None
        integer value specifying the maximum number of counts that a cell needs to contain
    min_counts: `int` | default = None
        integer value specifying the minimum number of counts that a cell needs to contain
    min_cells: `int` | default = None
        integer value specifying the minimum number of cells that need to express a gene for it to be included
    max_mito: `float` | default = None
        decimalvalue specifying the threshold for the maximum percentage of mitochondrial genes in a cell
    annotation_type: `SYMBOL` or `ENSEMBLE` or `None` | default = None
        string identifying the type of gene ids contained in adata.var_names, necessary for identifying
        mitochondrial genes in case percent_mito is not already included in adata.obs

    returns
    -------
    AnnData
        filters the :class:`~anndata.AnnData` object and potentially adds either `n_genes` or `n_counts` to `adata.obs`.

    Example
    -------
    >>> import besca as bc
    >>> adata = bc.datasets.simulated_pbmc3k_raw()
    >>> adata = bc.pp.filter(adata, max_counts=6500, max_genes=1900, max_mito=0.05, min_genes=600, min_counts=600, min_cells=2, annotation_type='SYMBOL')

    """
    if isinstance(adata, anndata.AnnData):

        # get original number of cells
        ncells = adata.shape[0]
        ngenes = adata.shape[1]

        logging.info(
            "started with %d total cells and %d total genes",
            ncells, ngenes
        )

        # calculate values if necessary
        if max_counts is not None or min_counts is not None:
            if adata.obs.get("n_counts") is None:
                adata.obs["n_counts"] = adata.X.sum(axis=1)
        if min_genes is not None or max_genes is not None:
            if adata.obs.get("n_genes") is None:
                adata.obs["n_genes"] = sum(adata.X > 0, axis=1)
        if min_cells is not None:
            if adata.var.get("n_cells") is None:
                adata.var["n_cells"] = sum(adata.X > 0, axis=0).T

        if max_mito is not None:
            if adata.obs.get("percent_mito") is None:
                mito_list = get_mito_genes(species, annotation_type)
                mito_genes = [
                    name for name in adata.var_names if name in mito_list
                ]  # ensembl
                # for each cell compute fraction of counts in mito genes vs. all genes
                n_counts = sum(adata.X, axis=1).A1
                n_counts[n_counts == 0] = float("inf")
                adata.obs["percent_mito"] = (
                    sum(adata[:, mito_genes].X, axis=1).A1 / n_counts
                )

        # perform actual filtering for all given parameters
        if max_genes is not None:
            curr_cells = adata.shape[0]
            adata = adata[adata.obs.get("n_genes") <= max_genes, :].copy()
            new_cells = adata.shape[0]
            logging.info(
                "removed %d cells that expressed more than %d genes",
                curr_cells - new_cells,
                max_genes
            )

        if min_genes is not None:
            curr_cells = adata.shape[0]
            adata = adata[adata.obs.get("n_genes") >= min_genes, :].copy()
            new_cells = adata.shape[0]
            logging.info(
                "removed %d cells that did not express at least %d genes",
                curr_cells - new_cells,
                min_genes
            )

        if max_counts is not None:
            curr_cells = adata.shape[0]
            adata = adata[adata.obs.get("n_counts") <= max_counts, :].copy()
            new_cells = adata.shape[0]
            logging.info(
                "removed %d cells that had more than %d counts",
                curr_cells - new_cells,
                max_counts
            )

        if min_counts is not None:
            curr_cells = adata.shape[0]
            adata = adata[adata.obs.get("n_counts") >= min_counts, :].copy()
            new_cells = adata.shape[0]
            logging.info(
                "removed %d cells that did not have at least %d counts",
                curr_cells - new_cells,
                min_counts
            )

        if min_cells is not None:
            curr_genes = adata.shape[1]
            adata = adata[:, adata.var.get("n_cells") >= min_cells].copy()
            new_genes = adata.shape[1]
            logging.info(
                "removed %d genes that were not expressed"
                "in at least %d cells",
                curr_genes - new_genes,
                min_cells,
            )

        if max_mito is not None:
            curr_cells = adata.shape[0]
            adata = adata[adata.obs.get("percent_mito") < max_mito, :].copy()
            new_cells = adata.shape[0]
            logging.info(
                "removed %d cells that expressed"
                "%d%% mitochondrial genes or more",
                curr_cells - new_cells,
                max_mito * 100
            )

        ncells_final = adata.shape[0]
        ngenes_final = adata.shape[1]

        logging.info(
            "finished with %d total cells and total %d genes",
            ncells_final,
            ngenes_final
        )

        return adata

    else:
        logging.error("please pass an AnnData object as data")
        sys.exit(1)


[docs]def filter_gene_list(adata, filepath, use_raw=True, use_genes="SYMBOL"):
    """Function to remove all genes specified in a gene list read from file.

    This function removes all genes in the dataset from a given list of genes (for example mito genes or ribosomal genes).
    Note that the input file consists of two columns (ENSEMBL gene id and gene symbol).

    Parameters
    ----------
    adata: `AnnData`
        AnnData object
    filepath: `str`
        File path as string point to gene list
    use_raw: `bool` | default = True
        Boolian indicator if the function should return the filtered raw object or not.
    use_genes: `SYMBOL` or `ENSEMBL` | default = SYMBOL
        String defining whether ENSEMBL id's or gene symbols are used in the adata.var_names (defines which column of input gene list is read)

    Returns
    -------
    AnnData
        Returns the filtered AnnData object.

    Example
    -------
    >>> import besca as bc
    >>> import os
    >>> adata = bc.datasets.simulated_pbmc3k_raw()
    >>> # TO COMPLETE reference_mito_file = 'path2file.tsv'
    >>> # adata = bc.pp.filter_gene_list(adata, file_path=reference_mito_file, use_genes='SYMBOL')
    >>> # adata.n_vars

    """
    # generate copy of adata object
    if use_raw:
        if adata.raw is None:
            logging.warning(
                "adata does not contain .raw filtering on regular adata object"
            )
            adata = adata.copy()
        else:
            adata = get_raw(adata)
    else:
        adata = adata.copy()

    if use_genes == "SYMBOL":
        gene_list = list(
            read_csv(filepath, header=None, sep="\t")[1]
        )  # ENS_GENE_ID  GENE_SYMBOL (2 cols)
    elif use_genes == "ENSEMBL":
        gene_list = list(
            read_csv(filepath, header=None, sep="\t")[0]
        )  # ENS_GENE_ID  GENE_SYMBOL (2 cols)
    else:
        sys.exit("Please supply either SYMBOL or ENSEMBL ids")
    genes = [i for i in adata.var_names if i not in gene_list]
    # remove all the genes from the list
    if len(genes) > 0:
        gene_indexes = [adata.var_names.tolist().index(x) for x in genes]
        adata = adata[:, gene_indexes]
    else:
        logging.warning(
            "None of the genes from input list found in data set. Please ensure you have correctly specified use_genes to match the type of genes saved in adata.var_names."
        )
    return adata