import pandas as pd
import numpy as np
import scanpy as sc
import anndata
import os
import pkg_resources
# File paths for Segerstolpe dataset
path_adata = 'segerstolpe.h5ad'
uri_adata = "https://zenodo.org/record/3928276/files/standard_workflow_besca2_0_annotated.h5ad?download=1"
path_annot = './segerstolpe_annot.csv' # output filename of exported annotations (see next cells)
annot_col = 'dblabel' # adata column containing annotations
# Read anndata objects
# Downloads the datasets from the provided URI
adata = sc.read(path_adata, backup_url=uri_adata)
# Conversion to R changes cell annotations to factors
# -> export cell type annotations to be later rewritten into the R obj
# here we are selecting ['dblabel'] as the annotation column. Modify this for the desired annotation column
annot = adata.obs[annot_col]
annot.to_csv(path_annot, index=False, header=False)
# For SCDC and MuSiC we want expressions stored in AnnData.raw us they contain all sequenced genes,
# not just highly variable genes filtered further downstream of the BESCA workflow
obs = adata.obs
var = adata.raw.var
uns = adata.uns
raw = adata.raw
# raw.X contains CP10k log scaled data. Thus, we need to linearize using raw.X.expm1()
adata_raw = anndata.AnnData(raw.X.expm1(), obs=obs, var=var, uns=uns, raw=raw)
This is required by the BisqueRNA::SeuratToExpressionSet() R package: https://rdrr.io/cran/BisqueRNA/man/SeuratToExpressionSet.html \ The sample index has to contain subscriptable individual patient IDs
Example correct patient ID: ERR1630619_T2D1, where ERR1630619 is the cell UMI and T2D1 is the unique patient ID we want.
We will then specify this in the R script that calls the BisqueRNA::SeuratToExpressionSet()
method \
BisqueRNA::SeuratToExpressionSet(seurat.object, delimiter='_', position='2', version = "v3")
delimiter
= Character to split cell names with to find individual ID. \
position
= Integer indicating 1-indexed position of individual ID after splitting cell name with delimiter. R indexing starts from 1
idx = adata_raw.obs.index
idx_new = []
for i, val in enumerate(idx):
out = val + '_' +adata_raw.obs['Sample Characteristic[individual]'][i]
idx_new.append(out)
adata_raw.obs.index = idx_new
adata_raw.obs.head()
saves anndata as SCE object. Postprocess in R
based on tutorial here: https://github.com/LuckyMD/Code_snippets/blob/master/Seurat_to_anndata.ipynb
import anndata2ri
anndata2ri.activate()
%load_ext rpy2.ipython
%%R -i adata_raw
saveRDS(adata_raw, './adata_raw_to_sce.RDS')
Use the sce_to_eset.R
script generated in the next cell.
seurat <- sce_to_seurat(sce_path='./segerstolpe_raw_sce.RDS', sc_anno_path='./segerstolpe_annot.csv', filename=NULL)
\
sce_path
= path to SCE file generated above\
sc_anno_path
= path to the .csv file containing cell annotations generated in the section above
\
\
eset <- seurat_to_eset(seurat, delim='_', idx=2, filename='./segerstolpe_raw_eset.RDS')
\
delim
= Character to split cell names with to find individual ID. \
idx
= Integer indicating 1-indexed position of individual ID after splitting cell name with delimiter. R indexing starts from 1env R_MAX_VSIZE=100Gb Rscript sce_to_eset.R
r_code = """library(Seurat)
library(BisqueRNA)
sce_to_seurat <- function(sce_path, sc_anno_path, filename='eset.RDS'){
sce <- readRDS(sce_path)
sc_anno <- read.csv(sc_anno_path, header=FALSE)
seurat <- as.Seurat(sce, counts=NULL, data='X')
# TODO add celltype_dream as param
seurat@meta.data$celltype_dream <- sc_anno$V1
Idents(seurat)<-"celltype_dream"
seurat@assays$RNA@counts<-seurat@assays$RNA@data
if(!is.null(filename)){
saveRDS(seurat, file=filename)
}
return(seurat)
}
seurat_to_eset <- function(seurat, delim='-', idx=2, filename='seurat.RDS'){
# Idents(liver_h5ad) %>% head() # to check sample names
out.eset <- BisqueRNA::SeuratToExpressionSet(seurat, delim, idx, version = 'v3')
# fix GeneID
out.eset@featureData@data$GeneID <- featureNames(out.eset)
#assign cluster to just characters
out.eset@phenoData@data$cluster<-as.character(out.eset@phenoData@data$cellType)
#update vardata
varMetadata(out.eset)<-data.frame(labelDescription=colnames(out.eset@phenoData@data),stringsAsFactors=FALSE)
out.eset@featureData@varMetadata<-data.frame(labelDescription="GeneID",stringsAsFactors=FALSE)
rownames(out.eset@featureData@varMetadata)<-"GeneID"
saveRDS(out.eset, file=filename)
return(out.eset)
}
seurat <- sce_to_seurat(sce_path='./segerstolp_raw_exp_sce.RDS', sc_anno_path='./segerstolpe_annot.csv', filename=NULL)
eset <- seurat_to_eset(seurat, delim='_', idx=1, filename='./segerstolpe_raw_exp_eset.RDS')
"""
if not os.path.isfile('sce_to_eset.R'):
with open('sce_to_eset.R','w') as f:
f.write(r_code)