import pandas as pd
import numpy as np
import scanpy as sc
import anndata
import os
import pkg_resources

# File paths for Segerstolpe dataset
path_adata = 'segerstolpe.h5ad'
uri_adata = "https://zenodo.org/record/3928276/files/standard_workflow_besca2_0_annotated.h5ad?download=1"
path_annot = './segerstolpe_annot.csv' # output filename of exported annotations (see next cells)
annot_col = 'dblabel' # adata column containing annotations

# Read anndata objects
# Downloads the datasets from the provided URI
adata = sc.read(path_adata, backup_url=uri_adata)

# Conversion to R changes cell annotations to factors 
# -> export cell type annotations to be later rewritten into the R obj
# here we are selecting ['dblabel'] as the annotation column. Modify this for the desired annotation column
annot = adata.obs[annot_col]
annot.to_csv(path_annot, index=False, header=False)

# For SCDC and MuSiC we want expressions stored in AnnData.raw us they contain all sequenced genes, 
# not just highly variable genes filtered further downstream of the BESCA workflow
obs = adata.obs
var = adata.raw.var
uns = adata.uns
raw = adata.raw
# raw.X contains CP10k log scaled data. Thus, we need to linearize using raw.X.expm1()
adata_raw = anndata.AnnData(raw.X.expm1(), obs=obs, var=var, uns=uns, raw=raw)

Fix segerstolpe index - needs to contain patient id ¶

This is required by the BisqueRNA::SeuratToExpressionSet() R package: https://rdrr.io/cran/BisqueRNA/man/SeuratToExpressionSet.html \ The sample index has to contain subscriptable individual patient IDs

Example correct patient ID: ERR1630619_T2D1, where ERR1630619 is the cell UMI and T2D1 is the unique patient ID we want.

We will then specify this in the R script that calls the BisqueRNA::SeuratToExpressionSet() method \ BisqueRNA::SeuratToExpressionSet(seurat.object, delimiter='_', position='2', version = "v3")

delimiter = Character to split cell names with to find individual ID. \ position = Integer indicating 1-indexed position of individual ID after splitting cell name with delimiter. R indexing starts from 1

idx = adata_raw.obs.index
idx_new = []
for i, val in enumerate(idx):
    out = val + '_' +adata_raw.obs['Sample Characteristic[individual]'][i]
    idx_new.append(out)
    
adata_raw.obs.index = idx_new
adata_raw.obs.head()

Convert to SingleCellExperiment R Object¶

saves anndata as SCE object. Postprocess in R

based on tutorial here: https://github.com/LuckyMD/Code_snippets/blob/master/Seurat_to_anndata.ipynb

import anndata2ri
anndata2ri.activate()

%load_ext rpy2.ipython

%%R -i adata_raw
saveRDS(adata_raw, './adata_raw_to_sce.RDS')

Next: postprocess in R to convert SCE --> Eset¶

Use the sce_to_eset.R script generated in the next cell.

Edit the last two lines of the script for the correct input/output file paths:\ seurat <- sce_to_seurat(sce_path='./segerstolpe_raw_sce.RDS', sc_anno_path='./segerstolpe_annot.csv', filename=NULL)\ sce_path = path to SCE file generated above\ sc_anno_path = path to the .csv file containing cell annotations generated in the section above \ \ eset <- seurat_to_eset(seurat, delim='_', idx=2, filename='./segerstolpe_raw_eset.RDS')\ delim = Character to split cell names with to find individual ID. \ idx = Integer indicating 1-indexed position of individual ID after splitting cell name with delimiter. R indexing starts from 1
Set the correct patient ID indexing as explained in the section above
Run from cmd line: env R_MAX_VSIZE=100Gb Rscript sce_to_eset.R

r_code = """library(Seurat)
library(BisqueRNA)

sce_to_seurat <- function(sce_path, sc_anno_path, filename='eset.RDS'){
    sce <- readRDS(sce_path)
    sc_anno <- read.csv(sc_anno_path, header=FALSE)

    seurat <- as.Seurat(sce, counts=NULL, data='X')

# TODO add celltype_dream as param
    seurat@meta.data$celltype_dream <- sc_anno$V1
    Idents(seurat)<-"celltype_dream"
    seurat@assays$RNA@counts<-seurat@assays$RNA@data
    if(!is.null(filename)){
        saveRDS(seurat, file=filename)
    }
    return(seurat)

}

seurat_to_eset <- function(seurat, delim='-', idx=2, filename='seurat.RDS'){
    # Idents(liver_h5ad) %>% head() # to check sample names
    out.eset <- BisqueRNA::SeuratToExpressionSet(seurat, delim, idx, version = 'v3')
    # fix GeneID
    out.eset@featureData@data$GeneID <- featureNames(out.eset)
    #assign cluster to just characters
    out.eset@phenoData@data$cluster<-as.character(out.eset@phenoData@data$cellType)
    #update vardata
    varMetadata(out.eset)<-data.frame(labelDescription=colnames(out.eset@phenoData@data),stringsAsFactors=FALSE)

    out.eset@featureData@varMetadata<-data.frame(labelDescription="GeneID",stringsAsFactors=FALSE)

    rownames(out.eset@featureData@varMetadata)<-"GeneID"

    saveRDS(out.eset, file=filename)
    return(out.eset)
}


seurat <- sce_to_seurat(sce_path='./segerstolp_raw_exp_sce.RDS', sc_anno_path='./segerstolpe_annot.csv', filename=NULL)
eset <- seurat_to_eset(seurat, delim='_', idx=1, filename='./segerstolpe_raw_exp_eset.RDS')
"""

if not os.path.isfile('sce_to_eset.R'):
    with open('sce_to_eset.R','w') as f:
        f.write(r_code)