Source code for neuralee.dataset.cite_seq

import numpy as np
import pandas as pd
import os
from .dataset import GeneExpressionDataset

# Short dataset key -> name fragment substituted into the GSE100866 file
# names that CiteSeqDataset downloads.
available_datasets = dict(
    cbmc="CBMC_8K_13AB_10X",
    pbmc="PBMC_vs_flow_10X",
    cd8="CD8_merged",
)


class CiteSeqDataset(GeneExpressionDataset):
    """CITE-seq dataset built from the GEO series GSE100866 supplementary files.

    Three gzip-compressed CSV files are associated with each dataset: RNA UMI
    counts, ADT (antibody-derived tag) UMI counts, and the authors'
    clr-transformed ADT values. The RNA counts, restricted to genes whose
    name starts with ``HUMAN``, form the expression matrix handed to
    :class:`GeneExpressionDataset`.

    :param name: Key of ``available_datasets`` selecting the dataset.
    :param save_path: Root directory for the raw files; the files of a given
        dataset live in the sub-directory ``save_path/name``.
    :raises KeyError: If ``name`` is not a key of ``available_datasets``.

    Attributes set by :meth:`preprocess`: ``expression``, ``adt``,
    ``adt_expression``, ``protein_markers``, ``adt_centered``,
    ``adt_expression_clr`` and ``gene_symbols``.
    """

    def __init__(self, name='cbmc', save_path='data/citeSeq/'):
        self.save_path = os.path.join(save_path, name)

        try:
            geo_name = available_datasets[name]
        except KeyError:
            # Same exception type as the bare lookup, with a usable message.
            raise KeyError(
                "Unknown CITE-seq dataset %r; expected one of %s"
                % (name, sorted(available_datasets))
            ) from None

        # All three files share the same location and name prefix on GEO.
        url_prefix = (
            "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE100nnn/GSE100866/suppl/"
            "GSE100866_%s-" % geo_name
        )
        # Order matters: RNA, ADT, clr-transformed ADT (matches
        # ``download_names`` below).
        self.urls = [
            url_prefix + "RNA_umi.csv.gz",
            url_prefix + "ADT_umi.csv.gz",
            url_prefix + "ADT_clr-transformed.csv.gz",
        ]

        # The authors' centered log ratio transformation of the ADT counts
        # differs from the standard clr transformation: they explain that
        # they add pseudocounts (for 0 values) but do not state the actual
        # transformation explicitly, and it does not seem to be simply
        # adding a count of 1 to all entries, or only to the 0 entries.
        self.download_name_rna = '%s_rna.csv.gz' % name
        self.download_name_adt = '%s_adt.csv.gz' % name
        self.download_name_adt_centered = '%s_adt_centered.csv.gz' % name
        self.download_names = (
            self.download_name_rna,
            self.download_name_adt,
            self.download_name_adt_centered,
        )

        # NOTE(review): ``download_and_preprocess`` is inherited; presumably
        # it fetches ``urls`` into ``save_path`` under ``download_names`` and
        # then calls ``preprocess`` -- confirm against the base class.
        expression_data = self.download_and_preprocess()

        super().__init__(
            *GeneExpressionDataset.get_attributes_from_matrix(expression_data)
        )

    def preprocess(self):
        """Load the three downloaded CSV files and build the expression matrix.

        Reads the RNA, ADT and clr-transformed ADT files from ``save_path``,
        stores them (and derived arrays) on the instance, checks that both
        ADT files list the same protein markers in the same order, and keeps
        only the genes whose name starts with ``HUMAN``.

        :return: The RNA count values restricted to the HUMAN genes, as a
            NumPy array whose columns follow ``self.gene_symbols``.
        :raises AssertionError: If the protein markers of the two ADT files
            differ.
        """
        print("Preprocessing data")

        # The RNA file is transposed so that its columns are the gene names.
        self.expression = expression = pd.read_csv(
            os.path.join(self.save_path, self.download_name_rna),
            index_col=0, compression='gzip').T

        self.adt = adt = pd.read_csv(
            os.path.join(self.save_path, self.download_name_adt),
            index_col=0, compression='gzip')
        self.adt_expression = adt.T.values
        # ``np.str`` was removed in NumPy 1.24; the builtin is equivalent.
        self.protein_markers = np.array(adt.index).astype(str)

        self.adt_centered = adt_centered = pd.read_csv(
            os.path.join(self.save_path, self.download_name_adt_centered),
            index_col=0, compression='gzip')
        self.adt_expression_clr = adt_centered.T.values

        centered_markers = np.array(adt_centered.index).astype(str)
        # ``array_equal`` also copes with indexes of different lengths,
        # where an element-wise ``==`` would not yield a boolean array.
        assert np.array_equal(self.protein_markers, centered_markers), (
            "Protein markers differ between the ADT and the "
            "clr-transformed ADT files"
        )

        gene_symbols = np.array(expression.columns, dtype=str)
        human_filter = np.char.startswith(gene_symbols, 'HUMAN')
        print("Selecting only HUMAN genes (%d / %d)"
              % (human_filter.sum(), len(human_filter)))
        expression_data = expression.values[:, human_filter]
        gene_symbols = gene_symbols[human_filter]

        # Keep only the part after the last underscore (the whole name when
        # there is none), upper-cased.
        self.gene_symbols = np.char.upper(
            np.array(
                [symbol.rpartition('_')[2] for symbol in gene_symbols],
                dtype=str)
        )

        print("Finish preprocessing data")
        return expression_data
class CbmcDataset(CiteSeqDataset):
    r"""Load the ``cbmc`` CITE-seq dataset.

    The dataset includes 8,617 cord blood mononuclear cells profiled using
    10x, along with 13 well-characterized mononuclear antibodies measured
    for each cell.

    NOTE(review): earlier documentation stated "We kept the top 600 genes by
    variance"; no such filtering is performed by this class itself -- confirm
    where (or whether) it is applied.

    :param save_path: Save path of raw data file.

    Examples::

        gene_dataset = CbmcDataset()

    """

    def __init__(self, save_path='data/citeSeq/'):
        # Pin the dataset key; only the storage location is configurable.
        super(CbmcDataset, self).__init__(save_path=save_path, name="cbmc")