Source code for neuralee.dataset.dataset10X

# On the 10X website:
# The main categories (Cell Ranger 1.1.0 / Cell Ranger 2.1.0 / ...)
# have the same access suffix for each of their datasets.
# For each dataset name (e.g. 'pbmc8k', 'pbmc4k', etc.) there are two
# available specifications: either filtered or raw data.
import os
import pickle
import tarfile

import numpy as np
import pandas as pd
from scipy import io
from scipy.sparse import csr_matrix

from .dataset import GeneExpressionDataset

# Names of the datasets that can be fetched remotely, keyed by the
# Cell Ranger category ("group") they are listed under.
available_datasets = {
    "1.1.0": [
        "frozen_pbmc_donor_a",
        "frozen_pbmc_donor_b",
        "frozen_pbmc_donor_c",
        "fresh_68k_pbmc_donor_a",
        "cd14_monocytes",
        "b_cells",
        "cd34",
        "cd56_nk",
        "cd4_t_helper",
        "regulatory_t",
        "naive_t",
        "memory_t",
        "cytotoxic_t",
        "naive_cytotoxic",
    ],
    "2.1.0": [
        "pbmc8k",
        "pbmc4k",
        "t_3k",
        "t_4k",
        "neuron_9k",
    ],
}

# Reverse lookup: dataset name -> the group it is listed under.
to_groups = {
    dataset_name: group
    for group, list_datasets in available_datasets.items()
    for dataset_name in list_datasets
}

# The two flavours in which each dataset is published.
available_specification = ["filtered", "raw"]


class Dataset10X(GeneExpressionDataset):
    r""" Loads a file from `10x`_ website.

    :param filename: Name of the dataset file.
    :param save_path: Save path of the dataset.
    :param type: Either `filtered` data or `raw` data.
    :param dense: Whether to load as dense or sparse.
    :param remote: Whether the 10X dataset is to be downloaded from the
        website or whether it is a local dataset, if remote is False then
        os.path.join(save_path, filename) must be the path to the directory
        that contains matrix.mtx and genes.tsv files
    :param genecol: Index of the column of genes.tsv to use as gene names.

    Examples::

        tenX_dataset = Dataset10X("neuron_9k")

    .. _10x:
        http://cf.10xgenomics.com/

    """

    def __init__(self, filename, save_path='data/', type='filtered',
                 dense=False, remote=True, genecol=0):
        """Resolve where the data lives, then load it into the dataset.

        :raises KeyError: if ``remote`` is True and ``filename`` is not a
            key of ``to_groups``.
        :raises AssertionError: if ``remote`` is False and
            ``os.path.join(save_path, filename)`` is not a directory.
        """
        self.remote = remote
        self.save_path = save_path
        self.genecol = genecol
        if self.remote:
            group = to_groups[filename]
            self.url = ("http://cf.10xgenomics.com/samples/cell-exp"
                        "/%s/%s/%s_%s_gene_bc_matrices.tar.gz"
                        % (group, filename, filename, type))
            self.save_path = os.path.join(save_path, '10X/%s/' % filename)
            self.save_name = '%s_gene_bc_matrices' % type
            self.download_name = self.save_name + '.tar.gz'
        else:
            local_dir = os.path.join(self.save_path, filename)
            # Explicit check instead of ``assert`` so the validation still
            # runs under ``python -O``; the exception type is kept so that
            # existing callers catching AssertionError keep working.
            if not os.path.isdir(local_dir):
                print("The file %s was not found in the location you gave"
                      % filename)
                raise AssertionError(
                    "The file %s was not found in the location you gave"
                    % filename)
            self.save_path = local_dir

        self.dense = dense

        # ``download_and_preprocess`` is not defined in this class; it is
        # expected to come from the parent class and to return the same
        # pair as ``preprocess`` -- TODO confirm against GeneExpressionDataset.
        expression_data, gene_names = self.download_and_preprocess()
        super().__init__(
            *GeneExpressionDataset.get_attributes_from_matrix(
                expression_data),
            gene_names=gene_names)

    def preprocess(self):
        """Read the 10X matrix files and return the expression data.

        For a remote dataset the downloaded archive is extracted first (only
        when the save directory holds a single entry) and the matrix
        directory is located two levels below ``self.save_path``.  For a
        local dataset the files are read directly from ``self.save_path``.

        Sets ``self.barcodes`` (a DataFrame) only when a ``barcodes.tsv``
        file is present next to the matrix.

        :return: ``(expression_data, gene_names)`` where ``expression_data``
            is the transposed content of ``matrix.mtx`` (dense array if
            ``self.dense`` else CSR matrix) and ``gene_names`` is a 1-D
            array of strings taken from column ``self.genecol`` of
            ``genes.tsv``.
        """
        print("Preprocessing dataset")

        path = self.save_path
        if self.remote:
            if len(os.listdir(self.save_path)) == 1:
                # Only the downloaded archive is present: nothing has been
                # extracted yet.
                print("Extracting tar file")
                archive = os.path.join(self.save_path, self.download_name)
                # NOTE(review): the archive comes from the network and is
                # extracted without member filtering -- consider a tarfile
                # extraction filter on Python versions that support it.
                with tarfile.open(archive, "r:gz") as tar:
                    tar.extractall(path=self.save_path)

            # First sub-directory of the save path, then its first entry.
            subdirs = [
                name for name in os.listdir(self.save_path)
                if os.path.isdir(os.path.join(self.save_path, name))
            ]
            path = os.path.join(self.save_path, subdirs[0])
            path = os.path.join(path, os.listdir(path)[0])

        genes_info = pd.read_csv(
            os.path.join(path, 'genes.tsv'), sep='\t', header=None)
        # ``np.str`` was only an alias of the builtin ``str`` and has been
        # removed from NumPy; the builtin gives the same result.
        gene_names = genes_info.values[:, self.genecol].astype(str).ravel()

        barcodes_file = os.path.join(path, 'barcodes.tsv')
        if os.path.exists(barcodes_file):
            self.barcodes = pd.read_csv(barcodes_file, sep='\t', header=None)

        # Transpose the matrix read from matrix.mtx before handing it on.
        expression_data = io.mmread(os.path.join(path, 'matrix.mtx')).T
        if self.dense:
            expression_data = expression_data.toarray()
        else:
            expression_data = csr_matrix(expression_data)

        print("Finished preprocessing dataset")
        return expression_data, gene_names
class BrainSmallDataset(Dataset10X):
    r"""
    This dataset consists of 9,128 mouse brain cells profiled using
    `10x Genomics`_ and is used as a complement of PBMC for our study of zero
    abundance and quality control metrics correlation with our generative
    posterior parameters. We derived quality control metrics using the
    cellrangerRkit R package (v.1.1.0). Quality metrics were extracted from
    CellRanger throughout the molecule specific information file. We kept the
    top 3000 genes by variance. We used the clusters provided by cellRanger
    for the correlation analysis of zero probabilities.

    :param save_path: Save path of raw data file.

    Examples::

        gene_dataset = BrainSmallDataset()

    .. _10x Genomics:
        https://support.10xgenomics.com/single-cell-gene-expression/datasets

    """

    def __init__(self, save_path='data/'):
        """Load the ``neuron_9k`` 10X dataset and attach its metadata.

        Builds a :class:`Dataset10X` for ``"neuron_9k"``, downloads
        ``brain_small_metadata.pickle`` into ``save_path``, and uses the
        dataset's barcodes to select the matching rows of the ``clusters``
        and ``raw_qc`` metadata tables.

        Sets ``raw_qc``, ``qc_names`` and ``qc`` on the instance; the
        cluster ids, shifted down by one, are passed on as labels.
        """
        dataset = Dataset10X(filename="neuron_9k", save_path=save_path)

        self.save_path = save_path
        self.urls = ['https://github.com/YosefLab/scVI-data/raw/master/'
                     'brain_small_metadata.pickle']
        self.download_names = ['brain_small_metadata.pickle']
        self.download()

        metadata_path = os.path.join(
            self.save_path, 'brain_small_metadata.pickle')
        # NOTE(review): this unpickles a file fetched from the network;
        # pickle can execute arbitrary code, so the source must be trusted.
        # The context manager closes the file handle deterministically.
        with open(metadata_path, 'rb') as metadata_file:
            metadata = pickle.load(metadata_file)

        # Barcodes of the loaded cells, used to index both metadata tables.
        barcodes = dataset.barcodes.values.ravel()

        labels = metadata['clusters'].loc[barcodes] - 1
        self.raw_qc = metadata['raw_qc'].loc[barcodes]
        self.qc_names = self.raw_qc.columns
        self.qc = self.raw_qc.values

        GeneExpressionDataset.__init__(
            self, dataset.Y, batch_indices=dataset.batch_indices,
            labels=labels)