Source code for neuralee.dataset.dataset10X

# On the 10X website:
# The main categories (Cell Ranger 1.1.0 / Cell Ranger 2.1.0 / ...)
# have same access suffix for each of their dataset.
# For dataset name (eg. 'pbmc8k', 'pbmc4k', ect...) their are two available
# specifications, either filtered or raw data
import os
import pickle
import tarfile

import numpy as np
import pandas as pd
from scipy import io
from scipy.sparse import csr_matrix

from .dataset import GeneExpressionDataset

available_datasets = {"1.1.0":
                      ["frozen_pbmc_donor_a",
                       "frozen_pbmc_donor_b",
                       "frozen_pbmc_donor_c",
                       "fresh_68k_pbmc_donor_a",
                       "cd14_monocytes",
                       "b_cells",
                       "cd34",
                       "cd56_nk",
                       "cd4_t_helper",
                       "regulatory_t",
                       "naive_t",
                       "memory_t",
                       "cytotoxic_t",
                       "naive_cytotoxic"
                       ],
                      "2.1.0":
                      ["pbmc8k",
                       "pbmc4k",
                       "t_3k",
                       "t_4k",
                       "neuron_9k"]}

to_groups = dict([(dataset_name, group)
                  for group, list_datasets in available_datasets.items()
                  for dataset_name in list_datasets])
available_specification = ['filtered', 'raw']


[docs]class Dataset10X(GeneExpressionDataset):
    r""" Loads a file from `10x`_ website.

    :param filename: Name of the dataset file.
    :param save_path: Save path of the dataset.
    :param type: Either `filtered` data or `raw` data.
    :param subset_genes: List of genes for subsampling.
    :param dense: Whether to load as dense or sparse.
    :param remote: Whether the 10X dataset is to be downloaded from the website
     or whether it is a local dataset, if remote is False
     then os.path.join(save_path, filename) must be the path
     to the directory that contains matrix.mtx and genes.tsv files

    Examples::

        tenX_dataset = Dataset10X("neuron_9k")

    .. _10x:
        http://cf.10xgenomics.com/

    """

    def __init__(self, filename, save_path='data/', type='filtered',
                 dense=False, remote=True, genecol=0):

        self.remote = remote
        self.save_path = save_path
        self.genecol = genecol
        if self.remote:
            group = to_groups[filename]
            self.url = ("http://cf.10xgenomics.com/samples/cell-exp"
                        "/%s/%s/%s_%s_gene_bc_matrices.tar.gz" %
                        (group, filename, filename, type))
            self.save_path = os.path.join(save_path, '10X/%s/' % filename)
            self.save_name = '%s_gene_bc_matrices' % type
            self.download_name = self.save_name + '.tar.gz'
        else:
            try:
                assert os.path.isdir(os.path.join(self.save_path, filename))
            except AssertionError:
                print("The file %s was not found in the location you gave" %
                      filename)
                raise
            self.save_path = os.path.join(self.save_path, filename)

        self.dense = dense

        expression_data, gene_names = self.download_and_preprocess()
        super().__init__(*GeneExpressionDataset.get_attributes_from_matrix(
            expression_data), gene_names=gene_names)

[docs]    def preprocess(self):
        print("Preprocessing dataset")
        path = self.save_path
        if self.remote:
            if len(os.listdir(self.save_path)) == 1:  # nothing extracted yet
                print("Extracting tar file")
                tar = tarfile.open(
                    os.path.join(self.save_path, self.download_name), "r:gz")
                tar.extractall(path=self.save_path)
                tar.close()

            path = (os.path.join(self.save_path,
                    [name for name in os.listdir(self.save_path)
                     if os.path.isdir(os.path.join(self.save_path, name))][0]))
            path = os.path.join(path, os.listdir(path)[0])
        genes_info = pd.read_csv(
            os.path.join(path, 'genes.tsv'), sep='\t', header=None)
        gene_names = genes_info.values[:, self.genecol].astype(np.str).ravel()
        if os.path.exists(os.path.join(path, 'barcodes.tsv')):
            self.barcodes = pd.read_csv(os.path.join(path, 'barcodes.tsv'),
                                        sep='\t', header=None)
        # print(genes_info)
        # self.gene_symbols = \
        #     genes_info.values[:, self.genecol].astype(np.str).ravel()
        expression_data = io.mmread(os.path.join(path, 'matrix.mtx')).T
        if self.dense:
            expression_data = expression_data.A
        else:
            expression_data = csr_matrix(expression_data)

        print("Finished preprocessing dataset")
        return expression_data, gene_names


[docs]class BrainSmallDataset(Dataset10X):
    r"""
    This dataset consists in 9,128 mouse brain cells profiled using
    `10x Genomics`_ is used as a complement of PBMC for our study of zero
    abundance and quality control metrics correlation with our generative
    posterior parameters. We derived quality control metrics using the
    cellrangerRkit R package (v.1.1.0). Quality metrics were extracted from
    CellRanger throughout the molecule specific information file. We kept the
    top 3000 genes by variance. We used the clusters provided by cellRanger
    for the correlation analysis of zero probabilities.

    :param save_path: Save path of raw data file.

    Examples::

        gene_dataset = BrainSmallDataset()

    .. _10x Genomics:
        https://support.10xgenomics.com/single-cell-gene-expression/datasets

    """

    def __init__(self, save_path='data/'):
        dataset = Dataset10X(filename="neuron_9k", save_path=save_path)
        self.save_path = save_path
        self.urls = ['https://github.com/YosefLab/scVI-data/raw/master/'
                     'brain_small_metadata.pickle']
        self.download_names = ['brain_small_metadata.pickle']
        self.download()

        metadata = pickle.load(open(os.path.join(
            self.save_path, 'brain_small_metadata.pickle'), 'rb'))
        labels = metadata['clusters'].loc[dataset.barcodes.values.ravel()] - 1

        self.raw_qc = metadata['raw_qc'].loc[dataset.barcodes.values.ravel()]
        self.qc_names = self.raw_qc.columns
        self.qc = self.raw_qc.values

        GeneExpressionDataset.__init__(
            self, dataset.Y, batch_indices=dataset.batch_indices,
            labels=labels)