import h5py
import numpy as np
from scipy.sparse import csr_matrix, vstack
from sklearn.preprocessing import StandardScaler
import os
from .dataset import GeneExpressionDataset
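# Batch labels (0/1) for the 133 entries below, presumably one per 10x
# library making up the 1.3M-neuron run; kept for reference, not used by
# the class in this module.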
batch_idx_10x = [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1]
class BrainLargeDataset(GeneExpressionDataset):
    r"""Loads the brain-large dataset.

    This dataset contains 1.3 million brain cells from `10x Genomics`_.
    The loader keeps the ``nb_genes_kept`` most variable genes (720 by
    default, ranked by variance of the log-transformed counts on a first
    chunk of cells) and optionally subsamples the cells.

    :param subsample_size: Number of cells to load (defaults to all cells).
    :param save_path: Save path of the raw data file.
    :param nb_genes_kept: Number of high-variance genes to keep.
    :param max_cells: Stop loading chunks once at least this many cells
        have been read.

    Examples::

        gene_dataset = BrainLargeDataset()

    .. _10x Genomics:
        https://support.10xgenomics.com/single-cell-gene-expression/datasets
    """
    def __init__(self, subsample_size=None, save_path='data/',
                 nb_genes_kept=720, max_cells=None):
        self.max_cells = max_cells
        self.subsample_size = subsample_size
        self.save_path = save_path
        self.nb_genes_kept = nb_genes_kept
        self.url = "http://cf.10xgenomics.com/samples/cell-exp/1.3.0/" \
                   "1M_neurons/1M_neurons_filtered_gene_bc_matrices_h5.h5"
        # originally: "1M_neurons_filtered_gene_bc_matrices_h5.h5"
        self.download_name = "genomics.h5"
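        # download_and_preprocess presumably fetches self.url into
        # save_path/download_name if needed, then calls self.preprocess();
        # it is provided by the GeneExpressionDataset base class.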
        Xs = self.download_and_preprocess()
        super().__init__(Xs)
    def preprocess(self):
        print("Preprocessing Brain Large data")
        filtered_matrix_h5 = os.path.join(self.save_path, self.download_name)
        with h5py.File(filtered_matrix_h5, "r") as f:
            dset = f["mm10"]
            n_genes, n_cells = dset["shape"]
            if self.subsample_size is None:
                self.subsample_size = n_cells
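            # The 10x HDF5 file stores a CSC matrix of shape (genes, cells);
            # reading the same (data, indices, indptr) arrays as CSR with
            # shape (cells, genes) therefore yields the transpose for free:
            # indptr delimits cells and indices hold gene ids.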
            indptr = dset['indptr'][...]
            ns_cells = min(100000, n_cells)  # TODO: remove
            ns_indptr = indptr[:(ns_cells + 1)]
            ns_nnz = ns_indptr[-1]
            ns_data = dset["data"][:ns_nnz].astype(np.float32)
            ns_indices = dset["indices"][:ns_nnz]
            ns_sparse = csr_matrix((ns_data, ns_indices, ns_indptr),
                                   shape=(ns_cells, n_genes))
            ns_dense = np.log(1 + ns_sparse.toarray())
            # Use a standard scaler to rank genes by variance
            std_scaler = StandardScaler(with_mean=False)
            std_scaler.fit(ns_dense)
            subset_genes = \
                np.argsort(std_scaler.var_)[::-1][:self.nb_genes_kept]
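            # subset_genes: indices of the nb_genes_kept genes with the
            # largest log-count variance over this first chunk of cells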
            nb_matrices = []
            nb_cells = 100000
            nb_iters = int(self.subsample_size / nb_cells) + \
                (self.subsample_size % nb_cells > 0)
            for i in range(nb_iters):
                nb_indptr = indptr[(i * nb_cells):((1 + i) * nb_cells + 1)]
                nb_nnz_a = nb_indptr[0]
                nb_nnz_b = nb_indptr[-1]
                nb_indptr = (nb_indptr - nb_nnz_a).astype(np.int32)
                nb2_cells = len(nb_indptr) - 1
                nb_data = dset["data"][nb_nnz_a:nb_nnz_b].astype(np.float32)
                nb_indices = \
                    dset["indices"][nb_nnz_a:nb_nnz_b].astype(np.int32)
                nb_sparse = csr_matrix(
                    (nb_data, nb_indices, nb_indptr),
                    shape=(nb2_cells, n_genes))
                nb_filtered = nb_sparse[:, subset_genes]
                del nb_sparse
                nb_matrices.append(nb_filtered)
                print("loaded {} / {} cells".format(i * nb_cells + nb2_cells,
                                                    self.subsample_size))
                if self.max_cells and \
                        i * nb_cells + nb2_cells >= self.max_cells:
                    break
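            # Stack the per-chunk (cells x nb_genes_kept) blocks, then drop
            # cells with no counts at all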
            matrix = vstack(nb_matrices)
            good_cells = matrix.sum(axis=1) > 0
            good_cells = np.squeeze(np.asarray(good_cells))
            print("excluding {} cells with zero genes expressed".format(
                len(good_cells) - good_cells.sum()))
            matrix = matrix[good_cells, :]
            print("%d cells subsampled" % matrix.shape[0])
            print("%d genes subsampled" % matrix.shape[1])
            return matrix
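

# A minimal usage sketch, assuming the ~4 GB raw HDF5 file is (or will be
# downloaded) under ``save_path``, and that the base class exposes the
# counts as an ``X`` attribute (an assumption about GeneExpressionDataset,
# not something this module guarantees). Because of the relative import
# above, run it from within the package, e.g. ``python -m <package>.brain_large``.
if __name__ == "__main__":
    dataset = BrainLargeDataset(subsample_size=100000, save_path="data/")
    print("subsampled matrix:", dataset.X.shape)  # expected: (~100000, 720)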