Source code for neuralee.dataset.brain_large

import h5py
import numpy as np
from scipy.sparse import csr_matrix, vstack
from sklearn.preprocessing import StandardScaler
import os

from .dataset import GeneExpressionDataset

batch_idx_10x = [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
                 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
                 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
                 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
                 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
                 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
                 1, 1, 1, 1, 1, 1, 1]


class BrainLargeDataset(GeneExpressionDataset):
    r"""Loads the brain-large dataset.

    This dataset contains 1.3 million brain cells from `10x Genomics`_.
    We randomly shuffle the data to get a 1M subset of cells and order
    genes by variance to retain the first 10,000 and then 720 sampled
    variable genes.

    :param subsample_size: Number of cells to load (all cells if ``None``).
    :param save_path: Save path of raw data file.
    :param nb_genes_kept: Number of most variable genes to retain.
    :param max_cells: Optional cap on the number of cells loaded.

    Examples::

        gene_dataset = BrainLargeDataset()

    .. _10x Genomics:
        https://support.10xgenomics.com/single-cell-gene-expression/datasets
    """

    def __init__(self, subsample_size=None, save_path='data/',
                 nb_genes_kept=720, max_cells=None):
        self.max_cells = max_cells
        self.subsample_size = subsample_size
        self.save_path = save_path
        self.nb_genes_kept = nb_genes_kept
        self.url = "http://cf.10xgenomics.com/samples/cell-exp/1.3.0/" \
                   "1M_neurons/1M_neurons_filtered_gene_bc_matrices_h5.h5"
        # originally: "1M_neurons_filtered_gene_bc_matrices_h5.h5"
        self.download_name = "genomics.h5"
        Xs = self.download_and_preprocess()
        super().__init__(Xs)
    def preprocess(self):
        print("Preprocessing Brain Large data")
        filtered_matrix_h5 = os.path.join(self.save_path, self.download_name)
        with h5py.File(filtered_matrix_h5, "r") as f:
            dset = f["mm10"]
            n_genes, n_cells = dset["shape"]
            if self.subsample_size is None:
                self.subsample_size = n_cells
            indptr = dset["indptr"][...]

            # Select the most variable genes from a subsample of at most
            # 100,000 cells.
            ns_cells = min(100000, n_cells)  # TODO : remove
            ns_indptr = indptr[:(ns_cells + 1)]
            ns_nnz = ns_indptr[-1]
            ns_data = dset["data"][:ns_nnz].astype(np.float32)
            ns_indices = dset["indices"][:ns_nnz]
            ns_sparse = csr_matrix(
                (ns_data, ns_indices, ns_indptr),
                shape=(ns_cells, n_genes))
            ns_dense = np.log(1 + ns_sparse.toarray())

            # Use standard scaler to select genes ordered by variance.
            std_scaler = StandardScaler(with_mean=False)
            std_scaler.fit(ns_dense)
            subset_genes = \
                np.argsort(std_scaler.var_)[::-1][:self.nb_genes_kept]

            # Load cells in chunks of 100,000, keeping only the selected
            # genes, until the requested subsample size (or max_cells)
            # is reached.
            nb_matrices = []
            nb_cells = 100000
            nb_iters = int(self.subsample_size / nb_cells) + \
                (self.subsample_size % nb_cells > 0)
            for i in range(nb_iters):
                nb_indptr = indptr[(i * nb_cells):((1 + i) * nb_cells + 1)]
                nb_nnz_a = nb_indptr[0]
                nb_nnz_b = nb_indptr[-1]
                nb_indptr = (nb_indptr - nb_nnz_a).astype(np.int32)
                nb2_cells = len(nb_indptr) - 1
                nb_data = dset["data"][nb_nnz_a:nb_nnz_b].astype(np.float32)
                nb_indices = \
                    dset["indices"][nb_nnz_a:nb_nnz_b].astype(np.int32)
                nb_sparse = csr_matrix(
                    (nb_data, nb_indices, nb_indptr),
                    shape=(nb2_cells, n_genes))
                nb_filtered = nb_sparse[:, subset_genes]
                del nb_sparse
                nb_matrices.append(nb_filtered)
                print("loaded {} / {} cells".format(
                    i * nb_cells + nb2_cells, self.subsample_size))
                if self.max_cells and \
                        i * nb_cells + nb2_cells >= self.max_cells:
                    break

        # Stack the chunks and drop cells with no expressed genes.
        matrix = vstack(nb_matrices)
        good_cells = matrix.sum(axis=1) > 0
        good_cells = np.squeeze(np.asarray(good_cells))
        print("excluding {} cells with zero genes expressed".format(
            len(good_cells) - good_cells.sum()))
        matrix = matrix[good_cells, :]
        print("%d cells subsampled" % matrix.shape[0])
        print("%d genes subsampled" % matrix.shape[1])
        return matrix
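
# Usage sketch (illustrative; not part of the original module). The parameter
# values below are assumptions chosen for a smaller, quicker run: a
# 100,000-cell subsample and the default 720 most variable genes. The raw 10x
# file is read from ``save_path`` as "genomics.h5"; fetching it from the URL
# set in ``__init__`` is assumed to be handled by ``download_and_preprocess``.
if __name__ == "__main__":
    gene_dataset = BrainLargeDataset(subsample_size=100000,
                                     save_path='data/',
                                     nb_genes_kept=720)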