Source code for neuralee.dataset.anndata

from .dataset import GeneExpressionDataset
import anndata
import numpy as np
import os


[docs]class AnnDataset(GeneExpressionDataset): r"""Loads a `.h5ad` file . AnnDataset class supports loading `Anndata`_ object. :param filename: Name of the `.h5ad` file. :param save_path: Save path of the dataset. :param url: Url of the remote dataset. :param new_n_genes: Number of subsampled genes. :param subset_genes: List of genes for subsampling. Examples:: # Loading a local dataset local_ann_dataset = AnnDataset( "TM_droplet_mat.h5ad", save_path = 'data/') .. _Anndata: http://anndata.readthedocs.io/en/latest/ """ def __init__(self, filename, save_path='data/', url=None, new_n_genes=False, subset_genes=None): self.download_name = filename self.save_path = save_path self.url = url data, gene_names = self.download_and_preprocess() super().__init__( *GeneExpressionDataset.get_attributes_from_matrix(data), gene_names=gene_names) self.subsample_genes( new_n_genes=new_n_genes, subset_genes=subset_genes)
[docs] def preprocess(self): print("Preprocessing dataset") # obs = cells, var = genes ad = anndata.read_h5ad( os.path.join(self.save_path, self.download_name)) # provide access to observation annotations # from the underlying AnnData object. self.obs = ad.obs gene_names = np.array(ad.var.index.values, dtype=str) if isinstance(ad.X, np.ndarray): data = ad.X.copy() # Dense else: data = ad.X.toarray() # Sparse # Take out cells that doesn't express any gene select = data.sum(axis=1) > 0 data = data[select, :] print("Finished preprocessing dataset") return data, gene_names