Source code for neuralee.dataset.loom

import loompy
import numpy as np
import os
from .dataset import GeneExpressionDataset


class LoomDataset(GeneExpressionDataset):
    r"""Loads a `.loom` file.

    :param filename: Name of the `.loom` file.
    :param save_path: Save path of the dataset.
    :param url: Url of the remote dataset.

    Examples::

        # Loading a remote dataset
        remote_loom_dataset = LoomDataset(
            "osmFISH_SScortex_mouse_all_cell.loom",
            save_path='data/',
            url='http://linnarssonlab.org/osmFISH/'
                'osmFISH_SScortex_mouse_all_cells.loom')

        # Loading a local dataset
        local_loom_dataset = LoomDataset(
            "osmFISH_SScortex_mouse_all_cell.loom",
            save_path='data/')
    """

    def __init__(self, filename, save_path='data/', url=None):
        # These three attributes are read by ``preprocess`` (path) and,
        # presumably, by the inherited download logic -- confirm in the
        # parent class.
        self.download_name = filename
        self.save_path = save_path
        self.url = url

        self.has_gene, self.has_batch, self.has_cluster = False, False, False

        # ``download_and_preprocess`` is not defined in this class; it is
        # expected to come from ``GeneExpressionDataset`` and to return
        # whatever ``preprocess`` returns -- TODO confirm.
        data, batch_indices, labels, gene_names, cell_types = \
            self.download_and_preprocess()

        X, batch_indices_, labels = \
            GeneExpressionDataset.get_attributes_from_matrix(
                data, labels=labels)
        # Prefer the batch annotation stored in the loom file; fall back to
        # the one derived from the matrix when the file has none.
        if batch_indices is None:
            batch_indices = batch_indices_

        super().__init__(X, batch_indices, labels,
                         gene_names=gene_names, cell_types=cell_types)

    def preprocess(self):
        """Read the loom file and extract the expression matrix and metadata.

        The file is opened from ``os.path.join(self.save_path,
        self.download_name)``. Cells whose total expression is zero are
        dropped from the matrix and from the per-cell annotations.

        :return: Tuple ``(data, batch_indices, labels, gene_names,
            cell_types)``. ``data`` is the expression matrix transposed to
            cells by genes. Each of the other entries is ``None`` when the
            corresponding attribute (``BatchID``, ``ClusterID``, ``Gene``,
            ``CellTypes``) is absent from the file.
        """
        print("Preprocessing dataset")
        gene_names, labels, batch_indices, cell_types = None, None, None, None

        ds = loompy.connect(os.path.join(self.save_path, self.download_name))
        try:
            # Read the full matrix once and reuse it both for the cell
            # filter and for the final selection, instead of hitting the
            # file twice.
            matrix = ds[:, :]

            # Keep only cells (columns) that express at least one gene.
            select = matrix.sum(axis=0) > 0

            if 'Gene' in ds.ra:
                gene_names = ds.ra['Gene']

            if 'BatchID' in ds.ca:
                batch_indices = ds.ca['BatchID']
                batch_indices = np.reshape(
                    batch_indices, (batch_indices.shape[0], 1))[select]

            if 'ClusterID' in ds.ca:
                labels = np.array(ds.ca['ClusterID'])
                labels = np.reshape(labels, (labels.shape[0], 1))[select]

            if 'CellTypes' in ds.attrs:
                cell_types = np.array(ds.attrs['CellTypes'])

            # Change matrix to cells by genes.
            data = matrix[:, select].T
        finally:
            # Always release the file handle, even if reading fails.
            ds.close()

        print("Finished preprocessing dataset")
        return data, batch_indices, labels, gene_names, cell_types
class RetinaDataset(LoomDataset):
    r"""Loads retina dataset.

    The dataset of bipolar cells contains after their original pipeline for
    filtering 27,499 cells and 13,166 genes coming from two batches. We use
    the cluster annotation from 15 cell-types from the author.

    :param save_path: Save path of raw data file.

    Examples::

        gene_dataset = RetinaDataset()
    """

    def __init__(self, save_path='data/'):
        # Remote location of the loom file handed to the parent loader.
        retina_url = ('https://github.com/YosefLab/scVI-data/raw/'
                      'master/retina.loom')
        super().__init__(filename='retina.loom',
                         save_path=save_path,
                         url=retina_url)

        # Assigned after the parent constructor so these fixed names replace
        # whatever ``cell_types`` the parent set.
        self.cell_types = [
            "RBC", "MG", "BC5A", "BC7", "BC6",
            "BC5C", "BC1A", "BC3B", "BC1B", "BC2",
            "BC5D", "BC3A", "BC5B", "BC4", "BC8_9",
        ]