Source code for neuralee.dataset.csv

from .dataset import GeneExpressionDataset
import pandas as pd
import numpy as np
import os


class CsvDataset(GeneExpressionDataset):
    r"""Gene expression dataset read from a delimited text (``.csv``) file.

    :param filename: Name of the `.csv` file; it is looked up inside
        ``save_path``.
    :param save_path: Directory that holds the dataset files.
    :param url: Url of the remote dataset. It is only stored on the
        instance here; fetching is presumably done by the inherited
        ``download_and_preprocess`` (not visible in this module).
    :param new_n_genes: Number of subsampled genes, forwarded to
        ``subsample_genes``.
    :param subset_genes: List of genes for subsampling, forwarded to
        ``subsample_genes``.
    :param compression: Forwarded to ``pandas.read_csv`` for on-the-fly
        decompression of on-disk data. With 'infer' and a path-like input,
        pandas detects the compression from the extensions '.gz', '.bz2',
        '.zip' or '.xz' (otherwise no decompression). With 'zip', the
        archive must contain exactly one data file.
    :param sep: Field delimiter forwarded to ``pandas.read_csv``.
    :param gene_by_cell: Whether the file is laid out genes by cells. When
        true, the matrix is transposed right after it is read.
    :param labels_file: Name of the `.csv` file with labels, looked up
        inside ``save_path``. The first row is read as the header and the
        first column as the index; the remaining values become the labels
        and their unique values the cell types.
    :param batch_ids_file: Name of the `.csv` file with batch indices,
        looked up inside ``save_path``. The first row is read as the
        header and the first column as the index; the remaining values
        are used as batch indices (expected to be integers).

    Examples::

        # Loading a remote dataset
        remote_url = "https://www.ncbi.nlm.nih.gov/geo/download/" \
            "?acc=GSE100866&format=file&file=" \
            "GSE100866%5FCBMC%5F8K%5F13AB%5F10X%2DRNA%5Fumi%2Ecsv%2Egz"
        remote_csv_dataset = CsvDataset(
            "GSE100866_CBMC_8K_13AB_10X-RNA_umi.csv.gz",
            save_path='data/', compression='gzip', url=remote_url)

        # Loading a local dataset
        local_csv_dataset = CsvDataset(
            "GSE100866_CBMC_8K_13AB_10X-RNA_umi.csv.gz",
            save_path='data/', compression='gzip')

    """

    def __init__(self, filename, save_path='data/', url=None,
                 new_n_genes=600, subset_genes=None, compression=None,
                 sep=',', gene_by_cell=True, labels_file=None,
                 batch_ids_file=None):
        # Where the expression matrix lives.
        self.download_name = filename
        self.save_path = save_path
        self.url = url

        # How the expression matrix has to be parsed.
        self.compression = compression
        self.sep = sep
        self.gene_by_cell = gene_by_cell

        # Optional side files.
        self.labels_file = labels_file
        self.batch_ids_file = batch_ids_file

        (expression, gene_names, labels,
         cell_types, batch_ids) = self.download_and_preprocess()

        # Without a batch file, 0 is passed as the batch index.
        batch_indices = 0 if batch_ids is None else batch_ids
        attributes = GeneExpressionDataset.get_attributes_from_matrix(
            expression, labels=labels, batch_indices=batch_indices)
        super().__init__(
            *attributes, gene_names=gene_names, cell_types=cell_types)

        self.subsample_genes(new_n_genes, subset_genes)

    def preprocess(self):
        """Read the expression matrix and the optional side files.

        :return: Tuple ``(data, gene_names, labels, cell_types, batch_ids)``.
            ``data`` holds the values of the matrix after the optional
            transpose and ``gene_names`` its column names as strings.
            ``labels`` and ``cell_types`` are ``None`` when no
            ``labels_file`` was given, and ``batch_ids`` is ``None`` when
            no ``batch_ids_file`` was given.
        """
        print("Preprocessing dataset")

        def in_save_path(name):
            return os.path.join(self.save_path, name)

        frame = pd.read_csv(
            in_save_path(self.download_name), sep=self.sep,
            index_col=0, compression=self.compression)
        if self.gene_by_cell:
            # Transpose so that genes end up as the columns.
            frame = frame.T
        gene_names = np.array(frame.columns, dtype=str)

        labels = cell_types = batch_ids = None

        if self.labels_file is not None:
            labels = pd.read_csv(
                in_save_path(self.labels_file),
                header=0, index_col=0).values
            cell_types = np.unique(labels)

        if self.batch_ids_file is not None:
            batch_ids = pd.read_csv(
                in_save_path(self.batch_ids_file),
                header=0, index_col=0).values

        data = frame.values
        print("Finished preprocessing dataset")
        return data, gene_names, labels, cell_types, batch_ids
class BreastCancerDataset(CsvDataset):
    """``Layer2_BC_count_matrix-1.tsv`` count matrix as a :class:`CsvDataset`.

    The file is tab-separated and is read as is, without transposing
    (``gene_by_cell=False``).

    :param save_path: Save path of the dataset.
    """

    def __init__(self, save_path='data/'):
        filename = "Layer2_BC_count_matrix-1.tsv"
        remote_dir = ("http://www.spatialtranscriptomicsresearch.org/"
                      "wp-content/uploads/2016/07/")
        super().__init__(
            filename,
            save_path=save_path,
            url=remote_dir + filename,
            sep='\t',
            gene_by_cell=False)
class MouseOBDataset(CsvDataset):
    """``Rep11_MOB_count_matrix-1.tsv`` count matrix as a :class:`CsvDataset`.

    The file is tab-separated and is read as is, without transposing
    (``gene_by_cell=False``).

    :param save_path: Save path of the dataset.
    """

    def __init__(self, save_path='data/'):
        filename = "Rep11_MOB_count_matrix-1.tsv"
        remote_dir = ("http://www.spatialtranscriptomicsresearch.org/"
                      "wp-content/uploads/2016/07/")
        super().__init__(
            filename,
            save_path=save_path,
            url=remote_dir + filename,
            sep='\t',
            gene_by_cell=False)