Source code for neuralee.dataset.csv

from .dataset import GeneExpressionDataset
import pandas as pd
import numpy as np
import os


class CsvDataset(GeneExpressionDataset):
    r""" Loads a `.csv` file.

    :param filename: Name of the `.csv` file.
    :param save_path: Save path of the dataset.
    :param url: Url of the remote dataset.
    :param new_n_genes: Number of subsampled genes.
    :param subset_genes: List of genes for subsampling.
    :param compression: For on-the-fly decompression of on-disk data.
     Forwarded unchanged to ``pandas.read_csv``. If 'infer' and the file
     path is path-like, then detect compression from the following
     extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no
     decompression). If using 'zip', the ZIP file must contain only one
     data file to be read in.
    :param sep: Field delimiter of the expression file, forwarded to
     ``pandas.read_csv``.
    :param gene_by_cell: Whether the file is laid out as genes by cells.
     If True the matrix is transposed after reading, so that the columns
     of the loaded table are the genes.
    :param labels_file: Name of an optional `.csv` file with labels,
     located in ``save_path``. The first row is the header and the first
     column is used as the index; the remaining values are the labels.
    :param batch_ids_file: Name of the `.csv` file with batch indices.
     File contains two columns. The first is used as the index
     (presumably the cell identifiers - TODO confirm) and the second
     holds batch indices - type int. The first row of the file is header.

    Examples::

        # Loading a remote dataset
        remote_url = (
            "https://www.ncbi.nlm.nih.gov/geo/download/"
            "?acc=GSE100866&format=file&file="
            "GSE100866%5FCBMC%5F8K%5F13AB%5F10X%2DRNA%5Fumi%2Ecsv%2Egz")
        remote_csv_dataset = CsvDataset(
            "GSE100866_CBMC_8K_13AB_10X-RNA_umi.csv.gz", save_path='data/',
            compression='gzip', url=remote_url)
        # Loading a local dataset
        local_csv_dataset = CsvDataset(
            "GSE100866_CBMC_8K_13AB_10X-RNA_umi.csv.gz",
            save_path='data/', compression='gzip')

    """

    def __init__(self, filename, save_path='data/', url=None, new_n_genes=600,
                 subset_genes=None, compression=None, sep=',',
                 gene_by_cell=True, labels_file=None, batch_ids_file=None):
        # Name of the expression file inside ``save_path``.
        self.download_name = filename
        self.save_path = save_path
        self.url = url
        self.compression = compression
        self.sep = sep
        # Whether the original dataset is genes by cells
        self.gene_by_cell = gene_by_cell
        self.labels_file = labels_file
        self.batch_ids_file = batch_ids_file

        # ``download_and_preprocess`` is not defined in this class; it is
        # expected to come from the base class and to return the same tuple
        # as :meth:`preprocess`.
        data, gene_names, labels, cell_types, batch_ids = \
            self.download_and_preprocess()

        # Without a batch file every cell gets batch index 0.
        super().__init__(
            *GeneExpressionDataset.get_attributes_from_matrix(
                data, labels=labels,
                batch_indices=batch_ids if batch_ids is not None else 0),
            gene_names=gene_names, cell_types=cell_types)

        self.subsample_genes(new_n_genes, subset_genes)

    def preprocess(self):
        r""" Reads the expression file and the optional side files.

        :return: Tuple ``(data, gene_names, labels, cell_types, batch_ids)``.
         ``data`` is the expression table as an array, ``gene_names`` are
         its column names as strings. ``labels``, ``cell_types`` and
         ``batch_ids`` are None when the corresponding file is not given.
        """
        print("Preprocessing dataset")

        data = pd.read_csv(
            os.path.join(self.save_path, self.download_name),
            sep=self.sep, index_col=0, compression=self.compression)
        if self.gene_by_cell:
            # Flip genes-by-cells input so that genes end up as columns.
            data = data.T

        gene_names = np.array(data.columns, dtype=str)
        labels, cell_types, batch_ids = None, None, None
        if self.labels_file is not None:
            labels = self._read_side_file(self.labels_file)
            # Distinct label values, sorted by ``np.unique``.
            cell_types = np.unique(labels)

        if self.batch_ids_file is not None:
            batch_ids = self._read_side_file(self.batch_ids_file)

        data = data.values
        print("Finished preprocessing dataset")
        return data, gene_names, labels, cell_types, batch_ids

    def _read_side_file(self, name):
        r""" Reads a comma-separated side file from ``save_path``.

        The first row is the header and the first column the index; the
        remaining values are returned as an array.
        """
        table = pd.read_csv(
            os.path.join(self.save_path, name), header=0, index_col=0)
        return table.values


class BreastCancerDataset(CsvDataset):
    r""" Loads the ``Layer2_BC_count_matrix-1.tsv`` count matrix.

    The file is tab-separated and is read without transposing
    (``gene_by_cell=False``).

    :param save_path: Save path of the dataset.
    """

    def __init__(self, save_path='data/'):
        file_name = "Layer2_BC_count_matrix-1.tsv"
        remote_dir = ("http://www.spatialtranscriptomicsresearch.org/"
                      "wp-content/uploads/2016/07/")
        super().__init__(
            file_name,
            save_path=save_path,
            url=remote_dir + file_name,
            sep='\t',
            gene_by_cell=False)


class MouseOBDataset(CsvDataset):
    r""" Loads the ``Rep11_MOB_count_matrix-1.tsv`` count matrix.

    The file is tab-separated and is read without transposing
    (``gene_by_cell=False``).

    :param save_path: Save path of the dataset.
    """

    def __init__(self, save_path='data/'):
        file_name = "Rep11_MOB_count_matrix-1.tsv"
        remote_dir = ("http://www.spatialtranscriptomicsresearch.org/"
                      "wp-content/uploads/2016/07/")
        super().__init__(
            file_name,
            save_path=save_path,
            url=remote_dir + file_name,
            sep='\t',
            gene_by_cell=False)