import csv
import os
import numpy as np
from .dataset import GeneExpressionDataset
class CortexDataset(GeneExpressionDataset):
    r"""Loads cortex dataset.

    The `Mouse Cortex Cells dataset`_ contains 3005 mouse cortex cells and
    gold-standard labels for seven distinct cell types. Each cell type
    corresponds to a cluster to recover. By default every gene is kept;
    pass ``additional_genes`` to retain only the most variable genes
    (ranked by standard deviation across cells).

    :param save_path: Save path of raw data file.
    :param genes_to_keep: Gene names that must survive the variability-based
        selection (matched case-insensitively). ``None`` means no such genes.
    :param genes_fish: Gene names (e.g. from the OsmFISH dataset) that must
        survive the selection and are moved to the first columns, in the
        given order. When non-empty, cells whose summed counts over the
        first ``len(genes_fish)`` columns are not above 10 are dropped.
        ``None`` means no such genes.
    :param additional_genes: Number of most variable genes to keep. When
        ``None``, or not smaller than the number of genes available, no
        selection is performed.

    Examples::

        gene_dataset = CortexDataset()

    .. _Mouse Cortex Cells dataset:
        https://storage.googleapis.com/linnarsson-lab-www-blobs/blobs/cortex/expression_mRNA_17-Aug-2014.txt
    """

    def __init__(self, save_path='data/', genes_to_keep=None, genes_fish=None,
                 additional_genes=None):
        self.save_path = save_path
        self.download_name = 'expression.bin'
        self.url = "https://storage.googleapis.com/linnarsson-lab-www-blobs/" \
                   "blobs/cortex/expression_mRNA_17-Aug-2014.txt"

        # If we want to harmonize the dataset with the OsmFISH dataset, we
        # need to keep OsmFISH genes and order the Cortex genes accordingly.
        # ``None`` (rather than ``[]``) is the default so that instances never
        # share one mutable list object.
        self.genes_fish = [] if genes_fish is None else genes_fish

        # Specific genes we would like to keep regardless of variability.
        self.genes_to_keep = [] if genes_to_keep is None else genes_to_keep

        # Number of most variable genes we want to keep.
        self.additional_genes = additional_genes

        expression_data, labels, gene_names, cell_types = \
            self.download_and_preprocess()

        super().__init__(
            *GeneExpressionDataset.get_attributes_from_matrix(
                expression_data,
                labels=labels),
            gene_names=np.char.upper(gene_names), cell_types=cell_types)

    def preprocess(self):
        """Parse the downloaded tab-separated file into arrays.

        Also sets ``self.precise_labels`` (integer codes of the annotation
        found on row 1 of the file), aligned with the returned cells.

        :return: ``(expression_data, labels, gene_names, cell_types)`` where
            ``expression_data`` is a cells-by-genes integer matrix, ``labels``
            are integer codes into ``cell_types``, and ``gene_names`` is a
            string array with one entry per column of ``expression_data``.
        :raises ValueError: if the file is too short to contain the two
            annotation rows read from it.
        """
        print("Preprocessing Cortex data")
        rows = []
        gene_names = []
        clusters = None
        precise_clusters = None

        with open(os.path.join(self.save_path, self.download_name), 'r') \
                as csvfile:
            data_reader = csv.reader(csvfile, delimiter='\t')
            for i, row in enumerate(data_reader):
                if i == 1:
                    precise_clusters = np.array(row, dtype=str)[2:]
                elif i == 8:  # 7 + 1 in pandas
                    clusters = np.array(row, dtype=str)[2:]
                elif i >= 11:  # 10 + 1 in pandas
                    rows.append(row[1:])
                    gene_names.append(row[0])

        if clusters is None or precise_clusters is None:
            raise ValueError(
                "Cortex data file is truncated: the cluster annotation rows "
                "are missing")

        cell_types, labels = np.unique(clusters, return_inverse=True)
        _, precise_labels = np.unique(precise_clusters, return_inverse=True)

        # Each gene row still carries one leading non-cell column after the
        # gene name; once transposed it is the first row and is dropped.
        # Builtin ``int``/``str`` replace the removed ``np.int``/``np.str``.
        expression_data = np.array(rows, dtype=int).T[1:]
        gene_names = np.array(gene_names, dtype=str)

        if self.additional_genes is not None and \
                self.additional_genes < expression_data.shape[1]:
            # Columns that must be kept whatever their variability.
            wanted = {gene.lower() for gene in self.genes_fish}
            wanted.update(gene.lower() for gene in self.genes_to_keep)
            forced = [idx for idx, name in enumerate(gene_names)
                      if name.lower() in wanted]

            # Most variable genes first. Reversing before slicing (instead of
            # ``[-n:]``) makes ``additional_genes == 0`` select nothing
            # rather than everything.
            by_variability = np.std(expression_data, axis=0).argsort()[::-1]
            top_variable = by_variability[:self.additional_genes]

            # ``dtype=int`` keeps the index array integral even when
            # ``forced`` is empty.
            selected = np.unique(
                np.concatenate((top_variable, np.array(forced, dtype=int))))
            expression_data = expression_data[:, selected]
            gene_names = gene_names[selected]

        # Then we reorganize the genes so that the genes from the smFISH
        # dataset appear first. ``len`` is used rather than truthiness so
        # that array-like ``genes_fish`` values keep working.
        if len(self.genes_fish) > 0:
            expression_data, gene_names = self.reorder_genes(
                expression_data, gene_names, self.genes_fish)
            # NOTE(review): this treats the first len(genes_fish) columns as
            # the fish genes, i.e. assumes every requested gene matched
            # exactly one column -- confirm against callers.
            umi = np.sum(expression_data[:, :len(self.genes_fish)], axis=1)
            keep_cells = umi > 10
            expression_data = expression_data[keep_cells, :]
            labels = labels[keep_cells]
            # Keep the fine-grained labels aligned with the retained cells.
            precise_labels = precise_labels[keep_cells]

        self.precise_labels = precise_labels

        print("Finished preprocessing Cortex data")
        return expression_data, labels, gene_names, cell_types

    @staticmethod
    def reorder_genes(x, genes, first_genes):
        """Move the genes listed in ``first_genes`` to the first columns.

        Matching is case-insensitive. Matched genes come first, in the order
        of ``first_genes``; all remaining genes follow in their original
        order. Entries of ``first_genes`` absent from ``genes`` are ignored.

        :param x: 2-D numpy array whose columns correspond to ``genes``.
        :param genes: numpy array of gene names, one per column of ``x``.
        :param first_genes: Sequence of gene names to place first.
        :return: ``(x_reordered, genes_reordered)``.
        """
        # Lower-cased name -> column indices, so each lookup is O(1).
        positions = {}
        for idx, name in enumerate(genes):
            positions.setdefault(name.lower(), []).append(idx)

        new_order_first = [
            idx
            for wanted in first_genes
            for idx in positions.get(wanted.lower(), [])
        ]
        already_placed = set(new_order_first)
        new_order_second = [
            idx for idx in range(len(genes)) if idx not in already_placed]
        new_order = new_order_first + new_order_second
        return x[:, new_order], genes[new_order]