From d19b8cc2bed1b362a11b0c2cb8a2116515a6ed43 Mon Sep 17 00:00:00 2001 From: jfnavarro Date: Mon, 4 Sep 2017 16:57:47 +0200 Subject: [PATCH] Fixed some bugs --- CHANGELOG | 5 ++- scripts/differential_analysis.py | 17 +-------- scripts/unsupervised.py | 23 ++++++------ setup.py | 2 +- stanalysis/normalization.py | 10 +++--- stanalysis/preprocessing.py | 62 ++++++++++++++++++++------------ 6 files changed, 62 insertions(+), 57 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index c204888..9ec0d2f 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -29,4 +29,7 @@ Version 0.3.3 Version 0.4.0 * Removed deprecated scripts -* Adjusted some formats and parameters \ No newline at end of file +* Adjusted some formats and parameters + +Version 0.4.1 +* Fixed a bug in the noise filtering function \ No newline at end of file diff --git a/scripts/differential_analysis.py b/scripts/differential_analysis.py index 92c4aeb..067bf3c 100644 --- a/scripts/differential_analysis.py +++ b/scripts/differential_analysis.py @@ -39,21 +39,6 @@ from rpy2.robjects import pandas2ri, r, numpy2ri robjects.conversion.py2ri = numpy2ri import matplotlib.pyplot as plt - -def get_classes_coordinate(class_file): - """ Helper function - to get a dictionary of spot -> class - from a tab delimited file - """ - barcodes_classes = dict() - with open(class_file, "r") as filehandler: - for line in filehandler.readlines(): - tokens = line.split() - assert(len(tokens) == 2) - spot = tokens[1] - class_label = tokens[0] - barcodes_classes[spot] = class_label - return barcodes_classes def dea(counts, conds, size_factors=None): """Makes a call to DESeq2 to @@ -114,7 +99,7 @@ def main(counts_table_files, data_classes, for line in filehandler.readlines(): tokens = line.split() assert(len(tokens) == 2) - spot_classes["{}_{}".format(i,tokens[1])] = str(tokens[0]) + spot_classes["{}_{}".format(i,tokens[0])] = str(tokens[1]) # Remove noise counts = remove_noise(counts, 0.01, 0.01, 1) diff --git a/scripts/unsupervised.py b/scripts/unsupervised.py index 2ae7bc0..debcc29 100644 --- a/scripts/unsupervised.py +++ b/scripts/unsupervised.py @@ -126,7 +126,7 @@ def main(counts_table_files, counts = aggregate_datatasets(counts_table_files) # Remove noisy spots and genes (Spots are rows and genes are columns) - counts = remove_noise(counts, num_exp_genes, num_exp_spots) + counts = remove_noise(counts, num_exp_genes / 100.0, num_exp_spots / 100.0) if num_clusters is None: num_clusters = computeNClusters(counts) @@ -137,10 +137,9 @@ def main(counts_table_files, center_size_factors = not use_adjusted_log norm_counts = normalize_data(counts, normalization, center=center_size_factors, adjusted_log=use_adjusted_log) - + # Keep top genes (variance or expressed) - num_genes_keep = num_genes_keep / 100.0 - norm_counts = keep_top_genes(norm_counts, num_genes_keep, criteria=top_genes_criteria) + norm_counts = keep_top_genes(norm_counts, num_genes_keep / 100.0, criteria=top_genes_criteria) if use_log_scale: print "Using pseudo-log counts log2(counts + 1)" @@ -196,7 +195,7 @@ def main(counts_table_files, y = float(tokens[1]) x = float(tokens[0].split("_")[1]) index = int(tokens[0].split("_")[0]) - file_writers[index].write("{0}\t{1}\n".format("{}x{}".format(x,y)), labels[i]) + file_writers[index].write("{0}\t{1}\n".format("{}x{}".format(x,y), labels[i])) # Compute a color_label based on the RGB representation of the 3D dimensionality reduced labels_colors = list() @@ -315,12 +314,12 @@ def main(counts_table_files, parser.add_argument("--num-clusters", default=None, metavar="[INT]", type=int, choices=range(2, 10), help="The number of clusters/regions expected to be found.\n" \ "If not given the number of clusters will be computed.") - parser.add_argument("--num-exp-genes", default=5, metavar="[INT]", type=int, choices=range(1, 100), - help="The number of expressed genes (!= 0) a spot\n" \ - "must have to be kept (default: %(default)s)") - parser.add_argument("--num-exp-spots", default=5, metavar="[INT]", type=int, choices=range(1, 100), - help="The number of expressed spots (!= 0) a gene " \ - "must have to be kept (default: %(default)s)") + parser.add_argument("--num-exp-genes", default=10, metavar="[INT]", type=int, choices=range(0, 100), + help="The percentage of number of expressed genes (!= 0) a spot\n" \ + "must have to be kept from the distribution of all expressed genes (default: %(default)s)") + parser.add_argument("--num-exp-spots", default=1, metavar="[INT]", type=int, choices=range(0, 100), + help="The percentage of number of expressed spots (!= 0) a gene " \ + "must have to be kept from the total number of spots (default: %(default)s)") parser.add_argument("--num-genes-keep", default=20, metavar="[INT]", type=int, choices=range(0, 100), help="The percentage of genes to discard from the distribution of all the genes\n" \ "across all the spots using the variance or the top expression " \ @@ -356,7 +355,7 @@ def main(counts_table_files, "the dimensionality reduction (Variance or TopRanked) (default: %(default)s)") parser.add_argument("--use-adjusted-log", action="store_true", default=False, help="Use adjusted log normalized counts (R Scater::normalized()) " - "in the dimensionality reduction step") + "in the dimensionality reduction step (recommended with SCRAN normalization)") parser.add_argument("--outdir", default=None, help="Path to output dir") args = parser.parse_args() main(args.counts_table_files, diff --git a/setup.py b/setup.py index dab23cf..021258f 100755 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ setup( name = 'stanalysis', - version = "0.4.0", + version = "0.4.1", description = __doc__.split("\n", 1)[0], long_description = long_description, keywords = 'rna-seq analysis machine_learning spatial transcriptomics toolkit', diff --git a/stanalysis/normalization.py b/stanalysis/normalization.py index 1808657..272c209 100644 --- a/stanalysis/normalization.py +++ b/stanalysis/normalization.py @@ -30,10 +30,11 @@ def computeTMMFactors(counts): r_counts = pandas2ri.py2ri(counts) edger = RimportLibrary("edgeR") as_matrix = r["as.matrix"] - dds = edger.calcNormFactors(as_matrix(r_counts), method="TMM") * r.colSums(counts) + dds = edger.calcNormFactors(as_matrix(r_counts), method="TMM") pandas_sf = pandas2ri.ri2py(dds) + pandas_cm = pandas2ri.ri2py(r.colSums(counts)) pandas2ri.deactivate() - return pandas_sf + return pandas_sf * pandas_cm def computeRLEFactors(counts): """ Compute normalization size factors @@ -45,10 +46,11 @@ def computeRLEFactors(counts): r_counts = pandas2ri.py2ri(counts) edger = RimportLibrary("edgeR") as_matrix = r["as.matrix"] - dds = edger.calcNormFactors(as_matrix(r_counts), method="RLE") * r.colSums(counts) + dds = edger.calcNormFactors(as_matrix(r_counts), method="RLE") pandas_sf = pandas2ri.ri2py(dds) + pandas_cm = pandas2ri.ri2py(r.colSums(counts)) pandas2ri.deactivate() - return pandas_sf + return pandas_sf * pandas_cm def computeSumFactors(counts): """ Compute normalization factors diff --git a/stanalysis/preprocessing.py b/stanalysis/preprocessing.py index 7dfd3f4..b6b5420 100644 --- a/stanalysis/preprocessing.py +++ b/stanalysis/preprocessing.py @@ -5,6 +5,7 @@ """ import numpy as np import pandas as pd +import math import os from stanalysis.normalization import * @@ -41,18 +42,18 @@ def aggregate_datatasets(counts_table_files, plot_hist=False): counts.fillna(0.0, inplace=True) return counts -def remove_noise(counts, num_exp_genes=5, num_exp_spots=5, min_expression=1): +def remove_noise(counts, num_exp_genes=0.01, num_exp_spots=0.01, min_expression=1): """This functions remove noisy genes and spots for a given data frame (Genes as columns and spots as rows). - - The noisy spots are removed by counting how many expressed genes > 0 - a spot has. - - The noisy genes are removed by counting how many expressed spots >= min_expression - a gene has. - :param counts: a Pandas data frame with the counts (genes as columns) - :param num_exp_genes: an integer representing - the total number of genes that a spot must have with a count bigger - than 0 in order to be kept - :param num_exp_spots: an integer representing + - The noisy spots are removed so to keep a percentage + of the total distribution of spots whose gene counts are not 0 + The percentage is given as a parameter. + - The noisy genes are removed so every gene that is expressed + in less than 1% of the total spots. Expressed with a count >= 2. + :param counts: a Pandas data frame with the counts + :param num_exp_genes: a float from 0-1 representing the % of + the distribution of expressed genes a spot must have to be kept + :param num_exp_spots: a float from 0-1 representing the % of the total number of spots that a gene must have with a count bigger than the parameter min_expression in order to be kept :param min_expression: the minimum expression for a gene to be @@ -62,20 +63,23 @@ def remove_noise(counts, num_exp_genes=5, num_exp_spots=5, min_expression=1): # How many spots do we keep based on the number of genes expressed? num_spots = len(counts.index) - print "Removing spots that are expressed in less than {} genes".format(nnum_exp_genes) - counts = counts[(counts != 0).sum(axis=1) >= num_exp_genes] + num_genes = len(counts.columns) + min_genes_spot_exp = round((counts != 0).sum(axis=1).quantile(num_exp_genes)) + print "Number of expressed genes a spot must have to be kept " \ + "({}% of total expressed genes) {}".format(num_exp_genes,min_genes_spot_exp) + counts = counts[(counts != 0).sum(axis=1) >= min_genes_spot_exp] print "Dropped {} spots".format(num_spots - len(counts.index)) # Spots are columns and genes are rows counts = counts.transpose() # Remove noisy genes - num_genes = len(counts.columns) - print "Removing genes that are expressed in less than {} spots".format(num_exp_spots) - counts = counts[(counts >= min_expression).sum(axis=1) >= num_exp_spots] + min_features_gene = round(len(counts.columns) * num_exp_spots) + print "Removing genes that are expressed in less than {} " \ + "spots with a count of at least {}".format(min_features_gene, min_expression) + counts = counts[(counts >= min_expression).sum(axis=1) >= min_features_gene] print "Dropped {} genes".format(num_genes - len(counts.index)) - # return the filtered matrix return counts.transpose() def keep_top_genes(counts, num_genes_keep, criteria="Variance"): @@ -86,22 +90,29 @@ def keep_top_genes(counts, num_genes_keep, criteria="Variance"): :param counts: a Pandas data frame with the counts :param num_genes_keep: the % (1-100) of genes to keep :param criteria: the criteria used to select ("Variance or "TopRanked") - :return: a new Pandas data frame with only the top raned genes. + :return: a new Pandas data frame with only the top ranked genes. """ # Spots as columns and genes as rows counts = counts.transpose() # Keep only the genes with higher over-all variance num_genes = len(counts.index) + print "Removing {}% of genes based on the {}".format(num_genes_keep * 100, criteria) if criteria == "Variance": min_genes_spot_var = counts.var(axis=1).quantile(num_genes_keep) - print "Min normalized variance a gene must have over all spots " \ - "to be kept ({0}% of total) {1}".format(num_genes_keep, min_genes_spot_var) - counts = counts[counts.var(axis=1) >= min_genes_spot_var] + if math.isnan(min_genes_spot_var): + print "Computed variance is NaN! Check your normalization factors.." + else: + print "Min normalized variance a gene must have over all spots " \ + "to be kept ({0}% of total) {1}".format(num_genes_keep, min_genes_spot_var) + counts = counts[counts.var(axis=1) >= min_genes_spot_var] elif criteria == "TopRankded": min_genes_spot_sum = counts.sum(axis=1).quantile(num_genes_keep) - print "Min normalized total count a gene must have over all spots " \ - "to be kept ({0}% of total) {1}".format(num_genes_keep, min_genes_spot_sum) - counts = counts[counts.sum(axis=1) >= min_genes_spot_var] + if math.isnan(min_genes_spot_var): + print "Computed sum is NaN! Check your normalization factors.." + else: + print "Min normalized total count a gene must have over all spots " \ + "to be kept ({0}% of total) {1}".format(num_genes_keep, min_genes_spot_sum) + counts = counts[counts.sum(axis=1) >= min_genes_spot_var] else: raise RunTimeError("Error, incorrect criteria method\n") print "Dropped {} genes".format(num_genes - len(counts.index)) @@ -134,6 +145,9 @@ def compute_size_factors(counts, normalization): if np.isnan(size_factors).any() or np.isinf(size_factors).any(): print "Warning: Computed size factors contained NaN or Inf. The data will not be normalized!" size_factors = np.ones(len(size_factors)) + elif (size_factors <= 0.0).any(): + print "Warning: Computed size factors contained zeroes or negative values.\nThey will be replaced by epsilon!" + size_factors[size_factors <= 0.0] = np.finfo(np.float32).eps return size_factors def normalize_data(counts, normalization, center=False, adjusted_log=False): @@ -150,6 +164,8 @@ def normalize_data(counts, normalization, center=False, adjusted_log=False): """ # Compute the size factors size_factors = compute_size_factors(counts, normalization) + if np.all(size_factors == 1.0): + return counts # Spots as columns and genes as rows counts = counts.transpose() # Center and/or adjust log the size_factors and counts