From d19b8cc2bed1b362a11b0c2cb8a2116515a6ed43 Mon Sep 17 00:00:00 2001
From: jfnavarro <jc.fernandez.navarro@gmail.com>
Date: Mon, 4 Sep 2017 16:57:47 +0200
Subject: [PATCH] Fixed some bugs

---
 CHANGELOG                        |  5 ++-
 scripts/differential_analysis.py | 17 +--------
 scripts/unsupervised.py          | 23 ++++++------
 setup.py                         |  2 +-
 stanalysis/normalization.py      | 10 +++---
 stanalysis/preprocessing.py      | 62 ++++++++++++++++++++------------
 6 files changed, 62 insertions(+), 57 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index c204888..9ec0d2f 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -29,4 +29,7 @@ Version 0.3.3
 
 Version 0.4.0
 * Removed deprecated scripts
-* Adjusted some formats and parameters
\ No newline at end of file
+* Adjusted some formats and parameters
+
+Version 0.4.1
+* Fixed a bug in the noise filtering function
\ No newline at end of file
diff --git a/scripts/differential_analysis.py b/scripts/differential_analysis.py
index 92c4aeb..067bf3c 100644
--- a/scripts/differential_analysis.py
+++ b/scripts/differential_analysis.py
@@ -39,21 +39,6 @@
 from rpy2.robjects import pandas2ri, r, numpy2ri
 robjects.conversion.py2ri = numpy2ri
 import matplotlib.pyplot as plt
-
-def get_classes_coordinate(class_file):
-    """ Helper function
-    to get a dictionary of spot -> class 
-    from a tab delimited file
-    """
-    barcodes_classes = dict()
-    with open(class_file, "r") as filehandler:
-        for line in filehandler.readlines():
-            tokens = line.split()
-            assert(len(tokens) == 2)
-            spot = tokens[1]
-            class_label = tokens[0]
-            barcodes_classes[spot] = class_label
-    return barcodes_classes
    
 def dea(counts, conds, size_factors=None):
     """Makes a call to DESeq2 to
@@ -114,7 +99,7 @@ def main(counts_table_files, data_classes,
             for line in filehandler.readlines():
                 tokens = line.split()
                 assert(len(tokens) == 2)
-                spot_classes["{}_{}".format(i,tokens[1])] = str(tokens[0])  
+                spot_classes["{}_{}".format(i,tokens[0])] = str(tokens[1])  
     
     # Remove noise
     counts = remove_noise(counts, 0.01, 0.01, 1)
diff --git a/scripts/unsupervised.py b/scripts/unsupervised.py
index 2ae7bc0..debcc29 100644
--- a/scripts/unsupervised.py
+++ b/scripts/unsupervised.py
@@ -126,7 +126,7 @@ def main(counts_table_files,
     counts = aggregate_datatasets(counts_table_files)
 
     # Remove noisy spots and genes (Spots are rows and genes are columns)
-    counts = remove_noise(counts, num_exp_genes, num_exp_spots)
+    counts = remove_noise(counts, num_exp_genes / 100.0, num_exp_spots / 100.0)
     
     if num_clusters is None:
         num_clusters = computeNClusters(counts)
@@ -137,10 +137,9 @@ def main(counts_table_files,
     center_size_factors = not use_adjusted_log
     norm_counts = normalize_data(counts, normalization, 
                                  center=center_size_factors, adjusted_log=use_adjusted_log)
-        
+
     # Keep top genes (variance or expressed)
-    num_genes_keep = num_genes_keep / 100.0
-    norm_counts = keep_top_genes(norm_counts, num_genes_keep, criteria=top_genes_criteria)
+    norm_counts = keep_top_genes(norm_counts, num_genes_keep / 100.0, criteria=top_genes_criteria)
        
     if use_log_scale:
         print "Using pseudo-log counts log2(counts + 1)"
@@ -196,7 +195,7 @@ def main(counts_table_files,
         y = float(tokens[1])
         x = float(tokens[0].split("_")[1])
         index = int(tokens[0].split("_")[0])
-        file_writers[index].write("{0}\t{1}\n".format("{}x{}".format(x,y)), labels[i])
+        file_writers[index].write("{0}\t{1}\n".format("{}x{}".format(x,y), labels[i]))
         
     # Compute a color_label based on the RGB representation of the 3D dimensionality reduced
     labels_colors = list()
@@ -315,12 +314,12 @@ def main(counts_table_files,
     parser.add_argument("--num-clusters", default=None, metavar="[INT]", type=int, choices=range(2, 10),
                         help="The number of clusters/regions expected to be found.\n" \
                         "If not given the number of clusters will be computed.")
-    parser.add_argument("--num-exp-genes", default=5, metavar="[INT]", type=int, choices=range(1, 100),
-                        help="The number of expressed genes (!= 0) a spot\n" \
-                        "must have to be kept (default: %(default)s)")
-    parser.add_argument("--num-exp-spots", default=5, metavar="[INT]", type=int, choices=range(1, 100),
-                        help="The number of expressed spots (!= 0) a gene " \
-                        "must have to be kept (default: %(default)s)")
+    parser.add_argument("--num-exp-genes", default=10, metavar="[INT]", type=int, choices=range(0, 100),
+                        help="The percentage of number of expressed genes (!= 0) a spot\n" \
+                        "must have to be kept from the distribution of all expressed genes (default: %(default)s)")
+    parser.add_argument("--num-exp-spots", default=1, metavar="[INT]", type=int, choices=range(0, 100),
+                        help="The percentage of number of expressed spots (!= 0) a gene " \
+                        "must have to be kept from the total number of spots (default: %(default)s)")
     parser.add_argument("--num-genes-keep", default=20, metavar="[INT]", type=int, choices=range(0, 100),
                         help="The percentage of genes to discard from the distribution of all the genes\n" \
                         "across all the spots using the variance or the top expression " \
@@ -356,7 +355,7 @@ def main(counts_table_files,
                         "the dimensionality reduction (Variance or TopRanked) (default: %(default)s)")
     parser.add_argument("--use-adjusted-log", action="store_true", default=False,
                         help="Use adjusted log normalized counts (R Scater::normalized()) "
-                        "in the dimensionality reduction step")
+                        "in the dimensionality reduction step (recommended with SCRAN normalization)")
     parser.add_argument("--outdir", default=None, help="Path to output dir")
     args = parser.parse_args()
     main(args.counts_table_files, 
diff --git a/setup.py b/setup.py
index dab23cf..021258f 100755
--- a/setup.py
+++ b/setup.py
@@ -16,7 +16,7 @@
 
 setup(
   name = 'stanalysis',
-  version = "0.4.0",
+  version = "0.4.1",
   description = __doc__.split("\n", 1)[0],
   long_description = long_description,
   keywords = 'rna-seq analysis machine_learning spatial transcriptomics toolkit',
diff --git a/stanalysis/normalization.py b/stanalysis/normalization.py
index 1808657..272c209 100644
--- a/stanalysis/normalization.py
+++ b/stanalysis/normalization.py
@@ -30,10 +30,11 @@ def computeTMMFactors(counts):
     r_counts = pandas2ri.py2ri(counts)
     edger = RimportLibrary("edgeR")
     as_matrix = r["as.matrix"]
-    dds = edger.calcNormFactors(as_matrix(r_counts), method="TMM") * r.colSums(counts)
+    dds = edger.calcNormFactors(as_matrix(r_counts), method="TMM")
     pandas_sf = pandas2ri.ri2py(dds)
+    pandas_cm = pandas2ri.ri2py(r.colSums(counts))
     pandas2ri.deactivate()
-    return pandas_sf
+    return pandas_sf * pandas_cm
 
 def computeRLEFactors(counts):
     """ Compute normalization size factors
@@ -45,10 +46,11 @@ def computeRLEFactors(counts):
     r_counts = pandas2ri.py2ri(counts)
     edger = RimportLibrary("edgeR")
     as_matrix = r["as.matrix"]
-    dds = edger.calcNormFactors(as_matrix(r_counts), method="RLE") * r.colSums(counts)
+    dds = edger.calcNormFactors(as_matrix(r_counts), method="RLE")
     pandas_sf = pandas2ri.ri2py(dds)
+    pandas_cm = pandas2ri.ri2py(r.colSums(counts))
     pandas2ri.deactivate()
-    return pandas_sf
+    return pandas_sf * pandas_cm
 
 def computeSumFactors(counts):
     """ Compute normalization factors
diff --git a/stanalysis/preprocessing.py b/stanalysis/preprocessing.py
index 7dfd3f4..b6b5420 100644
--- a/stanalysis/preprocessing.py
+++ b/stanalysis/preprocessing.py
@@ -5,6 +5,7 @@
 """
 import numpy as np
 import pandas as pd
+import math
 import os
 from stanalysis.normalization import *
 
@@ -41,18 +42,18 @@ def aggregate_datatasets(counts_table_files, plot_hist=False):
     counts.fillna(0.0, inplace=True)
     return counts
   
-def remove_noise(counts, num_exp_genes=5, num_exp_spots=5, min_expression=1):
+def remove_noise(counts, num_exp_genes=0.01, num_exp_spots=0.01, min_expression=1):
     """This functions remove noisy genes and spots 
     for a given data frame (Genes as columns and spots as rows).
-    - The noisy spots are removed by counting how many expressed genes > 0
-    a spot has. 
-    - The noisy genes are removed by counting how many expressed spots >= min_expression
-    a gene has. 
-    :param counts: a Pandas data frame with the counts (genes as columns)
-    :param num_exp_genes: an integer representing
-    the total number of genes that a spot must have with a count bigger
-    than 0 in order to be kept
-    :param num_exp_spots: an integer representing
+    - The noisy spots are removed so to keep a percentage
+    of the total distribution of spots whose gene counts are not 0
+    The percentage is given as a parameter.
+    - The noisy genes are removed so every gene that is expressed
+    in less than 1% of the total spots. Expressed with a count >= 2. 
+    :param counts: a Pandas data frame with the counts
+    :param num_exp_genes: a float from 0-1 representing the % of 
+    the distribution of expressed genes a spot must have to be kept
+    :param num_exp_spots: a float from 0-1 representing the % of 
     the total number of spots that a gene must have with a count bigger
     than the parameter min_expression in order to be kept
     :param min_expression: the minimum expression for a gene to be
@@ -62,20 +63,23 @@ def remove_noise(counts, num_exp_genes=5, num_exp_spots=5, min_expression=1):
     
     # How many spots do we keep based on the number of genes expressed?
     num_spots = len(counts.index)
-    print "Removing spots that are expressed in less than {} genes".format(nnum_exp_genes)
-    counts = counts[(counts != 0).sum(axis=1) >= num_exp_genes]
+    num_genes = len(counts.columns)
+    min_genes_spot_exp = round((counts != 0).sum(axis=1).quantile(num_exp_genes))
+    print "Number of expressed genes a spot must have to be kept " \
+    "({}% of total expressed genes) {}".format(num_exp_genes,min_genes_spot_exp)
+    counts = counts[(counts != 0).sum(axis=1) >= min_genes_spot_exp]
     print "Dropped {} spots".format(num_spots - len(counts.index))
           
     # Spots are columns and genes are rows
     counts = counts.transpose()
   
     # Remove noisy genes
-    num_genes = len(counts.columns)
-    print "Removing genes that are expressed in less than {} spots".format(num_exp_spots)
-    counts = counts[(counts >= min_expression).sum(axis=1) >= num_exp_spots]
+    min_features_gene = round(len(counts.columns) * num_exp_spots) 
+    print "Removing genes that are expressed in less than {} " \
+    "spots with a count of at least {}".format(min_features_gene, min_expression)
+    counts = counts[(counts >= min_expression).sum(axis=1) >= min_features_gene]
     print "Dropped {} genes".format(num_genes - len(counts.index))
     
-    # return the filtered matrix
     return counts.transpose()
     
 def keep_top_genes(counts, num_genes_keep, criteria="Variance"):
@@ -86,22 +90,29 @@ def keep_top_genes(counts, num_genes_keep, criteria="Variance"):
     :param counts: a Pandas data frame with the counts
     :param num_genes_keep: the % (1-100) of genes to keep
     :param criteria: the criteria used to select ("Variance or "TopRanked")
-    :return: a new Pandas data frame with only the top raned genes. 
+    :return: a new Pandas data frame with only the top ranked genes. 
     """
     # Spots as columns and genes as rows
     counts = counts.transpose()
     # Keep only the genes with higher over-all variance
     num_genes = len(counts.index)
+    print "Removing {}% of genes based on the {}".format(num_genes_keep * 100, criteria)
     if criteria == "Variance":
         min_genes_spot_var = counts.var(axis=1).quantile(num_genes_keep)
-        print "Min normalized variance a gene must have over all spots " \
-        "to be kept ({0}% of total) {1}".format(num_genes_keep, min_genes_spot_var)
-        counts = counts[counts.var(axis=1) >= min_genes_spot_var]
+        if math.isnan(min_genes_spot_var):
+            print "Computed variance is NaN! Check your normalization factors.."
+        else:
+            print "Min normalized variance a gene must have over all spots " \
+            "to be kept ({0}% of total) {1}".format(num_genes_keep, min_genes_spot_var)
+            counts = counts[counts.var(axis=1) >= min_genes_spot_var]
     elif criteria == "TopRankded":
         min_genes_spot_sum = counts.sum(axis=1).quantile(num_genes_keep)
-        print "Min normalized total count a gene must have over all spots " \
-        "to be kept ({0}% of total) {1}".format(num_genes_keep, min_genes_spot_sum)
-        counts = counts[counts.sum(axis=1) >= min_genes_spot_var]
+        if math.isnan(min_genes_spot_var):
+            print "Computed sum is NaN! Check your normalization factors.."
+        else:
+            print "Min normalized total count a gene must have over all spots " \
+            "to be kept ({0}% of total) {1}".format(num_genes_keep, min_genes_spot_sum)
+            counts = counts[counts.sum(axis=1) >= min_genes_spot_var]
     else:
         raise RunTimeError("Error, incorrect criteria method\n")  
     print "Dropped {} genes".format(num_genes - len(counts.index))    
@@ -134,6 +145,9 @@ def compute_size_factors(counts, normalization):
     if np.isnan(size_factors).any() or np.isinf(size_factors).any():
         print "Warning: Computed size factors contained NaN or Inf. The data will not be normalized!"
         size_factors = np.ones(len(size_factors))
+    elif (size_factors <= 0.0).any():
+        print "Warning: Computed size factors contained zeroes or negative values.\nThey will be replaced by epsilon!"
+        size_factors[size_factors <= 0.0] = np.finfo(np.float32).eps      
     return size_factors
 
 def normalize_data(counts, normalization, center=False, adjusted_log=False):
@@ -150,6 +164,8 @@ def normalize_data(counts, normalization, center=False, adjusted_log=False):
     """
     # Compute the size factors
     size_factors = compute_size_factors(counts, normalization)
+    if np.all(size_factors == 1.0):
+        return counts
     # Spots as columns and genes as rows
     counts = counts.transpose()
     # Center and/or adjust log the size_factors and counts