Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,7 @@ Version 0.3.3

Version 0.4.0
* Removed deprecated scripts
* Adjusted some formats and parameters
* Adjusted some formats and parameters

Version 0.4.1
* Fixed a bug in the noise filtering function
17 changes: 1 addition & 16 deletions scripts/differential_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,21 +39,6 @@
from rpy2.robjects import pandas2ri, r, numpy2ri
robjects.conversion.py2ri = numpy2ri
import matplotlib.pyplot as plt

def get_classes_coordinate(class_file):
    """Build a dictionary mapping spot -> class label from a tab-delimited file.

    Each non-empty line is expected to contain exactly two whitespace-separated
    tokens in the order "<class_label> <spot>" (NOTE(review): the reverse order
    of what the fixed code in main() uses — confirm the file format against the
    caller before reusing this helper).

    :param class_file: path to a tab/whitespace delimited text file
    :return: dict mapping spot identifier (second column) to class label
             (first column)
    :raises ValueError: if a non-empty line does not have exactly two tokens
    """
    barcodes_classes = dict()
    with open(class_file, "r") as filehandler:
        # Iterate the file lazily instead of materializing it with readlines()
        for line_number, line in enumerate(filehandler, 1):
            tokens = line.split()
            # Tolerate blank lines (e.g. a trailing newline at EOF)
            if not tokens:
                continue
            # Explicit validation: assert() would be silently stripped
            # when running under "python -O"
            if len(tokens) != 2:
                raise ValueError(
                    "Invalid line {} in {}: expected 2 columns, got {}".format(
                        line_number, class_file, len(tokens)))
            class_label = tokens[0]
            spot = tokens[1]
            barcodes_classes[spot] = class_label
    return barcodes_classes

def dea(counts, conds, size_factors=None):
"""Makes a call to DESeq2 to
Expand Down Expand Up @@ -114,7 +99,7 @@ def main(counts_table_files, data_classes,
for line in filehandler.readlines():
tokens = line.split()
assert(len(tokens) == 2)
spot_classes["{}_{}".format(i,tokens[1])] = str(tokens[0])
spot_classes["{}_{}".format(i,tokens[0])] = str(tokens[1])

# Remove noise
counts = remove_noise(counts, 0.01, 0.01, 1)
Expand Down
23 changes: 11 additions & 12 deletions scripts/unsupervised.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def main(counts_table_files,
counts = aggregate_datatasets(counts_table_files)

# Remove noisy spots and genes (Spots are rows and genes are columns)
counts = remove_noise(counts, num_exp_genes, num_exp_spots)
counts = remove_noise(counts, num_exp_genes / 100.0, num_exp_spots / 100.0)

if num_clusters is None:
num_clusters = computeNClusters(counts)
Expand All @@ -137,10 +137,9 @@ def main(counts_table_files,
center_size_factors = not use_adjusted_log
norm_counts = normalize_data(counts, normalization,
center=center_size_factors, adjusted_log=use_adjusted_log)

# Keep top genes (variance or expressed)
num_genes_keep = num_genes_keep / 100.0
norm_counts = keep_top_genes(norm_counts, num_genes_keep, criteria=top_genes_criteria)
norm_counts = keep_top_genes(norm_counts, num_genes_keep / 100.0, criteria=top_genes_criteria)

if use_log_scale:
print "Using pseudo-log counts log2(counts + 1)"
Expand Down Expand Up @@ -196,7 +195,7 @@ def main(counts_table_files,
y = float(tokens[1])
x = float(tokens[0].split("_")[1])
index = int(tokens[0].split("_")[0])
file_writers[index].write("{0}\t{1}\n".format("{}x{}".format(x,y)), labels[i])
file_writers[index].write("{0}\t{1}\n".format("{}x{}".format(x,y), labels[i]))

# Compute a color_label based on the RGB representation of the 3D dimensionality reduced
labels_colors = list()
Expand Down Expand Up @@ -315,12 +314,12 @@ def main(counts_table_files,
parser.add_argument("--num-clusters", default=None, metavar="[INT]", type=int, choices=range(2, 10),
help="The number of clusters/regions expected to be found.\n" \
"If not given the number of clusters will be computed.")
parser.add_argument("--num-exp-genes", default=5, metavar="[INT]", type=int, choices=range(1, 100),
help="The number of expressed genes (!= 0) a spot\n" \
"must have to be kept (default: %(default)s)")
parser.add_argument("--num-exp-spots", default=5, metavar="[INT]", type=int, choices=range(1, 100),
help="The number of expressed spots (!= 0) a gene " \
"must have to be kept (default: %(default)s)")
parser.add_argument("--num-exp-genes", default=10, metavar="[INT]", type=int, choices=range(0, 100),
help="The percentage of number of expressed genes (!= 0) a spot\n" \
"must have to be kept from the distribution of all expressed genes (default: %(default)s)")
parser.add_argument("--num-exp-spots", default=1, metavar="[INT]", type=int, choices=range(0, 100),
help="The percentage of number of expressed spots (!= 0) a gene " \
"must have to be kept from the total number of spots (default: %(default)s)")
parser.add_argument("--num-genes-keep", default=20, metavar="[INT]", type=int, choices=range(0, 100),
help="The percentage of genes to discard from the distribution of all the genes\n" \
"across all the spots using the variance or the top expression " \
Expand Down Expand Up @@ -356,7 +355,7 @@ def main(counts_table_files,
"the dimensionality reduction (Variance or TopRanked) (default: %(default)s)")
parser.add_argument("--use-adjusted-log", action="store_true", default=False,
help="Use adjusted log normalized counts (R Scater::normalized()) "
"in the dimensionality reduction step")
"in the dimensionality reduction step (recommended with SCRAN normalization)")
parser.add_argument("--outdir", default=None, help="Path to output dir")
args = parser.parse_args()
main(args.counts_table_files,
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

setup(
name = 'stanalysis',
version = "0.4.0",
version = "0.4.1",
description = __doc__.split("\n", 1)[0],
long_description = long_description,
keywords = 'rna-seq analysis machine_learning spatial transcriptomics toolkit',
Expand Down
10 changes: 6 additions & 4 deletions stanalysis/normalization.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,11 @@ def computeTMMFactors(counts):
r_counts = pandas2ri.py2ri(counts)
edger = RimportLibrary("edgeR")
as_matrix = r["as.matrix"]
dds = edger.calcNormFactors(as_matrix(r_counts), method="TMM") * r.colSums(counts)
dds = edger.calcNormFactors(as_matrix(r_counts), method="TMM")
pandas_sf = pandas2ri.ri2py(dds)
pandas_cm = pandas2ri.ri2py(r.colSums(counts))
pandas2ri.deactivate()
return pandas_sf
return pandas_sf * pandas_cm

def computeRLEFactors(counts):
""" Compute normalization size factors
Expand All @@ -45,10 +46,11 @@ def computeRLEFactors(counts):
r_counts = pandas2ri.py2ri(counts)
edger = RimportLibrary("edgeR")
as_matrix = r["as.matrix"]
dds = edger.calcNormFactors(as_matrix(r_counts), method="RLE") * r.colSums(counts)
dds = edger.calcNormFactors(as_matrix(r_counts), method="RLE")
pandas_sf = pandas2ri.ri2py(dds)
pandas_cm = pandas2ri.ri2py(r.colSums(counts))
pandas2ri.deactivate()
return pandas_sf
return pandas_sf * pandas_cm

def computeSumFactors(counts):
""" Compute normalization factors
Expand Down
62 changes: 39 additions & 23 deletions stanalysis/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"""
import numpy as np
import pandas as pd
import math
import os
from stanalysis.normalization import *

Expand Down Expand Up @@ -41,18 +42,18 @@ def aggregate_datatasets(counts_table_files, plot_hist=False):
counts.fillna(0.0, inplace=True)
return counts

def remove_noise(counts, num_exp_genes=5, num_exp_spots=5, min_expression=1):
def remove_noise(counts, num_exp_genes=0.01, num_exp_spots=0.01, min_expression=1):
"""This functions remove noisy genes and spots
for a given data frame (Genes as columns and spots as rows).
- The noisy spots are removed by counting how many expressed genes > 0
a spot has.
- The noisy genes are removed by counting how many expressed spots >= min_expression
a gene has.
:param counts: a Pandas data frame with the counts (genes as columns)
:param num_exp_genes: an integer representing
the total number of genes that a spot must have with a count bigger
than 0 in order to be kept
:param num_exp_spots: an integer representing
- The noisy spots are removed so to keep a percentage
of the total distribution of spots whose gene counts are not 0
The percentage is given as a parameter.
- The noisy genes are removed so every gene that is expressed
in less than 1% of the total spots. Expressed with a count >= 2.
:param counts: a Pandas data frame with the counts
:param num_exp_genes: a float from 0-1 representing the % of
the distribution of expressed genes a spot must have to be kept
:param num_exp_spots: a float from 0-1 representing the % of
the total number of spots that a gene must have with a count bigger
than the parameter min_expression in order to be kept
:param min_expression: the minimum expression for a gene to be
Expand All @@ -62,20 +63,23 @@ def remove_noise(counts, num_exp_genes=5, num_exp_spots=5, min_expression=1):

# How many spots do we keep based on the number of genes expressed?
num_spots = len(counts.index)
print "Removing spots that are expressed in less than {} genes".format(nnum_exp_genes)
counts = counts[(counts != 0).sum(axis=1) >= num_exp_genes]
num_genes = len(counts.columns)
min_genes_spot_exp = round((counts != 0).sum(axis=1).quantile(num_exp_genes))
print "Number of expressed genes a spot must have to be kept " \
"({}% of total expressed genes) {}".format(num_exp_genes,min_genes_spot_exp)
counts = counts[(counts != 0).sum(axis=1) >= min_genes_spot_exp]
print "Dropped {} spots".format(num_spots - len(counts.index))

# Spots are columns and genes are rows
counts = counts.transpose()

# Remove noisy genes
num_genes = len(counts.columns)
print "Removing genes that are expressed in less than {} spots".format(num_exp_spots)
counts = counts[(counts >= min_expression).sum(axis=1) >= num_exp_spots]
min_features_gene = round(len(counts.columns) * num_exp_spots)
print "Removing genes that are expressed in less than {} " \
"spots with a count of at least {}".format(min_features_gene, min_expression)
counts = counts[(counts >= min_expression).sum(axis=1) >= min_features_gene]
print "Dropped {} genes".format(num_genes - len(counts.index))

# return the filtered matrix
return counts.transpose()

def keep_top_genes(counts, num_genes_keep, criteria="Variance"):
Expand All @@ -86,22 +90,29 @@ def keep_top_genes(counts, num_genes_keep, criteria="Variance"):
:param counts: a Pandas data frame with the counts
:param num_genes_keep: the % (1-100) of genes to keep
:param criteria: the criteria used to select ("Variance or "TopRanked")
:return: a new Pandas data frame with only the top raned genes.
:return: a new Pandas data frame with only the top ranked genes.
"""
# Spots as columns and genes as rows
counts = counts.transpose()
# Keep only the genes with higher over-all variance
num_genes = len(counts.index)
print "Removing {}% of genes based on the {}".format(num_genes_keep * 100, criteria)
if criteria == "Variance":
min_genes_spot_var = counts.var(axis=1).quantile(num_genes_keep)
print "Min normalized variance a gene must have over all spots " \
"to be kept ({0}% of total) {1}".format(num_genes_keep, min_genes_spot_var)
counts = counts[counts.var(axis=1) >= min_genes_spot_var]
if math.isnan(min_genes_spot_var):
print "Computed variance is NaN! Check your normalization factors.."
else:
print "Min normalized variance a gene must have over all spots " \
"to be kept ({0}% of total) {1}".format(num_genes_keep, min_genes_spot_var)
counts = counts[counts.var(axis=1) >= min_genes_spot_var]
elif criteria == "TopRankded":
min_genes_spot_sum = counts.sum(axis=1).quantile(num_genes_keep)
print "Min normalized total count a gene must have over all spots " \
"to be kept ({0}% of total) {1}".format(num_genes_keep, min_genes_spot_sum)
counts = counts[counts.sum(axis=1) >= min_genes_spot_var]
if math.isnan(min_genes_spot_var):
print "Computed sum is NaN! Check your normalization factors.."
else:
print "Min normalized total count a gene must have over all spots " \
"to be kept ({0}% of total) {1}".format(num_genes_keep, min_genes_spot_sum)
counts = counts[counts.sum(axis=1) >= min_genes_spot_var]
else:
raise RunTimeError("Error, incorrect criteria method\n")
print "Dropped {} genes".format(num_genes - len(counts.index))
Expand Down Expand Up @@ -134,6 +145,9 @@ def compute_size_factors(counts, normalization):
if np.isnan(size_factors).any() or np.isinf(size_factors).any():
print "Warning: Computed size factors contained NaN or Inf. The data will not be normalized!"
size_factors = np.ones(len(size_factors))
elif (size_factors <= 0.0).any():
print "Warning: Computed size factors contained zeroes or negative values.\nThey will be replaced by epsilon!"
size_factors[size_factors <= 0.0] = np.finfo(np.float32).eps
return size_factors

def normalize_data(counts, normalization, center=False, adjusted_log=False):
Expand All @@ -150,6 +164,8 @@ def normalize_data(counts, normalization, center=False, adjusted_log=False):
"""
# Compute the size factors
size_factors = compute_size_factors(counts, normalization)
if np.all(size_factors == 1.0):
return counts
# Spots as columns and genes as rows
counts = counts.transpose()
# Center and/or adjust log the size_factors and counts
Expand Down