forked from jfnavarro/st_analysis
-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathunsupervised.py
More file actions
312 lines (279 loc) · 15.4 KB
/
Copy pathunsupervised.py
File metadata and controls
312 lines (279 loc) · 15.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
#! /usr/bin/env python
# -*- coding: utf-8 -*-
"""
A script that does unsupervised
classification on single cell data (Mainly used for Spatial Transcriptomics)
It takes a list of data frames as input and outputs :
- the filtered aggregated data frame
- a scatter plot with the predicted classes for each spot
- a file with the predicted classes for each spot (tab delimited)
- a scatter plot for each dataset of the spots and the predicted color
(if images are given, the images are used as background)
The input data frames must have the gene names as columns and
the spots coordinates as rows (1x1).
The spots in the output file will have the index of the dataset
appended. For instance if two datasets are given the indexes will
be (1 and 2).
The user can select what clustering algorithm to use
and what dimensionality reduction technique to use.
Noisy spots (very few genes expressed) are removed using a parameter.
Noisy genes (expressed in very few spots) are removed using a parameter.
@Author Jose Fernandez Navarro <jose.fernandez.navarro@scilifelab.se>
"""
import argparse
import sys
import os
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, FastICA, SparsePCA
#from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
#from sklearn.preprocessing import scale
from stanalysis.visualization import scatter_plot, scatter_plot3d, histogram
from stanalysis.preprocessing import *
from stanalysis.alignment import parseAlignmentMatrix
import matplotlib.pyplot as plt
def linear_conv(old, min, max, new_min, new_max):
return ((old - min) / (max - min)) * ((new_max - new_min) + new_min)
def main(counts_table_files,
normalization,
num_clusters,
clustering_algorithm,
dimensionality_algorithm,
use_log_scale,
apply_sample_normalization,
num_exp_genes,
num_genes_keep,
outdir,
alignment_files,
image_files,
num_dimensions,
spot_size,
top_genes_criteria):
if len(counts_table_files) == 0 or \
any([not os.path.isfile(f) for f in counts_table_files]):
sys.stderr.write("Error, input file/s not present or invalid format\n")
sys.exit(1)
if image_files is not None and len(image_files) > 0 and \
len(image_files) != len(counts_table_files):
sys.stderr.write("Error, the number of images given as " \
"input is not the same as the number of datasets\n")
sys.exit(1)
if alignment_files is not None and len(alignment_files) > 0 \
and len(alignment_files) != len(image_files):
sys.stderr.write("Error, the number of alignments given as " \
"input is not the same as the number of images\n")
sys.exit(1)
if outdir is None or not os.path.isdir(outdir):
outdir = os.getcwd()
outdir = os.path.abspath(outdir)
num_exp_genes = num_exp_genes / 100.0
num_genes_keep = num_genes_keep / 100.0
# Merge input datasets (Spots are rows and genes are columns)
counts = aggregate_datatasets(counts_table_files)
# Per sample normalization (Spots are rows and genes are columns)
if apply_sample_normalization and len(counts_table_files) > 1:
print "Computing per sample normalization..."
counts = normalize_samples(counts)
# Remove noisy spots and genes (Spots are rows and genes are columns)
counts = remove_noise(counts, num_exp_genes)
# Write un-normalized filtered counts to a file (for downstream analysis)
counts.to_csv(os.path.join(outdir, "aggregated_filtered_counts.tsv"), sep="\t")
# Normalize data
print "Computing per spot normalization..."
norm_counts = normalize_data(counts, normalization)
# Keep top genes (variance or expressed)
norm_counts = keep_top_genes(norm_counts, num_genes_keep, criteria=top_genes_criteria)
if "tSNE" in dimensionality_algorithm:
# method = barnes_hut or exact(slower)
# init = pca or random
# random_state = None or number
# metric = euclidean or any other
# n_components = 2 is default
decomp_model = TSNE(n_components=num_dimensions, random_state=None, perplexity=30,
early_exaggeration=4.0, learning_rate=1000, n_iter=1000,
n_iter_without_progress=30, metric="euclidean", init="pca",
method="exact", angle=0.5, verbose=0)
elif "PCA" in dimensionality_algorithm:
# n_components = None, number of mle to estimate optimal
decomp_model = PCA(n_components=num_dimensions, whiten=True, copy=True)
elif "ICA" in dimensionality_algorithm:
decomp_model = FastICA(n_components=num_dimensions,
algorithm='parallel', whiten=True,
fun='logcosh', w_init=None, random_state=None)
elif "SPCA" in dimensionality_algorithm:
decomp_model = SparsePCA(n_components=num_dimensions, alpha=1)
else:
sys.stderr.write("Error, incorrect dimensionality reduction method\n")
sys.exit(1)
if use_log_scale:
print "Using pseudo-log counts log2(counts + 1)"
norm_counts = np.log2(norm_counts + 1)
print "Performing dimensionality reduction..."
# Perform dimensionality reduction, outputs a bunch of 2D coordinates
reduced_data = decomp_model.fit_transform(norm_counts)
# Do clustering of the dimensionality reduced coordinates
if "KMeans" in clustering_algorithm:
clustering = KMeans(init='k-means++', n_clusters=num_clusters, n_init=10)
elif "Hierarchical" in clustering_algorithm:
clustering = AgglomerativeClustering(n_clusters=num_clusters,
affinity='euclidean',
n_components=None, linkage='ward')
else:
sys.stderr.write("Error, incorrect clustering method\n")
sys.exit(1)
print "Performing clustering..."
# Obtain predicted classes for each spot
labels = clustering.fit_predict(reduced_data)
if 0 in labels: labels = labels + 1
# Check the number of predicted labels is correct
assert(len(labels) == len(norm_counts.index))
# Compute a color_label based on the RGB representation of the 3D dimensionality reduced
labels_colors = list()
x_max = max(reduced_data[:,0])
x_min = min(reduced_data[:,0])
y_max = max(reduced_data[:,1])
y_min = min(reduced_data[:,1])
x_p = reduced_data[:,0]
y_p = reduced_data[:,1]
z_p = y_p
if num_dimensions == 3:
z_p = reduced_data[:,2]
z_max = max(reduced_data[:,2])
z_min = min(reduced_data[:,2])
for x,y,z in zip(x_p,y_p,z_p):
r = linear_conv(x, x_min, x_max, 0.0, 1.0)
g = linear_conv(y, y_min, y_max, 0.0, 1.0)
b = linear_conv(z, z_min, z_max, 0.0, 1.0) if num_dimensions == 3 else 1.0
labels_colors.append((r,g,b))
print "Generating plots..."
# Plot the clustered spots with the class color
if num_dimensions == 3:
scatter_plot3d(x_points=reduced_data[:,0],
y_points=reduced_data[:,1],
z_points=reduced_data[:,2],
colors=labels,
output=os.path.join(outdir,"computed_classes.png"),
title='Computed classes',
alpha=1.0,
size=100)
else:
scatter_plot(x_points=reduced_data[:,0],
y_points=reduced_data[:,1],
colors=labels,
output=os.path.join(outdir,"computed_classes.png"),
title='Computed classes',
alpha=1.0,
size=80)
# Plot the spots with colors corresponding to the predicted class
# Use the HE image as background if the image is given
tot_spots = norm_counts.index
with open(os.path.join(outdir,"computed_classes.txt"), "w") as filehandler:
for i in xrange(len(counts_table_files)):
# Get the list of spot coordinates and colors to plot for each dataset
x_points = list()
y_points = list()
colors_classes = list()
colors_dimensionality = list()
for j,spot in enumerate(tot_spots):
if spot.startswith("{}_".format(i)):
# spot is i_XxY
tokens = spot.split("x")
assert(len(tokens) == 2)
y = float(tokens[1])
x = float(tokens[0].split("_")[1])
x_points.append(x)
y_points.append(y)
colors_classes.append(labels[j])
colors_dimensionality.append(labels_colors[j])
# Write the predicted classes to a file (CLASS SPOT)
filehandler.write("{0}\t{1}\n".format(labels[j], spot))
# Retrieve alignment matrix and image if any
image = image_files[i] if image_files is not None \
and len(image_files) >= i else None
alignment = alignment_files[i] if alignment_files is not None \
and len(alignment_files) >= i else None
# alignment_matrix will be identity if alignment file is None
alignment_matrix = parseAlignmentMatrix(alignment)
scatter_plot(x_points=x_points,
y_points=y_points,
colors=colors_classes,
output=os.path.join(outdir,"computed_classes_tissue_{}.png".format(i)),
alignment=alignment_matrix,
cmap=None,
title='Computed classes tissue',
xlabel='X',
ylabel='Y',
image=image,
alpha=1.0,
size=spot_size)
scatter_plot(x_points=x_points,
y_points=y_points,
colors=colors_dimensionality,
output=os.path.join(outdir,"dimensionality_color_tissue_{}.png".format(i)),
alignment=alignment_matrix,
cmap=plt.get_cmap("hsv"),
title='Dimensionality color tissue',
xlabel='X',
ylabel='Y',
image=image,
alpha=1.0,
size=spot_size)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument("--counts-table-files", required=True, nargs='+', type=str,
help="One or more matrices with gene counts per feature/spot (genes as columns)")
parser.add_argument("--normalization", default="DESeq", metavar="[STR]",
type=str, choices=["RAW", "DESeq", "DESeq2", "DESeq2Log", "EdgeR", "REL"],
help="Normalize the counts using RAW(absolute counts) , " \
"DESeq, DESeq2, DESeq2Log, EdgeR and " \
"REL(relative counts, each gene count divided by the total count of its spot) (default: %(default)s)")
parser.add_argument("--num-clusters", default=3, metavar="[INT]", type=int, choices=range(2, 10),
help="The number of clusters/regions expected to be found. (default: %(default)s)")
parser.add_argument("--num-exp-genes", default=10, metavar="[INT]", type=int, choices=range(0, 100),
help="The percentage of number of expressed genes ( != 0 ) a spot " \
"must have to be kept from the distribution of all expressed genes (default: %(default)s)")
parser.add_argument("--num-genes-keep", default=20, metavar="[INT]", type=int, choices=range(0, 100),
help="The percentage of top genes to discard from the distribution of all the genes " \
"across all the spots (default: %(default)s)")
parser.add_argument("--clustering", default="KMeans", metavar="[STR]",
type=str, choices=["Hierarchical", "KMeans"],
help="What clustering algorithm to use after the dimensionality reduction " \
"(Hierarchical - KMeans) (default: %(default)s)")
parser.add_argument("--dimensionality", default="ICA", metavar="[STR]",
type=str, choices=["tSNE", "PCA", "ICA", "SPCA"],
help="What dimensionality reduction algorithm to use " \
"(tSNE - PCA - ICA - SPCA) (default: %(default)s)")
parser.add_argument("--use-log-scale", action="store_true", default=False,
help="Use log values in the dimensionality reduction step")
parser.add_argument("--normalize-samples", action="store_true", default=False,
help="When multiple datasets given this option computes normalization " \
"factors for each gene using DESeq on the different samples")
parser.add_argument("--alignment-files", default=None, nargs='+', type=str,
help="One or more tab delimited files containing and alignment matrix for the images " \
"(array coordinates to pixel coordinates) as a 3x3 matrix in one row.\n" \
"Only useful is the image has extra borders, for instance not cropped to the array corners" \
"or if you want the keep the original image size in the plots.")
parser.add_argument("--image-files", default=None, nargs='+', type=str,
help="When given the data will plotted on top of the image, " \
"It can be one ore more, ideally one for each input dataset\n " \
"It is desirable that the image is cropped to the array " \
"corners otherwise an alignment file is needed")
parser.add_argument("--num-dimensions", default=3, metavar="[INT]", type=int, choices=[2,3],
help="The number of dimensions to use in the dimensionality reduction (2 or 3). (default: %(default)s)")
parser.add_argument("--spot-size", default=100, metavar="[INT]", type=int, choices=range(10, 500),
help="The size of the spots when generating the plots. (default: %(default)s)")
parser.add_argument("--top-genes-criteria", default="Variance", metavar="[STR]",
type=str, choices=["Variance", "TopRankded"],
help="What criteria to use to keep top genes before doing " \
"the dimensionality reduction (Variance or TopRanked) (default: %(default)s)")
parser.add_argument("--outdir", default=None, help="Path to output dir")
args = parser.parse_args()
main(args.counts_table_files, args.normalization, int(args.num_clusters),
args.clustering, args.dimensionality, args.use_log_scale,
args.normalize_samples, args.num_exp_genes, args.num_genes_keep, args.outdir,
args.alignment_files, args.image_files, args.num_dimensions, args.spot_size,
args.top_genes_criteria)