Skip to content

Commit 4a4c06d

Browse files
committed
Implement fractional dissimilarity score
1 parent 85578cc commit 4a4c06d

File tree

1 file changed

+37
-29
lines changed

1 file changed

+37
-29
lines changed

biojava-phylo/src/main/java/org/biojava/nbio/phylo/DistanceMatrixCalculator.java

Lines changed: 37 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,12 @@
22

33
import java.util.List;
44

5-
import org.biojava.nbio.alignment.Alignments;
6-
import org.biojava.nbio.alignment.Alignments.PairwiseSequenceScorerType;
7-
import org.biojava.nbio.alignment.SimpleGapPenalty;
85
import org.biojava.nbio.core.alignment.matrices.SubstitutionMatrixHelper;
96
import org.biojava.nbio.core.alignment.template.SubstitutionMatrix;
107
import org.biojava.nbio.core.sequence.MultipleSequenceAlignment;
118
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
129
import org.biojava.nbio.core.sequence.template.Compound;
1310
import org.biojava.nbio.core.sequence.template.Sequence;
14-
import org.biojava.nbio.core.util.ConcurrencyTools;
1511
import org.forester.evoinference.distance.PairwiseDistanceCalculator;
1612
import org.forester.evoinference.matrix.distance.BasicSymmetricalDistanceMatrix;
1713
import org.forester.evoinference.matrix.distance.DistanceMatrix;
@@ -192,22 +188,20 @@ public static <C extends Sequence<D>, D extends Compound> DistanceMatrix percent
192188
}
193189

194190
/**
195-
* The fractional dissimilarity score (Ds) is the average character
191+
* The fractional dissimilarity score (Ds) is a relative measure of the
196192
* dissimilarity between two aligned sequences. It is calculated as:
197193
*
198194
* <pre>
199-
* Ds = sum(max(M) - M<sub>ai,bi</sub>) / L
195+
* Ds = sum( max(M) - M<sub>ai,bi</sub> ) / (max(M)-min(M)) ) / L
200196
* </pre>
201197
*
202198
* Where the sum through i runs for all the alignment positions, ai and bi
203199
* are the AA at position i in the first and second aligned sequences,
204200
* respectively, and L is the total length of the alignment (normalization).
205201
* <p>
206-
* The fractional dissimilarity score (Ds) is defined in the interval [0,
207-
* 1], where 0 means that the sequences are identical and 1 that the
208-
* sequences are completely different (similarity score of 0). Note that a
209-
* value higher than one can be obtained, if the similarity score of the
210-
* alignment is negative (big evolutionary distance).
202+
* The fractional dissimilarity score (Ds) is in the interval [0, 1], where
203+
* 0 means that the sequences are identical and 1 that the sequences are
204+
* completely different.
211205
* <p>
212206
* Gaps do not have a contribution to the similarity score calculation (gap
213207
* penalty = 0)
@@ -221,34 +215,48 @@ public static <C extends Sequence<D>, D extends Compound> DistanceMatrix percent
221215
public static <C extends Sequence<D>, D extends Compound> DistanceMatrix fractionalDissimilarityScore(
222216
MultipleSequenceAlignment<C, D> msa, SubstitutionMatrix<D> M) {
223217

218+
// Calculate the similarity scores using the alignment package
219+
logger.info("{}:{}", "Determing Distances", 0);
220+
224221
int n = msa.getSize();
225222
DistanceMatrix DM = new BasicSymmetricalDistanceMatrix(n);
223+
int totalloopcount = (n / 2) * (n + 1);
224+
int end = msa.getLength();
226225

227-
// Calculate the similarity scores using the alignment package
228-
double[] scores = Alignments.getAllPairsScores(
229-
msa.getAlignedSequences(),
230-
PairwiseSequenceScorerType.GLOBAL_SIMILARITIES,
231-
new SimpleGapPenalty(0, 0), M);
226+
String[] sequenceString = new String[n];
227+
for (int i = 0; i < n; i++) {
228+
sequenceString[i] = msa.getAlignedSequence(i + 1)
229+
.getSequenceAsString();
230+
}
231+
List<C> seqs = msa.getAlignedSequences();
232+
233+
int loopcount = 0;
234+
for (int i = 0; i < (n - 1); i++) {
235+
logger.info("{}:{}", "Determining Distances", (loopcount * 100)
236+
/ totalloopcount);
232237

233-
ConcurrencyTools.shutdown();
238+
// Obtain the similarity scores
239+
for (int j = i; j < n; j++) {
234240

235-
// The maximum score that can be obtained for this alignment
236-
int maxScore = M.getMaxValue() * msa.getLength();
241+
double score = 0;
242+
loopcount++;
237243

238-
// Convert and copy the dissimilarity scores to the matrix
239-
int indx = 0;
240-
for (int i = 0; i < n; i++) {
241-
DM.setIdentifier(i, msa.getAlignedSequence(i + 1).getAccession()
242-
.getID());
244+
for (int k = 0; k < end; k++) {
245+
if (Comparison.isGap(sequenceString[i].charAt(k))
246+
|| Comparison.isGap(sequenceString[j].charAt(k)))
247+
continue;
248+
score += M.getValue(seqs.get(i).getCompoundAt(k + 1), seqs
249+
.get(j).getCompoundAt(k + 1));
250+
}
243251

244-
for (int j = i; j < n; j++) {
245252
if (i == j)
246253
DM.setValue(i, j, 0.0);
247254
else {
248-
double dS = (maxScore - scores[indx]) / msa.getLength();
255+
double dS = (M.getMaxValue() - score / msa.getLength())
256+
/ (M.getMaxValue() - M.getMinValue());
257+
249258
DM.setValue(i, j, dS);
250259
DM.setValue(j, i, dS);
251-
indx++;
252260
}
253261
}
254262
}
@@ -315,8 +323,8 @@ public static <C extends Sequence<D>, D extends Compound> DistanceMatrix dissimi
315323
if (Comparison.isGap(sequenceString[i].charAt(k))
316324
|| Comparison.isGap(sequenceString[j].charAt(k)))
317325
continue;
318-
score += M.getValue(seqs.get(i).getCompoundAt(k + 1),
319-
seqs.get(j).getCompoundAt(k + 1));
326+
score += M.getValue(seqs.get(i).getCompoundAt(k + 1), seqs
327+
.get(j).getCompoundAt(k + 1));
320328
}
321329

322330
if (i != j)

0 commit comments

Comments
 (0)