22
33import java .util .List ;
44
5- import org .biojava .nbio .alignment .Alignments ;
6- import org .biojava .nbio .alignment .Alignments .PairwiseSequenceScorerType ;
7- import org .biojava .nbio .alignment .SimpleGapPenalty ;
85import org .biojava .nbio .core .alignment .matrices .SubstitutionMatrixHelper ;
96import org .biojava .nbio .core .alignment .template .SubstitutionMatrix ;
107import org .biojava .nbio .core .sequence .MultipleSequenceAlignment ;
118import org .biojava .nbio .core .sequence .compound .AminoAcidCompound ;
129import org .biojava .nbio .core .sequence .template .Compound ;
1310import org .biojava .nbio .core .sequence .template .Sequence ;
14- import org .biojava .nbio .core .util .ConcurrencyTools ;
1511import org .forester .evoinference .distance .PairwiseDistanceCalculator ;
1612import org .forester .evoinference .matrix .distance .BasicSymmetricalDistanceMatrix ;
1713import org .forester .evoinference .matrix .distance .DistanceMatrix ;
@@ -192,22 +188,20 @@ public static <C extends Sequence<D>, D extends Compound> DistanceMatrix percent
192188 }
193189
194190 /**
195- * The fractional dissimilarity score (Ds) is the average character
191+ * The fractional dissimilarity score (Ds) is a relative measure of the
196192 * dissimilarity between two aligned sequences. It is calculated as:
197193 *
198194 * <pre>
199- * Ds = sum(max(M) - M<sub>ai,bi</sub>) / L
195+ * Ds = sum( max(M) - M<sub>ai,bi</sub> ) / (max(M)-min(M)) ) / L
200196 * </pre>
201197 *
202198 * Where the sum through i runs for all the alignment positions, ai and bi
203199 * are the AA at position i in the first and second aligned sequences,
204200 * respectively, and L is the total length of the alignment (normalization).
205201 * <p>
206- * The fractional dissimilarity score (Ds) is defined in the interval [0,
207- * 1], where 0 means that the sequences are identical and 1 that the
208- * sequences are completely different (similarity score of 0). Note that a
209- * value higher than one can be obtained, if the similarity score of the
210- * alignment is negative (big evolutionary distance).
202+ * The fractional dissimilarity score (Ds) is in the interval [0, 1], where
203+ * 0 means that the sequences are identical and 1 that the sequences are
204+ * completely different.
211205 * <p>
212206 * Gaps do not have a contribution to the similarity score calculation (gap
213207 * penalty = 0)
@@ -221,34 +215,48 @@ public static <C extends Sequence<D>, D extends Compound> DistanceMatrix percent
221215 public static <C extends Sequence <D >, D extends Compound > DistanceMatrix fractionalDissimilarityScore (
222216 MultipleSequenceAlignment <C , D > msa , SubstitutionMatrix <D > M ) {
223217
218+ // Calculate the similarity scores using the alignment package
219+ logger .info ("{}:{}" , "Determing Distances" , 0 );
220+
224221 int n = msa .getSize ();
225222 DistanceMatrix DM = new BasicSymmetricalDistanceMatrix (n );
223+ int totalloopcount = (n / 2 ) * (n + 1 );
224+ int end = msa .getLength ();
226225
227- // Calculate the similarity scores using the alignment package
228- double [] scores = Alignments .getAllPairsScores (
229- msa .getAlignedSequences (),
230- PairwiseSequenceScorerType .GLOBAL_SIMILARITIES ,
231- new SimpleGapPenalty (0 , 0 ), M );
226+ String [] sequenceString = new String [n ];
227+ for (int i = 0 ; i < n ; i ++) {
228+ sequenceString [i ] = msa .getAlignedSequence (i + 1 )
229+ .getSequenceAsString ();
230+ }
231+ List <C > seqs = msa .getAlignedSequences ();
232+
233+ int loopcount = 0 ;
234+ for (int i = 0 ; i < (n - 1 ); i ++) {
235+ logger .info ("{}:{}" , "Determining Distances" , (loopcount * 100 )
236+ / totalloopcount );
232237
233- ConcurrencyTools .shutdown ();
238+ // Obtain the similarity scores
239+ for (int j = i ; j < n ; j ++) {
234240
235- // The maximum score that can be obtained for this alignment
236- int maxScore = M . getMaxValue () * msa . getLength () ;
241+ double score = 0 ;
242+ loopcount ++ ;
237243
238- // Convert and copy the dissimilarity scores to the matrix
239- int indx = 0 ;
240- for (int i = 0 ; i < n ; i ++) {
241- DM .setIdentifier (i , msa .getAlignedSequence (i + 1 ).getAccession ()
242- .getID ());
244+ for (int k = 0 ; k < end ; k ++) {
245+ if (Comparison .isGap (sequenceString [i ].charAt (k ))
246+ || Comparison .isGap (sequenceString [j ].charAt (k )))
247+ continue ;
248+ score += M .getValue (seqs .get (i ).getCompoundAt (k + 1 ), seqs
249+ .get (j ).getCompoundAt (k + 1 ));
250+ }
243251
244- for (int j = i ; j < n ; j ++) {
245252 if (i == j )
246253 DM .setValue (i , j , 0.0 );
247254 else {
248- double dS = (maxScore - scores [indx ]) / msa .getLength ();
255+ double dS = (M .getMaxValue () - score / msa .getLength ())
256+ / (M .getMaxValue () - M .getMinValue ());
257+
249258 DM .setValue (i , j , dS );
250259 DM .setValue (j , i , dS );
251- indx ++;
252260 }
253261 }
254262 }
@@ -315,8 +323,8 @@ public static <C extends Sequence<D>, D extends Compound> DistanceMatrix dissimi
315323 if (Comparison .isGap (sequenceString [i ].charAt (k ))
316324 || Comparison .isGap (sequenceString [j ].charAt (k )))
317325 continue ;
318- score += M .getValue (seqs .get (i ).getCompoundAt (k + 1 ),
319- seqs .get (j ).getCompoundAt (k + 1 ));
326+ score += M .getValue (seqs .get (i ).getCompoundAt (k + 1 ), seqs
327+ .get (j ).getCompoundAt (k + 1 ));
320328 }
321329
322330 if (i != j )
0 commit comments