Skip to content

Commit 3722ce5

Browse files
committed
Normalize percentage of identity by the length without gaps biojava#521
1 parent a695a5f commit 3722ce5

File tree

3 files changed

+75
-35
lines changed

3 files changed

+75
-35
lines changed

biojava-core/src/main/java/org/biojava/nbio/core/alignment/SimpleSequencePair.java

Lines changed: 71 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -38,12 +38,14 @@
3838
*
3939
* @author Mark Chapman
4040
* @author Paolo Pavan
41-
* @param <S> each element of the alignment {@link Profile} is of type S
42-
* @param <C> each element of an {@link AlignedSequence} is a {@link Compound} of type C
41+
* @param <S>
42+
* each element of the alignment {@link Profile} is of type S
43+
* @param <C>
44+
* each element of an {@link AlignedSequence} is a {@link Compound}
45+
* of type C
4346
*/
44-
public class SimpleSequencePair<S extends Sequence<C>, C extends Compound> extends SimpleProfile<S, C>
45-
implements SequencePair<S, C> {
46-
47+
public class SimpleSequencePair<S extends Sequence<C>, C extends Compound>
48+
extends SimpleProfile<S, C> implements SequencePair<S, C> {
4749

4850
private static final long serialVersionUID = 1L;
4951

@@ -52,22 +54,34 @@ public class SimpleSequencePair<S extends Sequence<C>, C extends Compound> exten
5254
/**
5355
* Creates a pair profile for the given already aligned sequences.
5456
*
55-
* @param query the first sequence of the pair
56-
* @param target the second sequence of the pair
57-
* @throws IllegalArgumentException if sequences differ in size
57+
* @param query
58+
* the first sequence of the pair
59+
* @param target
60+
* the second sequence of the pair
61+
* @throws IllegalArgumentException
62+
* if sequences differ in size
5863
*/
59-
public SimpleSequencePair(AlignedSequence<S, C> query, AlignedSequence<S, C> target) {
64+
public SimpleSequencePair(AlignedSequence<S, C> query,
65+
AlignedSequence<S, C> target) {
6066
super(query, target);
6167
}
6268

6369
/**
6470
* Creates a pair profile for the given sequences with a global alignment.
6571
*
66-
* @param query the first sequence of the pair
67-
* @param target the second sequence of the pair
68-
* @param sx lists whether the query sequence aligns a {@link Compound} or gap at each index of the alignment
69-
* @param sy lists whether the target sequence aligns a {@link Compound} or gap at each index of the alignment
70-
* @throws IllegalArgumentException if alignments differ in size or given sequences do not fit in alignments
72+
* @param query
73+
* the first sequence of the pair
74+
* @param target
75+
* the second sequence of the pair
76+
* @param sx
77+
* lists whether the query sequence aligns a {@link Compound} or
78+
* gap at each index of the alignment
79+
* @param sy
80+
* lists whether the target sequence aligns a {@link Compound} or
81+
* gap at each index of the alignment
82+
* @throws IllegalArgumentException
83+
* if alignments differ in size or given sequences do not fit in
84+
* alignments
7185
*/
7286
public SimpleSequencePair(S query, S target, List<Step> sx, List<Step> sy) {
7387
this(query, target, sx, 0, 0, sy, 0, 0);
@@ -76,17 +90,34 @@ public SimpleSequencePair(S query, S target, List<Step> sx, List<Step> sy) {
7690
/**
7791
* Creates a pair profile for the given sequences with a local alignment.
7892
*
79-
* @param query the first sequence of the pair
80-
* @param target the second sequence of the pair
81-
* @param sx lists whether the query sequence aligns a {@link Compound} or gap at each index of the alignment
82-
* @param xb number of {@link Compound}s skipped in the query sequence before the aligned region
83-
* @param xa number of {@link Compound}s skipped in the query sequence after the aligned region
84-
* @param sy lists whether the target sequence aligns a {@link Compound} or gap at each index of the alignment
85-
* @param yb number of {@link Compound}s skipped in the target sequence before the aligned region
86-
* @param ya number of {@link Compound}s skipped in the target sequence after the aligned region
87-
* @throws IllegalArgumentException if alignments differ in size or given sequences do not fit in alignments
93+
* @param query
94+
* the first sequence of the pair
95+
* @param target
96+
* the second sequence of the pair
97+
* @param sx
98+
* lists whether the query sequence aligns a {@link Compound} or
99+
* gap at each index of the alignment
100+
* @param xb
101+
* number of {@link Compound}s skipped in the query sequence
102+
* before the aligned region
103+
* @param xa
104+
* number of {@link Compound}s skipped in the query sequence
105+
* after the aligned region
106+
* @param sy
107+
* lists whether the target sequence aligns a {@link Compound} or
108+
* gap at each index of the alignment
109+
* @param yb
110+
* number of {@link Compound}s skipped in the target sequence
111+
* before the aligned region
112+
* @param ya
113+
* number of {@link Compound}s skipped in the target sequence
114+
* after the aligned region
115+
* @throws IllegalArgumentException
116+
* if alignments differ in size or given sequences do not fit in
117+
* alignments
88118
*/
89-
public SimpleSequencePair(S query, S target, List<Step> sx, int xb, int xa, List<Step> sy, int yb, int ya) {
119+
public SimpleSequencePair(S query, S target, List<Step> sx, int xb, int xa,
120+
List<Step> sy, int yb, int ya) {
90121
super(query, target, sx, xb, xa, sy, yb, ya);
91122
}
92123

@@ -107,7 +138,8 @@ public int getIndexInQueryAt(int alignmentIndex) {
107138

108139
@Override
109140
public int getIndexInQueryForTargetAt(int targetIndex) {
110-
return getAlignedSequence(1).getSequenceIndexAt(getAlignedSequence(2).getAlignmentIndexAt(targetIndex));
141+
return getAlignedSequence(1).getSequenceIndexAt(
142+
getAlignedSequence(2).getAlignmentIndexAt(targetIndex));
111143
}
112144

113145
@Override
@@ -117,15 +149,17 @@ public int getIndexInTargetAt(int alignmentIndex) {
117149

118150
@Override
119151
public int getIndexInTargetForQueryAt(int queryIndex) {
120-
return getAlignedSequence(2).getSequenceIndexAt(getAlignedSequence(1).getAlignmentIndexAt(queryIndex));
152+
return getAlignedSequence(2).getSequenceIndexAt(
153+
getAlignedSequence(1).getAlignmentIndexAt(queryIndex));
121154
}
122155

123156
@Override
124157
public int getNumIdenticals() {
125158
if (identicals == -1) {
126159
identicals = 0;
127160
for (int i = 1; i <= getLength(); i++) {
128-
if (getCompoundInQueryAt(i).equalsIgnoreCase(getCompoundInTargetAt(i))) {
161+
if (getCompoundInQueryAt(i).equalsIgnoreCase(
162+
getCompoundInTargetAt(i))) {
129163
identicals++;
130164
}
131165
}
@@ -144,13 +178,15 @@ public int getNumSimilars() {
144178
C c1 = getCompoundInQueryAt(i);
145179
C c2 = getCompoundInTargetAt(i);
146180

147-
if ( c1 instanceof AminoAcidCompound && c2 instanceof AminoAcidCompound) {
148-
short value = matrix.getValue((AminoAcidCompound)c1, (AminoAcidCompound)c2);
149-
if ( value > 0)
181+
if (c1 instanceof AminoAcidCompound
182+
&& c2 instanceof AminoAcidCompound) {
183+
short value = matrix.getValue((AminoAcidCompound) c1,
184+
(AminoAcidCompound) c2);
185+
if (value > 0)
150186
similars++;
151187
} else {
152188

153-
if (getCompoundSet().compoundsEquivalent(c1,c2)) {
189+
if (getCompoundSet().compoundsEquivalent(c1, c2)) {
154190
similars++;
155191
}
156192
}
@@ -174,7 +210,10 @@ public AlignedSequence<S, C> getTarget() {
174210
@Override
175211
public double getPercentageOfIdentity() {
176212
double seqid = getNumIdenticals();
177-
return seqid / getLength();
213+
double length = getLength()
214+
- getAlignedSequence(1).getNumGapPositions()
215+
- getAlignedSequence(2).getNumGapPositions();
216+
return seqid / length;
178217
}
179218

180219
}

biojava-core/src/main/java/org/biojava/nbio/core/alignment/template/SequencePair.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,8 @@ public interface SequencePair<S extends Sequence<C>, C extends Compound> extends
9999

100100
/**
101101
* Returns the percentage of identity between the two sequences in the alignment as a fraction between 0 and 1.
102-
* This is equivalent to {@link #getNumIdenticals()} / {@link #getLength()}.
102+
* This is equivalent to {@link #getNumIdenticals()} / {@link #getLength()}. Gap positions are exluded
103+
* from the calculation.
103104
*
104105
* @return the percentage of sequence identity as a fraction in [0,1]
105106
*/

biojava-core/src/test/java/org/biojava/nbio/core/alignment/SimpleSequencePairTest.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -250,8 +250,8 @@ public void testGetNumIdenticals() {
250250

251251
@Test
252252
public void testGetPercentageOfIdentity() {
253-
assertEquals(global.getPercentageOfIdentity(), 0.4, 0.01);
254-
assertEquals(local.getPercentageOfIdentity(), 0.66, 0.01);
253+
assertEquals(global.getPercentageOfIdentity(), 1.0, 0.01);
254+
assertEquals(local.getPercentageOfIdentity(), 1.0, 0.01);
255255
}
256256

257257
@Test

0 commit comments

Comments
 (0)