Skip to content

Commit f3515a7

Browse files
committed
Introduced optimization: if the switch is provided, subunit clustering now uses entity id info.
Some tests are failing
1 parent 568a8d4 commit f3515a7

File tree

3 files changed

+94
-2
lines changed

3 files changed

+94
-2
lines changed

biojava-structure/src/main/java/org/biojava/nbio/structure/cluster/SubunitCluster.java

Lines changed: 64 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@
3131
import org.biojava.nbio.core.sequence.ProteinSequence;
3232
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
3333
import org.biojava.nbio.structure.Atom;
34+
import org.biojava.nbio.structure.Chain;
35+
import org.biojava.nbio.structure.Structure;
3436
import org.biojava.nbio.structure.StructureException;
3537
import org.biojava.nbio.structure.align.StructureAlignment;
3638
import org.biojava.nbio.structure.align.StructureAlignmentFactory;
@@ -45,6 +47,7 @@
4547
import org.biojava.nbio.structure.align.multiple.MultipleAlignmentImpl;
4648
import org.biojava.nbio.structure.align.multiple.util.MultipleAlignmentScorer;
4749
import org.biojava.nbio.structure.align.multiple.util.ReferenceSuperimposer;
50+
import org.biojava.nbio.structure.quaternary.BiologicalAssemblyBuilder;
4851
import org.biojava.nbio.structure.symmetry.core.QuatSymmetrySubunits;
4952
import org.biojava.nbio.structure.symmetry.internal.CESymmParameters;
5053
import org.biojava.nbio.structure.symmetry.internal.CeSymm;
@@ -178,6 +181,38 @@ public boolean isIdenticalTo(SubunitCluster other) {
178181
return thisSequence.equals(otherSequence);
179182
}
180183

184+
/**
185+
* Tells whether the other SubunitCluster contains exactly the same Subunit.
186+
* This is checked by equality of their entity identifiers if they are present.
187+
*
188+
* @param other
189+
* SubunitCluster
190+
* @return true if the SubunitClusters are identical, false otherwise
191+
*/
192+
public boolean isIdenticalByEntityIdTo(SubunitCluster other) {
193+
Structure thisStruct = this.subunits.get(this.representative).getStructure();
194+
Structure otherStruct = other.subunits.get(other.representative).getStructure();
195+
String thisName = this.subunits.get(this.representative).getName();
196+
String otherName = other.subunits.get(this.representative).getName();
197+
Chain thisChain = thisStruct.getChain(thisName);
198+
Chain otherChain = otherStruct.getChain(otherName);
199+
if (thisChain == null || otherChain == null) {
200+
logger.info("Can't determine entity ids of SubunitClusters {}-{}. Ignoring identity check by entity id",
201+
this.subunits.get(this.representative).getName(),
202+
other.subunits.get(other.representative).getName());
203+
return false;
204+
}
205+
if (thisChain.getEntityInfo() == null || otherChain.getEntityInfo() == null) {
206+
logger.info("Can't determine entity ids of SubunitClusters {}-{}. Ignoring identity check by entity id",
207+
this.subunits.get(this.representative).getName(),
208+
other.subunits.get(other.representative).getName());
209+
return false;
210+
}
211+
int thisEntityId = thisChain.getEntityInfo().getMolId();
212+
int otherEntityId = otherChain.getEntityInfo().getMolId();
213+
return thisEntityId == otherEntityId;
214+
}
215+
181216
/**
182217
* Merges the other SubunitCluster into this one if it contains exactly the
183218
* same Subunit. This is checked by {@link #isIdenticalTo(SubunitCluster)}.
@@ -191,7 +226,35 @@ public boolean mergeIdentical(SubunitCluster other) {
191226
if (!isIdenticalTo(other))
192227
return false;
193228

194-
logger.info("SubunitClusters are identical");
229+
logger.info("SubunitClusters {}-{} are identical in sequence",
230+
this.subunits.get(this.representative).getName(),
231+
other.subunits.get(other.representative).getName());
232+
233+
this.subunits.addAll(other.subunits);
234+
this.subunitEQR.addAll(other.subunitEQR);
235+
236+
return true;
237+
}
238+
239+
/**
240+
* Merges the other SubunitCluster into this one if it contains exactly the
241+
* same Subunit. This is checked by comparing the entity identifiers of the subunits
242+
* if one can be found.
243+
* Thus this only makes sense when the subunits are complete chains of a
244+
* deposited PDB entry.
245+
*
246+
* @param other
247+
* SubunitCluster
248+
* @return true if the SubunitClusters were merged, false otherwise
249+
*/
250+
public boolean mergeIdenticalByEntityId(SubunitCluster other) {
251+
252+
if (!isIdenticalByEntityIdTo(other))
253+
return false;
254+
255+
logger.info("SubunitClusters {}-{} belong to same entity. Assuming they are identical",
256+
this.subunits.get(this.representative).getName(),
257+
other.subunits.get(other.representative).getName());
195258

196259
this.subunits.addAll(other.subunits);
197260
this.subunitEQR.addAll(other.subunitEQR);

biojava-structure/src/main/java/org/biojava/nbio/structure/cluster/SubunitClusterer.java

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,18 @@ public static Stoichiometry cluster(List<Subunit> subunits, SubunitClustererPara
7171
for (int c1 = 0; c1 < clusters.size(); c1++) {
7272
for (int c2 = clusters.size() - 1; c2 > c1; c2--) {
7373
try {
74-
if (clusters.get(c1).mergeSequence(clusters.get(c2), params)) {
74+
if (params.isUseEntityIdForSeqIdentityDetermination() &&
75+
clusters.get(c1).mergeIdenticalByEntityId(clusters.get(c2))) {
76+
// We only do this if the switch for entity id comparison is on.
77+
// In some cases it can save enormous amounts of time, e.g. for clustering full
78+
// chains of deposited PDB entries. For instance for 6NHJ: with pure alignments it
79+
// takes ~ 6 hours, with entity id comparisons it takes 2 minutes.
80+
clusters.remove(c2);
81+
} else if (clusters.get(c1).mergeIdentical(clusters.get(c2))) {
82+
// This always makes sense as an optimization: it's far cheaper to compare the sequence
83+
// string than doing a full S-W alignment
84+
clusters.remove(c2);
85+
} else if (clusters.get(c1).mergeSequence(clusters.get(c2), params)) {
7586
clusters.remove(c2);
7687
}
7788

biojava-structure/src/main/java/org/biojava/nbio/structure/cluster/SubunitClustererParameters.java

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@ public class SubunitClustererParameters implements Serializable {
4545
private double sequenceIdentityThreshold;
4646
private double sequenceCoverageThreshold = 0.75;
4747

48+
private boolean useEntityIdForSeqIdentityDetermination = false;
49+
4850
private double rmsdThreshold = 3.0;
4951
private double structureCoverageThreshold = 0.75;
5052
private double tmThreshold = 0.5;
@@ -506,5 +508,21 @@ public boolean isHighConfidenceScores(double sequenceIdentity, double sequenceCo
506508
return sequenceIdentity>=hcSequenceIdentityLocal && sequenceCoverage >= hcSequenceCoverageLocal;
507509
}
508510

511+
/**
512+
* Whether to use the entity id of subunits to infer that sequences are identical.
513+
* Only applies if the {@link SubunitClustererMethod} is a sequence based one.
514+
* @return true if entity ids are used to infer that sequences are identical, false otherwise
515+
*/
516+
public boolean isUseEntityIdForSeqIdentityDetermination() {
517+
return useEntityIdForSeqIdentityDetermination;
518+
}
509519

520+
/**
521+
* Whether to use the entity id of subunits to infer that sequences are identical.
522+
* Only applies if the {@link SubunitClustererMethod} is a sequence based one.
523+
* @param useEntityIdForSeqIdentityDetermination the flag to be set
524+
*/
525+
public void setUseEntityIdForSeqIdentityDetermination(boolean useEntityIdForSeqIdentityDetermination) {
526+
this.useEntityIdForSeqIdentityDetermination = useEntityIdForSeqIdentityDetermination;
527+
}
510528
}

0 commit comments

Comments
 (0)