diff --git a/biojava-structure/src/main/java/org/biojava/nbio/structure/cluster/SubunitCluster.java b/biojava-structure/src/main/java/org/biojava/nbio/structure/cluster/SubunitCluster.java index 03d8a97749..2ae2ccb83c 100644 --- a/biojava-structure/src/main/java/org/biojava/nbio/structure/cluster/SubunitCluster.java +++ b/biojava-structure/src/main/java/org/biojava/nbio/structure/cluster/SubunitCluster.java @@ -31,6 +31,8 @@ import org.biojava.nbio.core.sequence.ProteinSequence; import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; import org.biojava.nbio.structure.Atom; +import org.biojava.nbio.structure.Chain; +import org.biojava.nbio.structure.Structure; import org.biojava.nbio.structure.StructureException; import org.biojava.nbio.structure.align.StructureAlignment; import org.biojava.nbio.structure.align.StructureAlignmentFactory; @@ -45,6 +47,7 @@ import org.biojava.nbio.structure.align.multiple.MultipleAlignmentImpl; import org.biojava.nbio.structure.align.multiple.util.MultipleAlignmentScorer; import org.biojava.nbio.structure.align.multiple.util.ReferenceSuperimposer; +import org.biojava.nbio.structure.quaternary.BiologicalAssemblyBuilder; import org.biojava.nbio.structure.symmetry.core.QuatSymmetrySubunits; import org.biojava.nbio.structure.symmetry.internal.CESymmParameters; import org.biojava.nbio.structure.symmetry.internal.CeSymm; @@ -71,11 +74,10 @@ */ public class SubunitCluster { - private static final Logger logger = LoggerFactory - .getLogger(SubunitCluster.class); + private static final Logger logger = LoggerFactory.getLogger(SubunitCluster.class); - private List subunits = new ArrayList(); - private List> subunitEQR = new ArrayList>(); + private List subunits = new ArrayList<>(); + private List> subunitEQR = new ArrayList<>(); private int representative = -1; private SubunitClustererMethod method = SubunitClustererMethod.SEQUENCE; @@ -119,7 +121,7 @@ public SubunitCluster(Subunit subunit) { subunits.add(subunit); - List identity = new ArrayList(); + List identity = new ArrayList<>(); for (int i = 0; i < subunit.size(); i++) identity.add(i); subunitEQR.add(identity); @@ -179,6 +181,38 @@ public boolean isIdenticalTo(SubunitCluster other) { return thisSequence.equals(otherSequence); } + /** + * Tells whether the other SubunitCluster contains exactly the same Subunit. + * This is checked by equality of their entity identifiers if they are present. + * + * @param other + * SubunitCluster + * @return true if the SubunitClusters are identical, false otherwise + */ + public boolean isIdenticalByEntityIdTo(SubunitCluster other) { + Structure thisStruct = this.subunits.get(this.representative).getStructure(); + Structure otherStruct = other.subunits.get(other.representative).getStructure(); + String thisName = this.subunits.get(this.representative).getName(); + String otherName = other.subunits.get(this.representative).getName(); + Chain thisChain = thisStruct.getChain(thisName); + Chain otherChain = otherStruct.getChain(otherName); + if (thisChain == null || otherChain == null) { + logger.info("Can't determine entity ids of SubunitClusters {}-{}. Ignoring identity check by entity id", + this.subunits.get(this.representative).getName(), + other.subunits.get(other.representative).getName()); + return false; + } + if (thisChain.getEntityInfo() == null || otherChain.getEntityInfo() == null) { + logger.info("Can't determine entity ids of SubunitClusters {}-{}. Ignoring identity check by entity id", + this.subunits.get(this.representative).getName(), + other.subunits.get(other.representative).getName()); + return false; + } + int thisEntityId = thisChain.getEntityInfo().getMolId(); + int otherEntityId = otherChain.getEntityInfo().getMolId(); + return thisEntityId == otherEntityId; + } + /** * Merges the other SubunitCluster into this one if it contains exactly the * same Subunit. This is checked by {@link #isIdenticalTo(SubunitCluster)}. @@ -192,7 +226,35 @@ public boolean mergeIdentical(SubunitCluster other) { if (!isIdenticalTo(other)) return false; - logger.info("SubunitClusters are identical"); + logger.info("SubunitClusters {}-{} are identical in sequence", + this.subunits.get(this.representative).getName(), + other.subunits.get(other.representative).getName()); + + this.subunits.addAll(other.subunits); + this.subunitEQR.addAll(other.subunitEQR); + + return true; + } + + /** + * Merges the other SubunitCluster into this one if it contains exactly the + * same Subunit. This is checked by comparing the entity identifiers of the subunits + * if one can be found. + * Thus this only makes sense when the subunits are complete chains of a + * deposited PDB entry. I + * + * @param other + * SubunitCluster + * @return true if the SubunitClusters were merged, false otherwise + */ + public boolean mergeIdenticalByEntityId(SubunitCluster other) { + + if (!isIdenticalByEntityIdTo(other)) + return false; + + logger.info("SubunitClusters {}-{} belong to same entity. Assuming they are identical", + this.subunits.get(this.representative).getName(), + other.subunits.get(other.representative).getName()); this.subunits.addAll(other.subunits); this.subunitEQR.addAll(other.subunitEQR); @@ -296,13 +358,15 @@ public boolean mergeSequence(SubunitCluster other, SubunitClustererParameters pa return false; } - logger.info(String.format("SubunitClusters are similar in sequence " - + "with %.2f sequence identity and %.2f coverage", sequenceIdentity, - sequenceCoverage)); + logger.info(String.format("SubunitClusters %s-%s are similar in sequence " + + "with %.2f sequence identity and %.2f coverage", + this.subunits.get(this.representative).getName(), + other.subunits.get(other.representative).getName(), + sequenceIdentity, sequenceCoverage)); // If coverage and sequence identity sufficient, merge other and this - List thisAligned = new ArrayList(); - List otherAligned = new ArrayList(); + List thisAligned = new ArrayList<>(); + List otherAligned = new ArrayList<>(); // Extract the aligned residues of both Subunit for (int p = 1; p < aligner.getPair().getLength() + 1; p++) { @@ -318,60 +382,15 @@ public boolean mergeSequence(SubunitCluster other, SubunitClustererParameters pa // Only consider residues that are part of the SubunitCluster if (this.subunitEQR.get(this.representative).contains(thisIndex) - && other.subunitEQR.get(other.representative).contains( - otherIndex)) { + && other.subunitEQR.get(other.representative).contains(otherIndex)) { thisAligned.add(thisIndex); otherAligned.add(otherIndex); } } - // Do a List intersection to find out which EQR columns to remove - List thisRemove = new ArrayList(); - List otherRemove = new ArrayList(); - - for (int t = 0; t < this.subunitEQR.get(this.representative).size(); t++) { - // If the index is aligned do nothing, otherwise mark as removing - if (!thisAligned.contains(this.subunitEQR.get(this.representative) - .get(t))) - thisRemove.add(t); - } - - for (int t = 0; t < other.subunitEQR.get(other.representative).size(); t++) { - // If the index is aligned do nothing, otherwise mark as removing - if (!otherAligned.contains(other.subunitEQR.get( - other.representative).get(t))) - otherRemove.add(t); - } - // Now remove unaligned columns, from end to start - Collections.sort(thisRemove); - Collections.reverse(thisRemove); - Collections.sort(otherRemove); - Collections.reverse(otherRemove); - - for (int t = 0; t < thisRemove.size(); t++) { - for (List eqr : this.subunitEQR) { - int column = thisRemove.get(t); - eqr.remove(column); - } - } - - for (int t = 0; t < otherRemove.size(); t++) { - for (List eqr : other.subunitEQR) { - int column = otherRemove.get(t); - eqr.remove(column); - } - } - - // The representative is the longest sequence - if (this.subunits.get(this.representative).size() < other.subunits.get( - other.representative).size()) - this.representative = other.representative + subunits.size(); - - this.subunits.addAll(other.subunits); - this.subunitEQR.addAll(other.subunitEQR); + updateEquivResidues(other, thisAligned, otherAligned); this.method = SubunitClustererMethod.SEQUENCE; - pseudoStoichiometric = !params.isHighConfidenceScores(sequenceIdentity,sequenceCoverage); return true; @@ -445,8 +464,8 @@ public boolean mergeStructure(SubunitCluster other, SubunitClustererParameters p // Merge clusters List> alignedRes = msa.getBlock(0).getAlignRes(); - List thisAligned = new ArrayList(); - List otherAligned = new ArrayList(); + List thisAligned = new ArrayList<>(); + List otherAligned = new ArrayList<>(); // Extract the aligned residues of both Subunit for (int p = 0; p < msa.length(); p++) { @@ -469,24 +488,30 @@ public boolean mergeStructure(SubunitCluster other, SubunitClustererParameters p } } + updateEquivResidues(other, thisAligned, otherAligned); + + this.method = SubunitClustererMethod.STRUCTURE; + pseudoStoichiometric = true; + + return true; + } + + private void updateEquivResidues(SubunitCluster other, List thisAligned, List otherAligned) { // Do a List intersection to find out which EQR columns to remove - List thisRemove = new ArrayList(); - List otherRemove = new ArrayList(); + List thisRemove = new ArrayList<>(); + List otherRemove = new ArrayList<>(); for (int t = 0; t < this.subunitEQR.get(this.representative).size(); t++) { // If the index is aligned do nothing, otherwise mark as removing - if (!thisAligned.contains(this.subunitEQR.get(this.representative) - .get(t))) + if (!thisAligned.contains(this.subunitEQR.get(this.representative).get(t))) thisRemove.add(t); } for (int t = 0; t < other.subunitEQR.get(other.representative).size(); t++) { // If the index is aligned do nothing, otherwise mark as removing - if (!otherAligned.contains(other.subunitEQR.get( - other.representative).get(t))) + if (!otherAligned.contains(other.subunitEQR.get(other.representative).get(t))) otherRemove.add(t); } - // Now remove unaligned columns, from end to start Collections.sort(thisRemove); Collections.reverse(thisRemove); @@ -508,17 +533,12 @@ public boolean mergeStructure(SubunitCluster other, SubunitClustererParameters p } // The representative is the longest sequence - if (this.subunits.get(this.representative).size() < other.subunits.get( - other.representative).size()) + if (this.subunits.get(this.representative).size() < other.subunits.get(other.representative).size()) this.representative = other.representative + subunits.size(); this.subunits.addAll(other.subunits); this.subunitEQR.addAll(other.subunitEQR); - this.method = SubunitClustererMethod.STRUCTURE; - pseudoStoichiometric = true; - - return true; } /** @@ -568,9 +588,9 @@ public boolean divideInternally(SubunitClustererParameters clusterParams) List> alignedRes = result.getMultipleAlignment() .getBlock(0).getAlignRes(); - List> columns = new ArrayList>(); + List> columns = new ArrayList<>(); for (int s = 0; s < alignedRes.size(); s++) - columns.add(new ArrayList(alignedRes.get(s).size())); + columns.add(new ArrayList<>(alignedRes.get(s).size())); // Extract the aligned columns of each repeat in the Subunit for (int col = 0; col < alignedRes.get(0).size(); col++) { diff --git a/biojava-structure/src/main/java/org/biojava/nbio/structure/cluster/SubunitClusterer.java b/biojava-structure/src/main/java/org/biojava/nbio/structure/cluster/SubunitClusterer.java index 7c61472574..6295f8fdf0 100644 --- a/biojava-structure/src/main/java/org/biojava/nbio/structure/cluster/SubunitClusterer.java +++ b/biojava-structure/src/main/java/org/biojava/nbio/structure/cluster/SubunitClusterer.java @@ -56,12 +56,8 @@ public static Stoichiometry cluster(Structure structure, return cluster(subunits, params); } - public static Stoichiometry cluster(List subunits, - SubunitClustererParameters params) { - - // The collection of clusters to return - List clusters = new ArrayList(); - + public static Stoichiometry cluster(List subunits, SubunitClustererParameters params) { + List clusters = new ArrayList<>(); if (subunits.size() == 0) return new Stoichiometry(clusters); @@ -75,7 +71,14 @@ public static Stoichiometry cluster(List subunits, for (int c1 = 0; c1 < clusters.size(); c1++) { for (int c2 = clusters.size() - 1; c2 > c1; c2--) { try { - if (clusters.get(c1).mergeSequence(clusters.get(c2), params)) { + if (params.isUseEntityIdForSeqIdentityDetermination() && + clusters.get(c1).mergeIdenticalByEntityId(clusters.get(c2))) { + // This we will only do if the switch is for entity id comparison is on. + // In some cases it can save enormous amounts of time, e.g. for clustering full + // chains of deposited PDB entries. For instance for 6NHJ: with pure alignments it + // takes ~ 6 hours, with entity id comparisons it takes 2 minutes. + clusters.remove(c2); + } else if (clusters.get(c1).mergeSequence(clusters.get(c2), params)) { clusters.remove(c2); } diff --git a/biojava-structure/src/main/java/org/biojava/nbio/structure/cluster/SubunitClustererParameters.java b/biojava-structure/src/main/java/org/biojava/nbio/structure/cluster/SubunitClustererParameters.java index 5704012494..f3abae6c3e 100644 --- a/biojava-structure/src/main/java/org/biojava/nbio/structure/cluster/SubunitClustererParameters.java +++ b/biojava-structure/src/main/java/org/biojava/nbio/structure/cluster/SubunitClustererParameters.java @@ -45,6 +45,8 @@ public class SubunitClustererParameters implements Serializable { private double sequenceIdentityThreshold; private double sequenceCoverageThreshold = 0.75; + private boolean useEntityIdForSeqIdentityDetermination = false; + private double rmsdThreshold = 3.0; private double structureCoverageThreshold = 0.75; private double tmThreshold = 0.5; @@ -506,5 +508,21 @@ public boolean isHighConfidenceScores(double sequenceIdentity, double sequenceCo return sequenceIdentity>=hcSequenceIdentityLocal && sequenceCoverage >= hcSequenceCoverageLocal; } + /** + * Whether to use the entity id of subunits to infer that sequences are identical. + * Only applies if the {@link SubunitClustererMethod} is a sequence based one. + * @return + */ + public boolean isUseEntityIdForSeqIdentityDetermination() { + return useEntityIdForSeqIdentityDetermination; + } + /** + * Whether to use the entity id of subunits to infer that sequences are identical. + * Only applies if the {@link SubunitClustererMethod} is a sequence based one. + * @param useEntityIdForSeqIdentityDetermination the flag to be set + */ + public void setUseEntityIdForSeqIdentityDetermination(boolean useEntityIdForSeqIdentityDetermination) { + this.useEntityIdForSeqIdentityDetermination = useEntityIdForSeqIdentityDetermination; + } } diff --git a/biojava-structure/src/test/java/org/biojava/nbio/structure/cluster/TestSubunitCluster.java b/biojava-structure/src/test/java/org/biojava/nbio/structure/cluster/TestSubunitCluster.java index a44309719e..52c7e305b7 100644 --- a/biojava-structure/src/test/java/org/biojava/nbio/structure/cluster/TestSubunitCluster.java +++ b/biojava-structure/src/test/java/org/biojava/nbio/structure/cluster/TestSubunitCluster.java @@ -52,8 +52,8 @@ public class TestSubunitCluster { @Test public void testMergeIdentical() { - // Create an Atom Array of ploy-alanine - List atoms = new ArrayList(10); + // Create an Atom Array of poly-alanine + List atoms = new ArrayList<>(10); for (int i = 0; i < 10; i++) { Group g = new AminoAcidImpl(); g.setPDBName("ALA"); @@ -79,7 +79,7 @@ public void testMergeIdentical() { assertEquals(sc1.length(), 10); // Create an Atom Array of poly-glycine - List atoms2 = new ArrayList(10); + List atoms2 = new ArrayList<>(10); for (int i = 0; i < 10; i++) { Group g = new AminoAcidImpl(); g.setPDBName("GLY"); @@ -112,7 +112,7 @@ public void testMergeIdentical() { public void testMergeSequence() throws CompoundNotFoundException { // Create an Atom Array of ploy-alanine - List atoms = new ArrayList(100); + List atoms = new ArrayList<>(100); for (int i = 0; i < 100; i++) { Group g = new AminoAcidImpl(); g.setPDBName("ALA"); @@ -163,7 +163,7 @@ public void testMergeSequence() throws CompoundNotFoundException { assertEquals(sc1.length(), 100); // Create an Atom Array of 9 glycine and 91 alanine - List atoms3 = new ArrayList(100); + List atoms3 = new ArrayList<>(100); for (int i = 0; i < 9; i++) { Group g = new AminoAcidImpl(); g.setPDBName("GLY");