Skip to content

Commit 091136e

Browse files
authored
Merge pull request #859 from josemduarte/optimise-sym-mates-clusterer
Fixes on optimization of subunit clusterer for quaternary sym detection of PDB deposited structures
2 parents 5840f24 + f779dc3 commit 091136e

File tree

8 files changed

+363
-119
lines changed

8 files changed

+363
-119
lines changed

biojava-integrationtest/src/test/java/org/biojava/nbio/structure/test/symmetry/TestQuatSymmetryDetectorExamples.java

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,14 +28,19 @@
2828
import org.biojava.nbio.structure.Structure;
2929
import org.biojava.nbio.structure.StructureException;
3030
import org.biojava.nbio.structure.StructureIO;
31+
import org.biojava.nbio.structure.StructureTools;
3132
import org.biojava.nbio.structure.align.util.AtomCache;
3233
import org.biojava.nbio.structure.cluster.SubunitClusterer;
3334
import org.biojava.nbio.structure.cluster.SubunitClustererMethod;
3435
import org.biojava.nbio.structure.cluster.SubunitClustererParameters;
36+
import org.biojava.nbio.structure.io.FileParsingParameters;
37+
import org.biojava.nbio.structure.quaternary.BiologicalAssemblyBuilder;
38+
import org.biojava.nbio.structure.quaternary.BiologicalAssemblyTransformation;
3539
import org.biojava.nbio.structure.symmetry.core.QuatSymmetryDetector;
3640
import org.biojava.nbio.structure.symmetry.core.QuatSymmetryParameters;
3741
import org.biojava.nbio.structure.symmetry.core.QuatSymmetryResults;
3842
import org.biojava.nbio.structure.symmetry.core.Stoichiometry;
43+
import org.junit.Ignore;
3944
import org.junit.Test;
4045
import org.slf4j.Logger;
4146
import org.slf4j.LoggerFactory;
@@ -366,4 +371,67 @@ public void testPseudoIdentity95() throws IOException, StructureException {
366371
assertEquals(SubunitClustererMethod.SEQUENCE, symmetry.getSubunitClusters().get(0).getClustererMethod());
367372

368373
}
374+
375+
@Test
376+
public void testSymDetectionWithClusteringByEntityId() throws IOException, StructureException {
377+
AtomCache cache = new AtomCache();
378+
cache.setUseMmtf(false);
379+
cache.setUseMmCif(true);
380+
FileParsingParameters params = new FileParsingParameters();
381+
params.setAlignSeqRes(true);
382+
cache.setFileParsingParams(params);
383+
StructureIO.setAtomCache(cache);
384+
Structure pdb = StructureIO.getStructure("BIO:1SMT:1");
385+
386+
SubunitClustererParameters cp = new SubunitClustererParameters();
387+
cp.setUseEntityIdForSeqIdentityDetermination(true);
388+
cp.setClustererMethod(SubunitClustererMethod.SEQUENCE);
389+
QuatSymmetryParameters symmParams = new QuatSymmetryParameters();
390+
QuatSymmetryResults symmetry = QuatSymmetryDetector.calcGlobalSymmetry(
391+
pdb, symmParams, cp);
392+
393+
// C2 symmetry, A2 stoichiometry
394+
assertEquals("C2", symmetry.getSymmetry());
395+
assertEquals("A2", symmetry.getStoichiometry().toString());
396+
}
397+
398+
/**
399+
* A performance test that demonstrates how the SubunitClustererParameters.setUseEntityIdForSeqIdentityDetermination()
400+
* has a dramatic effect in runtime versus doing alignments.
401+
*/
402+
@Ignore("This is a performance test to be run manually")
403+
@Test
404+
public void testSymDetectionPerformanceLargeCapsid() throws IOException, StructureException {
405+
AtomCache cache = new AtomCache();
406+
cache.setUseMmtf(false);
407+
cache.setUseMmCif(true);
408+
FileParsingParameters params = new FileParsingParameters();
409+
params.setAlignSeqRes(true);
410+
params.setParseBioAssembly(true);
411+
cache.setFileParsingParams(params);
412+
StructureIO.setAtomCache(cache);
413+
414+
// making sure we remove all atoms but representative before we expand, otherwise memory requirements are huge
415+
// 6Q1F is another good example
416+
Structure au = StructureIO.getStructure("6NHJ");
417+
StructureTools.reduceToRepresentativeAtoms(au);
418+
BiologicalAssemblyBuilder builder = new BiologicalAssemblyBuilder();
419+
List<BiologicalAssemblyTransformation> transforms = au.getPDBHeader().getBioAssemblies().get(1).getTransforms();
420+
Structure pdb = builder.rebuildQuaternaryStructure(au, transforms, true, false);
421+
422+
SubunitClustererParameters cp = new SubunitClustererParameters();
423+
424+
// This is the parameter that makes this fast, set it to false to see the difference.
425+
// As of git commit ed322e387cd46344a7864a, the difference in runtime is not that huge:
426+
// 2 minutes with true, 10 minutes with false. I observed a much larger difference before, but can't reproduce anymore - JD 2020-01-23
427+
cp.setUseEntityIdForSeqIdentityDetermination(true);
428+
429+
cp.setClustererMethod(SubunitClustererMethod.SEQUENCE);
430+
QuatSymmetryParameters symmParams = new QuatSymmetryParameters();
431+
QuatSymmetryResults symmetry = QuatSymmetryDetector.calcGlobalSymmetry(
432+
pdb, symmParams, cp);
433+
434+
assertEquals("I", symmetry.getSymmetry());
435+
assertEquals("A960B960C600D480E300", symmetry.getStoichiometry().toString());
436+
}
369437
}

biojava-structure/src/main/java/org/biojava/nbio/structure/EntityInfo.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -306,12 +306,12 @@ public List<String> getChainIds() {
306306
* used and when all chains within the entity are numbered in the same way), but
307307
* in general they will be neither unique (because of insertion codes) nor aligned.
308308
* </p>
309-
* @param g
310-
* @param c
309+
* @param g the group
310+
* @param c the chain
311311
* @return the aligned residue index (1 to n), if no SEQRES groups are available at all then {@link ResidueNumber#getSeqNum()}
312312
* is returned as a fall-back, if the group is not found in the SEQRES groups then -1 is returned
313313
* for the given group and chain
314-
* @throws IllegalArgumentException if the given Chain is not a member of this EnityInfo
314+
* @throws IllegalArgumentException if the given Chain is not a member of this EntityInfo
315315
* @see Chain#getSeqResGroup(int)
316316
*/
317317
public int getAlignedResIndex(Group g, Chain c) {

biojava-structure/src/main/java/org/biojava/nbio/structure/StructureTools.java

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1264,7 +1264,7 @@ public static final Character get1LetterCode(String groupCode3) {
12641264
* 3-character code for a group.
12651265
*
12661266
*/
1267-
public static final boolean isNucleotide(String groupCode3) {
1267+
public static boolean isNucleotide(String groupCode3) {
12681268
String code = groupCode3.trim();
12691269
return nucleotides30.containsKey(code)
12701270
|| nucleotides23.containsKey(code);
@@ -1283,7 +1283,7 @@ public static final boolean isNucleotide(String groupCode3) {
12831283
* @deprecated Use {@link StructureIdentifier#reduce(Structure)} instead (v. 4.2.0)
12841284
*/
12851285
@Deprecated
1286-
public static final Structure getReducedStructure(Structure s,
1286+
public static Structure getReducedStructure(Structure s,
12871287
String chainId) throws StructureException {
12881288
// since we deal here with structure alignments,
12891289
// only use Model 1...
@@ -1338,7 +1338,7 @@ public static final Structure getReducedStructure(Structure s,
13381338
return newS;
13391339
}
13401340

1341-
public static final String convertAtomsToSeq(Atom[] atoms) {
1341+
public static String convertAtomsToSeq(Atom[] atoms) {
13421342

13431343
StringBuilder buf = new StringBuilder();
13441344
Group prevGroup = null;
@@ -1374,7 +1374,7 @@ public static final String convertAtomsToSeq(Atom[] atoms) {
13741374
* @throws StructureException
13751375
* if the group cannot be found.
13761376
*/
1377-
public static final Group getGroupByPDBResidueNumber(Structure struc,
1377+
public static Group getGroupByPDBResidueNumber(Structure struc,
13781378
ResidueNumber pdbResNum) throws StructureException {
13791379
if (struc == null || pdbResNum == null) {
13801380
throw new IllegalArgumentException("Null argument(s).");
@@ -1447,7 +1447,7 @@ public static AtomContactSet getAtomsInContact(Chain chain, double cutoff) {
14471447
* @param chain
14481448
* @param cutoff
14491449
* @return
1450-
* @see {@link #getRepresentativeAtomsInContact(Chain, double)}
1450+
* @see #getRepresentativeAtomsInContact(Chain, double)
14511451
*/
14521452
public static AtomContactSet getAtomsCAInContact(Chain chain, double cutoff) {
14531453
Grid grid = new Grid(cutoff);
@@ -1921,4 +1921,24 @@ private static String replaceFirstChar(String name, char c, char d) {
19211921
return name;
19221922
}
19231923

1924+
/**
1925+
* Remove all atoms but the representative atoms (C alphas or phosphates) from the given structure.
1926+
* @param structure the structure
1927+
* @since 5.4.0
1928+
*/
1929+
public static void reduceToRepresentativeAtoms(Structure structure) {
1930+
for (int modelIdx = 0; modelIdx<structure.nrModels(); modelIdx++) {
1931+
for (Chain c : structure.getPolyChains(modelIdx)) {
1932+
for (Group g : c.getAtomGroups()) {
1933+
List<Atom> atoms = g.getAtoms();
1934+
if (g.isAminoAcid()) {
1935+
atoms.removeIf(a->!a.getName().equals(CA_ATOM_NAME));
1936+
} else if (g.isNucleotide()) {
1937+
atoms.removeIf(a->!a.getName().equals(NUCLEOTIDE_REPRESENTATIVE));
1938+
}
1939+
// else we keep all other atoms. We are concerned only about aminoacids and nucleotides that make up the bulk of the structures
1940+
}
1941+
}
1942+
}
1943+
}
19241944
}

biojava-structure/src/main/java/org/biojava/nbio/structure/cluster/SubunitCluster.java

Lines changed: 76 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@
3232
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
3333
import org.biojava.nbio.structure.Atom;
3434
import org.biojava.nbio.structure.Chain;
35+
import org.biojava.nbio.structure.EntityInfo;
36+
import org.biojava.nbio.structure.Group;
3537
import org.biojava.nbio.structure.Structure;
3638
import org.biojava.nbio.structure.StructureException;
3739
import org.biojava.nbio.structure.align.StructureAlignment;
@@ -190,22 +192,35 @@ public boolean isIdenticalTo(SubunitCluster other) {
190192
* @return true if the SubunitClusters are identical, false otherwise
191193
*/
192194
public boolean isIdenticalByEntityIdTo(SubunitCluster other) {
193-
Structure thisStruct = this.subunits.get(this.representative).getStructure();
194-
Structure otherStruct = other.subunits.get(other.representative).getStructure();
195-
String thisName = this.subunits.get(this.representative).getName();
196-
String otherName = other.subunits.get(this.representative).getName();
195+
Subunit thisSub = this.subunits.get(this.representative);
196+
Subunit otherSub = other.subunits.get(other.representative);
197+
String thisName = thisSub.getName();
198+
String otherName = otherSub.getName();
199+
200+
Structure thisStruct = thisSub.getStructure();
201+
Structure otherStruct = otherSub.getStructure();
202+
if (thisStruct == null || otherStruct == null) {
203+
logger.info("SubunitClusters {}-{} have no referenced structures. Ignoring identity check by entity id",
204+
thisName,
205+
otherName);
206+
return false;
207+
}
208+
if (thisStruct != otherStruct) {
209+
// different object references: will not cluster even if entity id is same
210+
return false;
211+
}
197212
Chain thisChain = thisStruct.getChain(thisName);
198213
Chain otherChain = otherStruct.getChain(otherName);
199214
if (thisChain == null || otherChain == null) {
200215
logger.info("Can't determine entity ids of SubunitClusters {}-{}. Ignoring identity check by entity id",
201-
this.subunits.get(this.representative).getName(),
202-
other.subunits.get(other.representative).getName());
216+
thisName,
217+
otherName);
203218
return false;
204219
}
205220
if (thisChain.getEntityInfo() == null || otherChain.getEntityInfo() == null) {
206221
logger.info("Can't determine entity ids of SubunitClusters {}-{}. Ignoring identity check by entity id",
207-
this.subunits.get(this.representative).getName(),
208-
other.subunits.get(other.representative).getName());
222+
thisName,
223+
otherName);
209224
return false;
210225
}
211226
int thisEntityId = thisChain.getEntityInfo().getMolId();
@@ -241,7 +256,7 @@ public boolean mergeIdentical(SubunitCluster other) {
241256
* same Subunit. This is checked by comparing the entity identifiers of the subunits
242257
* if one can be found.
243258
* Thus this only makes sense when the subunits are complete chains of a
244-
* deposited PDB entry. I
259+
* deposited PDB entry.
245260
*
246261
* @param other
247262
* SubunitCluster
@@ -252,12 +267,59 @@ public boolean mergeIdenticalByEntityId(SubunitCluster other) {
252267
if (!isIdenticalByEntityIdTo(other))
253268
return false;
254269

270+
Subunit thisSub = this.subunits.get(this.representative);
271+
Subunit otherSub = other.subunits.get(other.representative);
272+
String thisName = thisSub.getName();
273+
String otherName = otherSub.getName();
274+
255275
logger.info("SubunitClusters {}-{} belong to same entity. Assuming they are identical",
256-
this.subunits.get(this.representative).getName(),
257-
other.subunits.get(other.representative).getName());
276+
thisName,
277+
otherName);
258278

259-
this.subunits.addAll(other.subunits);
260-
this.subunitEQR.addAll(other.subunitEQR);
279+
List<Integer> thisAligned = new ArrayList<>();
280+
List<Integer> otherAligned = new ArrayList<>();
281+
282+
// we've merged by entity id, we can assume structure, chain and entity are available (checked in isIdenticalByEntityIdTo())
283+
Structure thisStruct = thisSub.getStructure();
284+
Structure otherStruct = otherSub.getStructure();
285+
Chain thisChain = thisStruct.getChain(thisName);
286+
Chain otherChain = otherStruct.getChain(otherName);
287+
EntityInfo entityInfo = thisChain.getEntityInfo();
288+
289+
// Extract the aligned residues of both Subunits
290+
for (int thisIndex=0; thisIndex < thisSub.size(); thisIndex++) {
291+
292+
Group g = thisSub.getRepresentativeAtoms()[thisIndex].getGroup();
293+
294+
int seqresIndex = entityInfo.getAlignedResIndex(g, thisChain);
295+
296+
if (seqresIndex == -1) {
297+
// this might mean that FileParsingParameters.setAlignSeqRes() wasn't set to true during parsing
298+
continue;
299+
}
300+
301+
// note the seqresindex is 1-based
302+
Group otherG = otherChain.getSeqResGroups().get(seqresIndex - 1);
303+
304+
int otherIndex = otherChain.getAtomGroups().indexOf(otherG);
305+
if (otherIndex == -1) {
306+
// skip residues that are unobserved in other sequence ("gaps" in the entity SEQRES alignment)
307+
continue;
308+
}
309+
310+
// Only consider residues that are part of the SubunitCluster
311+
if (this.subunitEQR.get(this.representative).contains(thisIndex)
312+
&& other.subunitEQR.get(other.representative).contains(otherIndex)) {
313+
thisAligned.add(thisIndex);
314+
otherAligned.add(otherIndex);
315+
}
316+
}
317+
318+
if (thisAligned.size() == 0 && otherAligned.size() == 0) {
319+
logger.warn("No equivalent aligned atoms found between SubunitClusters {}-{} via entity SEQRES alignment. Is FileParsingParameters.setAlignSeqRes() set?", thisName, otherName);
320+
}
321+
322+
updateEquivResidues(other, thisAligned, otherAligned);
261323

262324
return true;
263325
}
@@ -690,7 +752,7 @@ public SubunitClustererMethod getClustererMethod() {
690752
*/
691753
public List<Atom[]> getAlignedAtomsSubunits() {
692754

693-
List<Atom[]> alignedAtoms = Collections.emptyList();
755+
List<Atom[]> alignedAtoms = new ArrayList<>();
694756

695757
// Loop through all subunits and add the aligned positions
696758
for (int s = 0; s < subunits.size(); s++)

biojava-structure/src/main/java/org/biojava/nbio/structure/cluster/SubunitClustererParameters.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -511,7 +511,8 @@ public boolean isHighConfidenceScores(double sequenceIdentity, double sequenceCo
511511
/**
512512
* Whether to use the entity id of subunits to infer that sequences are identical.
513513
* Only applies if the {@link SubunitClustererMethod} is a sequence based one.
514-
* @return
514+
* @return the flag
515+
* @since 5.4.0
515516
*/
516517
public boolean isUseEntityIdForSeqIdentityDetermination() {
517518
return useEntityIdForSeqIdentityDetermination;
@@ -520,7 +521,10 @@ public boolean isUseEntityIdForSeqIdentityDetermination() {
520521
/**
521522
* Whether to use the entity id of subunits to infer that sequences are identical.
522523
* Only applies if the {@link SubunitClustererMethod} is a sequence based one.
524+
* Note this requires {@link org.biojava.nbio.structure.io.FileParsingParameters#setAlignSeqRes(boolean)} to be
525+
* set to true.
523526
* @param useEntityIdForSeqIdentityDetermination the flag to be set
527+
* @since 5.4.0
524528
*/
525529
public void setUseEntityIdForSeqIdentityDetermination(boolean useEntityIdForSeqIdentityDetermination) {
526530
this.useEntityIdForSeqIdentityDetermination = useEntityIdForSeqIdentityDetermination;

biojava-structure/src/main/java/org/biojava/nbio/structure/symmetry/core/C2RotationSolver.java

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,8 @@
3939
* @author Peter
4040
*/
4141
public class C2RotationSolver implements QuatSymmetrySolver {
42-
private QuatSymmetrySubunits subunits = null;
43-
private QuatSymmetryParameters parameters = null;
42+
private QuatSymmetrySubunits subunits;
43+
private QuatSymmetryParameters parameters;
4444
private Vector3d centroid = new Vector3d();
4545
private Matrix4d centroidInverse = new Matrix4d();
4646

@@ -132,7 +132,7 @@ private void solve() {
132132
}
133133

134134
private void addEOperation() {
135-
List<Integer> permutation = Arrays.asList(new Integer[]{0,1});
135+
List<Integer> permutation = Arrays.asList(0,1);
136136
Matrix4d transformation = new Matrix4d();
137137
transformation.setIdentity();
138138
combineWithTranslation(transformation);
@@ -145,7 +145,6 @@ private void addEOperation() {
145145

146146
/**
147147
* Adds translational component to rotation matrix
148-
* @param rotTrans
149148
* @param rotation
150149
* @return
151150
*/

0 commit comments

Comments
 (0)