Fixing a few bugs that arose after loadChemCompInfo removal, biojava#426

josemduarte · josemduarte · commit 8da9d07145fe · 2016-03-08T14:38:56.000-08:00
All tests pass now
diff --git a/biojava-alignment/src/main/java/org/biojava/nbio/alignment/Alignments.java b/biojava-alignment/src/main/java/org/biojava/nbio/alignment/Alignments.java
@@ -321,7 +321,6 @@ public static <S extends Sequence<C>, C extends Compound> PairwiseSequenceAligne
             S query, S target, PairwiseSequenceAlignerType type, GapPenalty gapPenalty,
             SubstitutionMatrix<C> subMatrix) {
     	if (!query.getCompoundSet().equals(target.getCompoundSet())) {
-    		System.err.println(query.getCompoundSet().getClass().getName() + " != " + target.getCompoundSet().getClass().getName());
     		throw new IllegalArgumentException("Sequence compound sets must be the same");
     	}
         switch (type) {
diff --git a/biojava-integrationtest/src/test/java/org/biojava/nbio/structure/test/io/TestLongPdbVsMmCifParsing.java b/biojava-integrationtest/src/test/java/org/biojava/nbio/structure/test/io/TestLongPdbVsMmCifParsing.java
@@ -124,7 +124,7 @@ public void testVeryLongPdbVsMmCif() throws IOException, StructureException {
 	
 	@Test
 	public void testSingle() throws IOException, StructureException {
-		testAll(Arrays.asList("1jnv"));
+		testAll(Arrays.asList("1bcr"));
 	}
 	
 	@After
@@ -429,6 +429,12 @@ private void testSingleChain(Chain cPdb, Chain cCif) {
 
 			
 		assertEquals("failed for getAtomLength (chain "+chainId+"):",cPdb.getAtomLength(),cCif.getAtomLength());
+		
+		// entries with polymers composed entirely of unknown residues (giving only-X sequences) can't be aligned seqres-to-atom (for PDB files)
+		// we have to skip them because they won't have seqres groups
+		// an example is 1jnv chain A
+		
+		if (cPdb.getAtomSequence().matches("^X+$")) return;
 
 		// note for getSeqResLength to work one needs the setAlignSeqRes option in the parsers
 		
diff --git a/biojava-integrationtest/src/test/java/org/biojava/nbio/structure/test/io/TestSeqResParsing.java b/biojava-integrationtest/src/test/java/org/biojava/nbio/structure/test/io/TestSeqResParsing.java
@@ -24,57 +24,56 @@
 package org.biojava.nbio.structure.test.io;
 
 
-import junit.framework.TestCase;
+import java.io.IOException;
+
 import org.biojava.nbio.structure.AminoAcid;
 import org.biojava.nbio.structure.Chain;
 import org.biojava.nbio.structure.Group;
 import org.biojava.nbio.structure.Structure;
+import org.biojava.nbio.structure.StructureException;
 import org.biojava.nbio.structure.align.util.AtomCache;
+import org.junit.Test;
 import org.biojava.nbio.structure.StructureIO;
 
-public class TestSeqResParsing extends TestCase {
+import static org.junit.Assert.*;
+
+public class TestSeqResParsing {
 
-	public void test11GS(){
+	@Test
+	public void test11GS() throws IOException, StructureException{
 
 		String pdbID = "11GS";
 
 		Structure s;
-		try {
-			AtomCache cache = new AtomCache();
-			cache.getFileParsingParams().setAlignSeqRes(true);
-			
-			StructureIO.setAtomCache(cache);
-			
-			s = StructureIO.getStructure(pdbID);
-			assertNotNull(s);
-			assertTrue(s.getChains().size() > 0);
-			Chain c = s.getChain(0);
-
-			assertTrue(c.getSeqResGroups().size() > 2);
-			
-			Group first  = c.getSeqResGroup(0);
-			Group second = c.getSeqResGroup(1);
-			Group third  = c.getSeqResGroup(2);
-
-			assertTrue(first instanceof AminoAcid);
-			assertTrue(second instanceof AminoAcid);
-			assertTrue(third instanceof AminoAcid);
-			
-			AminoAcid aafirst = (AminoAcid) first;
-			AminoAcid aasecond = (AminoAcid)second;
-			AminoAcid aathird = (AminoAcid) third;
-			
-			assertTrue(aafirst.getRecordType().equals(AminoAcid.SEQRESRECORD));
-			assertTrue(aasecond.getRecordType().equals(AminoAcid.SEQRESRECORD));
-			assertTrue(aathird.getRecordType().equals(AminoAcid.ATOMRECORD));
-		
-
-		} catch (Exception e) {
-
-			e.printStackTrace();
-			fail(e.getMessage());
-		}
-		
+
+		AtomCache cache = new AtomCache();
+		cache.getFileParsingParams().setAlignSeqRes(true);
+
+		StructureIO.setAtomCache(cache);
+
+		s = StructureIO.getStructure(pdbID);
+		assertNotNull(s);
+		assertTrue(s.getChains().size() > 0);
+		Chain c = s.getChain(0);
+
+		assertTrue(c.getSeqResGroups().size() > 2);
+
+		Group first  = c.getSeqResGroup(0);
+		Group second = c.getSeqResGroup(1);
+		Group third  = c.getSeqResGroup(2);
+
+		assertTrue(first instanceof AminoAcid);
+		assertTrue(second instanceof AminoAcid);
+		assertTrue(third instanceof AminoAcid);
+
+		AminoAcid aafirst = (AminoAcid) first;
+		AminoAcid aasecond = (AminoAcid)second;
+		AminoAcid aathird = (AminoAcid) third;
+
+		assertEquals(AminoAcid.SEQRESRECORD, aafirst.getRecordType());
+		assertEquals(AminoAcid.SEQRESRECORD, aasecond.getRecordType());
+		assertEquals(AminoAcid.ATOMRECORD, aathird.getRecordType());
 
 	}
+	
 }
diff --git a/biojava-structure/src/main/java/org/biojava/nbio/structure/io/SeqRes2AtomAligner.java b/biojava-structure/src/main/java/org/biojava/nbio/structure/io/SeqRes2AtomAligner.java
@@ -585,8 +585,9 @@ private boolean alignProteinChains(List<Group> seqRes, List<Group> atomRes) {
 		SequencePair<ProteinSequence, AminoAcidCompound> pair = smithWaterman.getPair();
 
 
-
-		if ( pair == null) {
+		// sequences that consist only of X (e.g. 1jnv chain A) produce empty alignments: nothing can be aligned, so the local alignment is empty
+		// to avoid using those empty alignments we catch them here with pair.getLength()==0
+		if ( pair == null || pair.getLength()==0) { 
 			logger.warn("Could not align protein sequences. ATOM and SEQRES groups will not be aligned.");
 			logger.warn("Sequences: ");
 			logger.warn(seq1);
diff --git a/biojava-structure/src/main/java/org/biojava/nbio/structure/io/mmcif/SimpleMMcifConsumer.java b/biojava-structure/src/main/java/org/biojava/nbio/structure/io/mmcif/SimpleMMcifConsumer.java
@@ -1590,25 +1590,22 @@ public void newStructRefSeqDif(StructRefSeqDif sref) {
 		sequenceDifs.add(sref);
 	}
 
-	private static Chain getChainFromList(List<Chain> chains, String name){
-		for (Chain chain : chains) {
-			if ( chain.getChainID().equals(name)){
+	private Chain getEntityChain(String entity_id){
+		
+		for (Chain chain : entityChains) {
+			if ( chain.getChainID().equals(entity_id)){
 
 				return chain;
 			}
 		}
 		// does not exist yet, so create...
 
 		Chain	chain = new ChainImpl();
-		chain.setChainID(name);
-		chains.add(chain);
+		chain.setChainID(entity_id);
+		entityChains.add(chain);
 
 		return chain;
-	}
-
-	private Chain getEntityChain(String entity_id){
 
-		return getChainFromList(entityChains,entity_id);
 	}
 
 	//private Chain getSeqResChain(String chainID){
@@ -1646,7 +1643,8 @@ public void newEntitySrcSyn(EntitySrcSyn entitySrcSyn){
 		entitySrcSyns.add(entitySrcSyn);
 	}
 
-	/** The EntityPolySeq object provide the amino acid sequence objects for the Entities.
+	/** 
+	 * The EntityPolySeq object provides the amino acid sequence objects for the Entities.
 	 * Later on the entities are mapped to the BioJava Chain and Compound objects.
 	 * @param epolseq the EntityPolySeq record for one amino acid
 	 */
@@ -1671,40 +1669,48 @@ public void newEntityPolySeq(EntityPolySeq epolseq) {
 		Chain entityChain = getEntityChain(epolseq.getEntity_id());
 
 
-		// create group from epolseq;
-		// by default this are the SEQRES records...
-
-
-		if (epolseq.getMon_id().length()==3 && StructureTools.get1LetterCodeAmino(epolseq.getMon_id())!=null){
-			AminoAcid g = new AminoAcidImpl();
-
-			g.setRecordType(AminoAcid.SEQRESRECORD);
-
-			g.setPDBName(epolseq.getMon_id());
-
-			Character code1 = StructureTools.get1LetterCodeAmino(epolseq.getMon_id());
-			g.setAminoType(code1);
+		// first we check through the chemcomp provider, if it fails we do some heuristics to guess the type of group
+		// TODO some of this code is analogous to getNewGroup() and we should try to unify them - JD 2016-03-08
+		
+		Group g = ChemCompGroupFactory.getGroupFromChemCompDictionary(epolseq.getMon_id());
+		//int seqId = Integer.parseInt(epolseq.getNum());
+		if ( g != null && !g.getChemComp().isEmpty()) {
+			if ( g instanceof AminoAcidImpl) {
+				AminoAcidImpl aa = (AminoAcidImpl) g;
+				aa.setRecordType(AminoAcid.SEQRESRECORD);
+				//aa.setId(seqId);
+			} 
+		} else {
 
-			g.setResidueNumber(ResidueNumber.fromString(epolseq.getNum()));
-			// ARGH at this stage we don't know about insertion codes
-			// this has to be obtained from _pdbx_poly_seq_scheme
-			entityChain.addGroup(g);
+			if (epolseq.getMon_id().length()==3 && StructureTools.get1LetterCodeAmino(epolseq.getMon_id())!=null){
+				AminoAcidImpl a = new AminoAcidImpl();
+				a.setRecordType(AminoAcid.SEQRESRECORD);
+				Character code1 = StructureTools.get1LetterCodeAmino(epolseq.getMon_id());
+				a.setAminoType(code1);
+				g = a;
+
+			} else if ( StructureTools.isNucleotide(epolseq.getMon_id())) {
+				// the group is actually a nucleotide group...
+				NucleotideImpl n = new NucleotideImpl();
+				g = n;
+								
+			} else {				
+				logger.debug("Residue {} {} is not a standard aminoacid or nucleotide, will create a het group for it", epolseq.getNum(),epolseq.getMon_id());
+				HetatomImpl h = new HetatomImpl();				
+				g = h;
 
-		} else if ( StructureTools.isNucleotide(epolseq.getMon_id())) {
-			// the group is actually a nucleotide group...
-			NucleotideImpl n = new NucleotideImpl();
+			}
 			
-			n.setResidueNumber(ResidueNumber.fromString(epolseq.getNum()));
-			n.setPDBName(epolseq.getMon_id());
-			entityChain.addGroup(n);				
-		} else {				
-			logger.debug("Residue {} {} is not a standard aminoacid or nucleotide, will create a het group for it", epolseq.getNum(),epolseq.getMon_id());
-			HetatomImpl h = new HetatomImpl();				
-			h.setPDBName(epolseq.getMon_id());
-			h.setResidueNumber(ResidueNumber.fromString(epolseq.getNum()));
-			entityChain.addGroup(h);
 
 		}
+		// at this stage we don't know about author residue numbers (insertion codes)
+		// for now we abuse the ResidueNumber field by storing the internal residue numbers in it (label_seq_id, which are strictly sequential and follow the seqres sequence 1 to n)
+		// later the actual ResidueNumbers (author residue numbers) have to be corrected in alignSeqRes()
+		g.setResidueNumber(ResidueNumber.fromString(epolseq.getNum()));
+		
+		g.setPDBName(epolseq.getMon_id());
+		
+		entityChain.addGroup(g);
 
 	}
 
diff --git a/biojava-structure/src/test/java/org/biojava/nbio/structure/io/TestHeaderOnly.java b/biojava-structure/src/test/java/org/biojava/nbio/structure/io/TestHeaderOnly.java
@@ -23,7 +23,7 @@
 
 public class TestHeaderOnly {
 
-	final String pdbID = "1REP";
+	private final String pdbID = "1REP";
 	
 	/**
 	 * All groups are expected to be empty.

Original file line number	Diff line number	Diff line change
`@@ -321,7 +321,6 @@ public static <S extends Sequence<C>, C extends Compound> PairwiseSequenceAligne`
`321`	`321`	`S query, S target, PairwiseSequenceAlignerType type, GapPenalty gapPenalty,`
`322`	`322`	`SubstitutionMatrix<C> subMatrix) {`
`323`	`323`	`if (!query.getCompoundSet().equals(target.getCompoundSet())) {`
`324`		`- System.err.println(query.getCompoundSet().getClass().getName() + " != " + target.getCompoundSet().getClass().getName());`
`325`	`324`	`throw new IllegalArgumentException("Sequence compound sets must be the same");`
`326`	`325`	`}`
`327`	`326`	`switch (type) {`