Skip to content

Commit 2498254

Browse files
committed
create new simple InsdcParser test
add loading parent DNASequence in factory; change logger levels in GenbankProxySequenceReader; add documentation to tests
1 parent a5c2617 commit 2498254

File tree

7 files changed

+98
-413
lines changed

7 files changed

+98
-413
lines changed

biojava-core/src/main/java/org/biojava/nbio/core/sequence/ProteinSequence.java

Lines changed: 43 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@
4141
import java.io.InputStream;
4242
import java.net.URL;
4343
import java.util.LinkedHashMap;
44+
import java.util.List;
45+
import org.biojava.nbio.core.sequence.features.Qualifier;
4446

4547
/**
4648
* The representation of a ProteinSequence
@@ -50,22 +52,21 @@
5052
*/
5153
public class ProteinSequence extends AbstractSequence<AminoAcidCompound> {
5254

53-
private final static Logger logger = LoggerFactory.getLogger(ProteinSequence.class);
54-
55-
/*
56-
private ArrayList<FeatureInterface<AbstractSequence<AminoAcidCompound>, AminoAcidCompound>> features
57-
= new ArrayList<FeatureInterface<AbstractSequence<AminoAcidCompound>, AminoAcidCompound>>();
58-
private LinkedHashMap<String, ArrayList<FeatureInterface<AbstractSequence<AminoAcidCompound>, AminoAcidCompound>>> groupedFeatures
59-
= new LinkedHashMap<String, ArrayList<FeatureInterface<AbstractSequence<AminoAcidCompound>, AminoAcidCompound>>>();
60-
*/
55+
private final static Logger logger = LoggerFactory.getLogger(ProteinSequence.class);
6156

57+
/*
58+
private ArrayList<FeatureInterface<AbstractSequence<AminoAcidCompound>, AminoAcidCompound>> features
59+
= new ArrayList<FeatureInterface<AbstractSequence<AminoAcidCompound>, AminoAcidCompound>>();
60+
private LinkedHashMap<String, ArrayList<FeatureInterface<AbstractSequence<AminoAcidCompound>, AminoAcidCompound>>> groupedFeatures
61+
= new LinkedHashMap<String, ArrayList<FeatureInterface<AbstractSequence<AminoAcidCompound>, AminoAcidCompound>>>();
62+
*/
6263
/**
6364
* Create a protein from a string
6465
*
6566
* @param seqString
66-
* @throws CompoundNotFoundException
67+
* @throws CompoundNotFoundException
6768
*/
68-
public ProteinSequence(String seqString) throws CompoundNotFoundException {
69+
public ProteinSequence(String seqString) throws CompoundNotFoundException {
6970
this(seqString, AminoAcidCompoundSet.getAminoAcidCompoundSet());
7071
}
7172

@@ -74,7 +75,7 @@ public ProteinSequence(String seqString) throws CompoundNotFoundException {
7475
*
7576
* @param seqString
7677
* @param compoundSet
77-
* @throws CompoundNotFoundException
78+
* @throws CompoundNotFoundException
7879
*/
7980
public ProteinSequence(String seqString, CompoundSet<AminoAcidCompound> compoundSet) throws CompoundNotFoundException {
8081
super(seqString, compoundSet);
@@ -97,9 +98,35 @@ public ProteinSequence(ProxySequenceReader<AminoAcidCompound> proxyLoader) {
9798
* file or via a Uniprot Proxy reader via Uniprot ID
9899
*
99100
* @param proxyLoader
101+
* @param compoundSet
100102
*/
101103
public ProteinSequence(ProxySequenceReader<AminoAcidCompound> proxyLoader, CompoundSet<AminoAcidCompound> compoundSet) {
102104
super(proxyLoader, compoundSet);
105+
106+
// do protein-specific tasks
107+
// add source if found
108+
List<FeatureInterface<AbstractSequence<AminoAcidCompound>, AminoAcidCompound>> CDSFeatures = getFeaturesByType("CDS");
109+
110+
// cases where a protein has more than 1 parent are not supported yet
111+
if (CDSFeatures.size() == 1) {
112+
Qualifier codedBy = CDSFeatures.get(0).getQualifiers().get("coded_by");
113+
114+
if (codedBy != null) {
115+
String codedBySeq = codedBy.getValue();
116+
117+
InsdcParser parser = new InsdcParser(DataSource.GENBANK);
118+
Location location = parser.parse(codedBySeq);
119+
120+
try {
121+
DNASequence dnaSeq = new DNASequence(getSequence(location), DNACompoundSet.getDNACompoundSet());
122+
setParentDNASequence(dnaSeq, location.getStart().getPosition(), location.getEnd().getPosition());
123+
} catch (CompoundNotFoundException e) {
124+
// TODO is there another solution to handle this exception?
125+
logger.error("Could not add 'coded_by' parent DNA location feature, unrecognised compounds found in DNA sequence: {}", e.getMessage());
126+
}
127+
}
128+
}
129+
103130
}
104131

105132
/**
@@ -125,39 +152,12 @@ public void setParentDNASequence(AbstractSequence<NucleotideCompound> parentDNAS
125152
setBioEnd(end);
126153
}
127154

128-
/**
129-
* Add feature.
130-
* <p>
131-
* If feature is type 'coded_by' than resolves parent DNA sequence.
132-
* </p>
133-
* @param feature
134-
*/
135-
@Override
136-
public void addFeature(FeatureInterface<AbstractSequence<AminoAcidCompound>, AminoAcidCompound> feature) {
137-
super.addFeature(feature);
138-
139-
// if feature is called 'coded_by' than add parent DNA location
140-
if (feature.getType().equals("coded_by")) {
141-
InsdcParser parser = new InsdcParser(DataSource.GENBANK);
142-
143-
Location location = parser.parse(feature.getSource());
144-
// convert location into DNASequence
145-
try {
146-
DNASequence dnaSeq = new DNASequence(getSequence(location), DNACompoundSet.getDNACompoundSet());
147-
setParentDNASequence(dnaSeq, location.getStart().getPosition(), location.getEnd().getPosition());
148-
} catch (CompoundNotFoundException e) {
149-
// TODO is there another solution to handle this exception?
150-
logger.error("Could not add 'coded_by' parent DNA location feature, unrecognised compounds found in DNA sequence: {}",e.getMessage());
151-
}
152-
}
153-
}
154-
155155
private DNASequence getRawParentSequence(String accessId) throws IOException {
156156
String seqUrlTemplate = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=%s&rettype=fasta&retmode=text";
157157
URL url = new URL(String.format(seqUrlTemplate, accessId));
158-
159-
logger.info("Getting parent DNA sequence from URL: {}", url.toString());
160-
158+
159+
logger.trace("Getting parent DNA sequence from URL: {}", url.toString());
160+
161161
InputStream is = url.openConnection().getInputStream();
162162

163163
FastaReader<DNASequence, NucleotideCompound> parentReader
@@ -176,14 +176,14 @@ private DNASequence getRawParentSequence(String accessId) throws IOException {
176176
}
177177

178178
private String getSequence(Location cdna) {
179-
DNASequence rawParent = null;
179+
DNASequence rawParent;
180180
if (!cdna.isComplex()) {
181181
try {
182182
rawParent = getRawParentSequence(cdna.getAccession().getID());
183183
return cdna.getSubSequence(rawParent).getSequenceAsString();
184184
} catch (IOException e) {
185185
// return null
186-
logger.error("Caught IOException when getting DNA sequence for id {}. Error: {}", cdna.getAccession().getID(), e.getMessage());
186+
logger.error("Caught IOException when getting DNA sequence for id {}. Error: {}", cdna.getAccession().getID(), e.getMessage());
187187
return null;
188188
}
189189
} else {

biojava-core/src/main/java/org/biojava/nbio/core/sequence/loader/GenbankProxySequenceReader.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ private BufferedInputStream getBufferedInputStream(String accessionID, String db
103103
if (genbankDirectoryCache != null && genbankDirectoryCache.length() > 0) {
104104
File f = new File(genbankDirectoryCache + File.separatorChar + accessionID + ".gb");
105105
if (f.exists()) {
106-
logger.info("Reading: {}", f.toString());
106+
logger.debug("Reading: {}", f.toString());
107107
inStream = new BufferedInputStream(new FileInputStream(f));
108108
} else {
109109
InputStream in = getEutilsInputStream(accessionID, db);
@@ -135,7 +135,7 @@ private void copyInputStreamToFile(InputStream in, File f) throws IOException, I
135135

136136
private InputStream getEutilsInputStream(String accessionID, String db) throws IOException {
137137
String genbankURL = eutilBaseURL + "efetch.fcgi?db=" + db + "&id=" + accessionID + "&rettype=gb&retmode=text";
138-
logger.info("Loading: {}", genbankURL);
138+
logger.trace("Loading: {}", genbankURL);
139139
URL genbank = new URL(genbankURL);
140140
URLConnection genbankConnection = genbank.openConnection();
141141
return genbankConnection.getInputStream();

biojava-core/src/test/java/org/biojava/nbio/core/sequence/loader/GenbankProxySequenceReaderTest.java

Lines changed: 49 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,12 @@
3434
import org.slf4j.LoggerFactory;
3535

3636
import java.io.IOException;
37+
import java.util.ArrayList;
3738
import java.util.Arrays;
3839
import java.util.Collection;
40+
import org.biojava.nbio.core.sequence.features.AbstractFeature;
41+
import org.biojava.nbio.core.sequence.features.Qualifier;
42+
import org.junit.Ignore;
3943

4044
/**
4145
* Testing example for issue #834
@@ -73,19 +77,23 @@ public static Collection<String[]> getExamples() {
7377
}
7478

7579
@Test
76-
public void biojava3() throws IOException, InterruptedException, CompoundNotFoundException {
80+
public void testFeatures() throws IOException, InterruptedException, CompoundNotFoundException {
7781
logger.info("run test for protein: {}", gi);
7882
GenbankProxySequenceReader<AminoAcidCompound> genbankReader
7983
= new GenbankProxySequenceReader<AminoAcidCompound>(System.getProperty("java.io.tmpdir"),
8084
this.gi,
8185
AminoAcidCompoundSet.getAminoAcidCompoundSet());
8286

8387
// why only tests on protein sequences?
84-
ProteinSequence seq = new ProteinSequence(genbankReader, AminoAcidCompoundSet.getAminoAcidCompoundSet());
88+
ProteinSequence seq = new ProteinSequence(genbankReader);
8589

8690
Assert.assertNotNull("protein sequence is null", seq);
91+
92+
/*
93+
parse description from header. There is no separate interface/abstract class for method getHeader()
94+
so it should be done here (manually).
95+
*/
8796
genbankReader.getHeaderParser().parseHeader(genbankReader.getHeader(), seq);
88-
8997
Assert.assertTrue(seq.getDescription() != null);
9098

9199
Assert.assertFalse(seq.getFeaturesKeyWord().getKeyWords().isEmpty());
@@ -104,4 +112,42 @@ public void biojava3() throws IOException, InterruptedException, CompoundNotFoun
104112
logger.info("\t\tcoded_by: {}", codedBy);
105113
}
106114
}
115+
116+
@Test
117+
public void testProteinSequenceFactoring() throws Exception {
118+
logger.info("create protein sequence test for target {}", gi);
119+
120+
GenbankProxySequenceReader<AminoAcidCompound> genbankReader
121+
= new GenbankProxySequenceReader<AminoAcidCompound>(System.getProperty("java.io.tmpdir"),
122+
this.gi,
123+
AminoAcidCompoundSet.getAminoAcidCompoundSet());
124+
125+
ProteinSequence seq = new ProteinSequence(genbankReader);
126+
127+
// if the target protein contains CDS/coded_by then it should contain a parent nucleotide seq
128+
ArrayList<AbstractFeature> CDSs = genbankReader.getFeatures().get("CDS");
129+
130+
if (CDSs != null) {
131+
if (CDSs.size() == 1) {
132+
Qualifier codedBy = (Qualifier) CDSs.get(0).getQualifiers().get("coded_by");
133+
if (codedBy != null) {
134+
135+
AbstractSequence<?> parentSeq = seq.getParentSequence();
136+
Assert.assertNotNull(parentSeq);
137+
138+
/*
139+
Sometimes protein might have many 'parents' with different accessions
140+
so accession is not set.
141+
142+
That test always fails
143+
*/
144+
//Assert.assertTrue(parentSeq.getAccession());
145+
Assert.assertTrue(!parentSeq.getSequenceAsString().isEmpty());
146+
}
147+
}
148+
} else {
149+
logger.info("target {} has no CDS", gi);
150+
}
151+
152+
}
107153
}

biojava-core/src/test/java/org/biojava/nbio/core/sequence/location/LocationParserTest.java

Lines changed: 0 additions & 130 deletions
This file was deleted.

0 commit comments

Comments
 (0)