Skip to content

Commit be50915

Browse files
authored
Merge pull request #660 from heuermh/read-rna-sequences
Add methods for reading RNA sequences.
2 parents 0678b04 + 0169997 commit be50915

File tree

3 files changed

+230
-0
lines changed

3 files changed

+230
-0
lines changed

biojava-core/src/main/java/org/biojava/nbio/core/sequence/io/FastaReaderHelper.java

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,12 @@
2323

2424
import org.biojava.nbio.core.sequence.DNASequence;
2525
import org.biojava.nbio.core.sequence.ProteinSequence;
26+
import org.biojava.nbio.core.sequence.RNASequence;
2627
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
2728
import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
2829
import org.biojava.nbio.core.sequence.compound.DNACompoundSet;
2930
import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
31+
import org.biojava.nbio.core.sequence.compound.RNACompoundSet;
3032

3133
import java.io.File;
3234
import java.io.FileInputStream;
@@ -68,6 +70,34 @@ public static LinkedHashMap<String, DNASequence> readFastaDNASequence(File file,
6870

6971
}
7072

73+
/**
74+
* Selecting lazySequenceLoad=true will parse the FASTA file and figure out the accessionid and offsets and return sequence objects
75+
* that can in the future read the sequence from the disk. This allows the loading of large fasta files where you are only interested
76+
* in one sequence based on accession id.
77+
* @param file
78+
* @param lazySequenceLoad
79+
* @return
80+
* @throws IOException
81+
*/
82+
public static LinkedHashMap<String, RNASequence> readFastaRNASequence(File file, boolean lazySequenceLoad) throws IOException {
83+
if (!lazySequenceLoad) {
84+
return readFastaRNASequence(file);
85+
}
86+
87+
FastaReader<RNASequence, NucleotideCompound> fastaProxyReader =
88+
new FastaReader<RNASequence, NucleotideCompound>(
89+
file,
90+
new GenericFastaHeaderParser<RNASequence, NucleotideCompound>(),
91+
new FileProxyRNASequenceCreator(
92+
file,
93+
RNACompoundSet.getRNACompoundSet(),
94+
new FastaSequenceParser()
95+
)
96+
);
97+
return fastaProxyReader.process();
98+
99+
}
100+
71101
/**
72102
* Read a fasta file containing amino acids with setup that would handle most
73103
* cases.
@@ -130,6 +160,35 @@ public static LinkedHashMap<String, DNASequence> readFastaDNASequence(
130160
return dnaSequences;
131161
}
132162

163+
/**
164+
* Read a fasta RNA sequence
165+
* @param inStream
166+
* @return
167+
* @throws IOException
168+
*/
169+
public static LinkedHashMap<String, RNASequence> readFastaRNASequence(
170+
InputStream inStream) throws IOException {
171+
FastaReader<RNASequence, NucleotideCompound> fastaReader = new FastaReader<RNASequence, NucleotideCompound>(
172+
inStream,
173+
new GenericFastaHeaderParser<RNASequence, NucleotideCompound>(),
174+
new RNASequenceCreator(RNACompoundSet.getRNACompoundSet()));
175+
return fastaReader.process();
176+
}
177+
178+
/**
179+
*
180+
* @param file
181+
* @return
182+
* @throws IOException
183+
*/
184+
public static LinkedHashMap<String, RNASequence> readFastaRNASequence(
185+
File file) throws IOException {
186+
FileInputStream inStream = new FileInputStream(file);
187+
LinkedHashMap<String, RNASequence> rnaSequences = readFastaRNASequence(inStream);
188+
inStream.close();
189+
return rnaSequences;
190+
}
191+
133192
public static void main(String[] args) throws Exception {
134193

135194
LinkedHashMap<String, DNASequence> dnaSequences = FastaReaderHelper.readFastaDNASequence(new File("fasta.fna"));
Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
/*
2+
* BioJava development code
3+
*
4+
* This code may be freely distributed and modified under the
5+
* terms of the GNU Lesser General Public Licence. This should
6+
* be distributed with the code. If you do not have a copy,
7+
* see:
8+
*
9+
* http://www.gnu.org/copyleft/lesser.html
10+
*
11+
* Copyright for this code is held jointly by the individual
12+
* authors. These should be listed in @author doc comments.
13+
*
14+
* For more information on the BioJava project and its aims,
15+
* or to join the biojava-l mailing list, visit the home page
16+
* at:
17+
*
18+
* http://www.biojava.org/
19+
*
20+
* Created on 01-21-2010
21+
*/
22+
package org.biojava.nbio.core.sequence.io;
23+
24+
import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
25+
import org.biojava.nbio.core.sequence.RNASequence;
26+
import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
27+
import org.biojava.nbio.core.sequence.io.template.SequenceCreatorInterface;
28+
import org.biojava.nbio.core.sequence.io.template.SequenceParserInterface;
29+
import org.biojava.nbio.core.sequence.loader.SequenceFileProxyLoader;
30+
import org.biojava.nbio.core.sequence.template.AbstractSequence;
31+
import org.biojava.nbio.core.sequence.template.CompoundSet;
32+
import org.biojava.nbio.core.sequence.template.ProxySequenceReader;
33+
34+
import java.io.File;
35+
import java.io.IOException;
36+
import java.util.List;
37+
38+
/**
39+
* This class is a good example of using the SequenceCreatorInterface where during parsing of the stream
40+
* the sequence and the offset index are passed to create a Protein sequence that will be loaded in lazily.
41+
* This way you can load very large fasta files and store accession id and delay loading the sequence to save
42+
* memory. The index is the file stream offset so when a RNASequence has a call to getSequence() the
43+
* SequenceFileProxyLoader will open the file and offset to the index and retrieve the sequence.
44+
*
45+
* Same approach can be used for genome sequence data stored in a local fasta file, in a database or via http
46+
* interface to a remote server
47+
*
48+
* @author Scooter Willis <willishf at gmail dot com>
49+
*/
50+
public class FileProxyRNASequenceCreator implements
51+
SequenceCreatorInterface<NucleotideCompound> {
52+
53+
CompoundSet<NucleotideCompound> compoundSet = null;
54+
File file = null;
55+
SequenceParserInterface sequenceParser;
56+
57+
/**
58+
* Need File so that we can store full path name in SequenceFileProxyLoader for Random File access as a quick read
59+
* @param fastaFile
60+
* @param compoundSet
61+
*/
62+
public FileProxyRNASequenceCreator(File file,
63+
CompoundSet<NucleotideCompound> compoundSet,
64+
SequenceParserInterface sequenceParser) {
65+
this.compoundSet = compoundSet;
66+
this.file = file;
67+
this.sequenceParser = sequenceParser;
68+
}
69+
70+
/**
71+
* Even though we are passing in the sequence we really only care about the length of the sequence and the offset
72+
* index in the fasta file.
73+
* @param sequence
74+
* @param index
75+
* @return
76+
* @throws CompoundNotFoundException
77+
* @throws IOException
78+
*/
79+
@Override
80+
public AbstractSequence<NucleotideCompound> getSequence(String sequence, long index ) throws CompoundNotFoundException, IOException {
81+
SequenceFileProxyLoader<NucleotideCompound> sequenceFileProxyLoader = new SequenceFileProxyLoader<NucleotideCompound>(
82+
file,
83+
sequenceParser,
84+
index,
85+
sequence.length(),
86+
compoundSet);
87+
return new RNASequence(sequenceFileProxyLoader, compoundSet);
88+
}
89+
90+
/**
91+
* Should be able to extend the same concept to a remote URL call or database connection. Not supported yet
92+
* @param proxyLoader
93+
* @param index
94+
* @return
95+
*/
96+
@Override
97+
public AbstractSequence<NucleotideCompound> getSequence(
98+
ProxySequenceReader<NucleotideCompound> proxyLoader, long index) {
99+
throw new UnsupportedOperationException("Not supported yet.");
100+
}
101+
102+
/**
103+
* Not sure of use case and currently not supported
104+
* @param list
105+
* @return
106+
*/
107+
@Override
108+
public AbstractSequence<NucleotideCompound> getSequence(
109+
List<NucleotideCompound> list) {
110+
throw new UnsupportedOperationException("Not supported yet.");
111+
}
112+
}

biojava-core/src/main/java/org/biojava/nbio/core/sequence/io/GenbankReaderHelper.java

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,12 @@
2323

2424
import org.biojava.nbio.core.sequence.DNASequence;
2525
import org.biojava.nbio.core.sequence.ProteinSequence;
26+
import org.biojava.nbio.core.sequence.RNASequence;
2627
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
2728
import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
2829
import org.biojava.nbio.core.sequence.compound.DNACompoundSet;
2930
import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
31+
import org.biojava.nbio.core.sequence.compound.RNACompoundSet;
3032
import org.biojava.nbio.core.sequence.template.AbstractSequence;
3133
import org.slf4j.Logger;
3234
import org.slf4j.LoggerFactory;
@@ -99,6 +101,35 @@ public static LinkedHashMap<String, ProteinSequence> readGenbankProteinSequence(
99101
return GenbankProxyReader.process();
100102

101103
}
104+
105+
/**
106+
* Selecting lazySequenceLoad=true will parse the Genbank file and figure out the accessionid and offsets and return sequence objects
107+
* that can in the future read the sequence from the disk. This allows the loading of large Genbank files where you are only interested
108+
* in one sequence based on accession id.
109+
* @param file
110+
* @param lazySequenceLoad
111+
* @return
112+
* @throws Exception
113+
*/
114+
public static LinkedHashMap<String, RNASequence> readGenbankRNASequence(File file, boolean lazySequenceLoad) throws Exception {
115+
if (!lazySequenceLoad) {
116+
return readGenbankRNASequence(file);
117+
}
118+
119+
GenbankReader<RNASequence, NucleotideCompound> GenbankProxyReader =
120+
new GenbankReader<RNASequence, NucleotideCompound>(
121+
file,
122+
new GenericGenbankHeaderParser<RNASequence, NucleotideCompound>(),
123+
new FileProxyRNASequenceCreator(
124+
file,
125+
RNACompoundSet.getRNACompoundSet(),
126+
new GenbankSequenceParser<AbstractSequence<NucleotideCompound>, NucleotideCompound>()
127+
)
128+
);
129+
return GenbankProxyReader.process();
130+
131+
}
132+
102133
/**
103134
* Read a Genbank file containing amino acids with setup that would handle most
104135
* cases.
@@ -160,6 +191,34 @@ public static LinkedHashMap<String, DNASequence> readGenbankDNASequence(
160191
inStream.close();
161192
return dnaSequences;
162193
}
194+
/**
195+
* Read a Genbank RNA sequence
196+
* @param inStream
197+
* @return
198+
* @throws Exception
199+
*/
200+
public static LinkedHashMap<String, RNASequence> readGenbankRNASequence(
201+
InputStream inStream) throws Exception {
202+
GenbankReader<RNASequence, NucleotideCompound> GenbankReader = new GenbankReader<RNASequence, NucleotideCompound>(
203+
inStream,
204+
new GenericGenbankHeaderParser<RNASequence, NucleotideCompound>(),
205+
new RNASequenceCreator(RNACompoundSet.getRNACompoundSet()));
206+
return GenbankReader.process();
207+
}
208+
209+
/**
210+
*
211+
* @param file
212+
* @return
213+
* @throws Exception
214+
*/
215+
public static LinkedHashMap<String, RNASequence> readGenbankRNASequence(
216+
File file) throws Exception {
217+
FileInputStream inStream = new FileInputStream(file);
218+
LinkedHashMap<String, RNASequence> rnaSequences = readGenbankRNASequence(inStream);
219+
inStream.close();
220+
return rnaSequences;
221+
}
163222

164223
public static void main(String[] args) throws Exception {
165224

0 commit comments

Comments
 (0)