Skip to content

Commit abe7d98

Browse files
committed
Added process(int max), which tries to parse at most max records from the
open File or InputStream, and leaves the underlying resource open. Subsequent calls to the same method resume parsing where the previous call stopped. This is particularly useful when dealing with very big data files (e.g. the NCBI nr database), which can't fit into memory and would take a long time before the first result is available. Signed-off-by: aalhossary <amr_alhossary@hotmail.com>
1 parent 0fe3705 commit abe7d98

2 files changed

Lines changed: 134 additions & 21 deletions

File tree

biojava3-core/src/main/java/org/biojava3/core/sequence/io/FastaReader.java

Lines changed: 77 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import java.io.IOException;
2828
import java.io.InputStream;
2929
import java.io.InputStreamReader;
30+
import java.util.HashMap;
3031
import java.util.LinkedHashMap;
3132

3233
import org.biojava3.core.sequence.ProteinSequence;
@@ -40,7 +41,7 @@
4041
/**
4142
* Use FastaReaderHelper as an example of how to use this class where FastaReaderHelper should be the
4243
* primary class used to read Fasta files
43-
* @author Scooter Willis <willishf at gmail dot com>
44+
* @author Scooter Willis &lt;willishf at gmail dot com&gt;
4445
*/
4546
public class FastaReader<S extends Sequence<?>, C extends Compound> {
4647

@@ -49,7 +50,11 @@ public class FastaReader<S extends Sequence<?>, C extends Compound> {
4950
BufferedReaderBytesRead br;
5051
InputStreamReader isr;
5152
FileInputStream fi = null;
52-
53+
long fileIndex = 0;
54+
long sequenceIndex = 0;
55+
String line = "";
56+
String header= "";
57+
5358
/**
5459
* If you are going to use FileProxyProteinSequenceCreator then do not use this constructor because we need details about
5560
* local file offsets for quick reads. An InputStream does not give you the name of the stream to access quickly via file seek. A seek in
@@ -67,7 +72,8 @@ public FastaReader(InputStream is, FastaHeaderParserInterface<S,C> headerParser,
6772
}
6873

6974
/**
70-
* If you are going to use the FileProxyProteinSequenceCreator then you need to use this constructor because we need details about
75+
* If you are going to use the FileProxyProteinSequenceCreator then you
76+
* need to use this constructor because we need details about
7177
* the location of the file.
7278
* @param file
7379
* @param headerParser
@@ -88,35 +94,74 @@ public FastaReader(File file, FastaHeaderParserInterface<S,C> headerParser,
8894
}
8995

9096
/**
91-
* The parsing is done in this method
92-
* @return
97+
* The parsing is done in this method.<br>
98+
* This method tries to process all the available fasta records
99+
* in the File or InputStream, closes the underlying resource,
100+
* and returns the results in a {@link LinkedHashMap}.<br>
101+
* You don't need to call {@link #close()} after calling this method.
102+
* @see #process(int)
103+
* @return {@link LinkedHashMap} containing all the parsed fasta records
104+
* present, starting current fileIndex onwards.
93105
* @throws IOException if an error occurs reading the input file
94106
*/
95-
@SuppressWarnings("unchecked")
96107
public LinkedHashMap<String,S> process() throws IOException {
97-
LinkedHashMap<String,S> sequences = new LinkedHashMap<String,S>();
108+
LinkedHashMap<String,S> sequences = process(-1);
109+
close();
110+
return sequences;
111+
}
98112

113+
/**
114+
* This method tries to parse at most <code>max</code> records from
115+
* the open File or InputStream, and leaves the underlying resource open.<br>
116+
* Subsequent calls to the same method continue parsing the rest of the file.<br>
117+
* This is particularly useful when dealing with very big data files,
118+
* (e.g. NCBI nr database), which can't fit into memory and will take a long
119+
* time before the first result is available.<br>
120+
* <b>N.B.</b>
121+
* <ul>
122+
* <li>This method can't be called after calling its no-argument twin.</li>
123+
* <li>remember to close the underlying resource when you are done.</li>
124+
* </ul>
125+
* @see #process()
126+
* @author Amr AL-Hossary
127+
* @since 3.0.6
128+
* @param max maximum number of records to return, <code>-1</code> for infinity.
129+
* @return {@link LinkedHashMap} containing at most <code>max</code> parsed fasta records
130+
* present, starting current fileIndex onwards.
131+
* @throws IOException if an error occurs reading the input file
132+
*/
133+
public LinkedHashMap<String,S> process(int max) throws IOException {
134+
LinkedHashMap<String,S> sequences = new LinkedHashMap<String,S>();
99135

100136
String line = "";
137+
if(this.line != null && this.line.length() > 0){
138+
line=this.line;
139+
}
101140
String header = "";
141+
if(this.header != null && this.header.length() > 0){
142+
header=this.header;
143+
}
144+
102145
StringBuilder sb = new StringBuilder();
103-
int maxSequenceLength = -1;
104-
long fileIndex = 0;
105-
long sequenceIndex = 0;
146+
int processedSequences=0;
106147
boolean keepGoing = true;
148+
107149
do {
108150
line = line.trim(); // nice to have but probably not needed
109151
if (line.length() != 0) {
110-
if (line.startsWith(">")) {
111-
if (sb.length() > 0) {
152+
if (line.startsWith(">")) {//start of new fasta record
153+
if (sb.length() > 0) {//i.e. if there is already a sequence before
112154
// System.out.println("Sequence index=" + sequenceIndex);
155+
@SuppressWarnings("unchecked")
113156
S sequence = (S)sequenceCreator.getSequence(sb.toString(), sequenceIndex);
114157
headerParser.parseHeader(header, sequence);
115158
sequences.put(sequence.getAccession().getID(),sequence);
116-
if (maxSequenceLength < sb.length()) {
117-
maxSequenceLength = sb.length();
118-
}
119-
sb = new StringBuilder(maxSequenceLength);
159+
processedSequences++;
160+
// if (maxSequenceLength < sb.length()) {
161+
// maxSequenceLength = sb.length();
162+
// }
163+
// sb = new StringBuilder(maxSequenceLength);
164+
sb.setLength(0); //this is faster, better memory utilization (same buffer)
120165
}
121166
header = line.substring(1);
122167
} else if (line.startsWith(";")) {
@@ -130,22 +175,33 @@ public LinkedHashMap<String,S> process() throws IOException {
130175
}
131176
fileIndex = br.getBytesRead();
132177
line = br.readLine();
133-
if (line == null) {
134-
// System.out.println("Sequence index=" + sequenceIndex + " " + fileIndex );
178+
if (line == null) {//i.e. EOF
179+
@SuppressWarnings("unchecked")
180+
// System.out.println("Sequence index=" + sequenceIndex + " " + fileIndex );
135181
S sequence = (S)sequenceCreator.getSequence(sb.toString(), sequenceIndex);
136182
headerParser.parseHeader(header, sequence);
137183
sequences.put(sequence.getAccession().getID(),sequence);
184+
processedSequences++;
138185
keepGoing = false;
139186
}
187+
if (max > -1 && processedSequences>=max) {
188+
keepGoing=false;
189+
}
140190
} while (keepGoing);
141-
br.close();
191+
this.line = line;
192+
this.header= header;
193+
return sequences;
194+
}
195+
196+
public void close() throws IOException {
197+
br.close();
142198
isr.close();
143199
//If stream was created from File object then we need to close it
144200
if (fi != null) {
145201
fi.close();
146202
}
147-
return sequences;
148-
}
203+
this.line=this.header = null;
204+
}
149205

150206
public static void main(String[] args) {
151207
try {

biojava3-core/src/test/java/org/biojava3/core/sequence/io/FastaReaderTest.java

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,4 +86,61 @@ public void testProcess() throws Exception {
8686
assertEquals(seqNum,283);
8787
}
8888

89+
@Test
90+
public void processIntTest() throws Exception {
91+
System.out.println("process(int)");
92+
InputStream inStream = this.getClass().getResourceAsStream("/PF00104_small.fasta");
93+
assertNotNull(inStream);
94+
FastaReader<ProteinSequence,AminoAcidCompound> fastaReader = new FastaReader<ProteinSequence,AminoAcidCompound>(inStream, new GenericFastaHeaderParser<ProteinSequence,AminoAcidCompound>(), new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet()));
95+
LinkedHashMap<String,ProteinSequence> proteinSequences = fastaReader.process(200);
96+
97+
//Should have 200 sequences
98+
//System.out.println("Expecting 200 got " + proteinSequences.size());
99+
assertEquals(proteinSequences.size() , 200 );
100+
101+
int seqNum = 0;
102+
for(String id:proteinSequences.keySet()) {
103+
ProteinSequence proteinSequence = proteinSequences.get(id);
104+
switch(seqNum) {
105+
case 0:
106+
assertEquals(proteinSequence.getAccession().getID(),"A2D504_ATEGE/1-46");
107+
assertEquals(proteinSequence.getSequenceAsString(),"-----------------FK-N----LP-LED----------------Q----ITL--IQY-----------SWM----------------------CL-SSFA------LSWRSYK---HTNSQFLYFAPDLVF-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------");
108+
break;
109+
case 199:
110+
assertEquals(proteinSequence.getAccession().getID(),"Q5F0P7_HUMAN/248-428");
111+
assertEquals(proteinSequence.getSequenceAsString(),"DRELVVIIGWAKHI-PGFS-S----LS-LGD----------------Q----MSL--LQS-----------AWM----------------------EI-LILG------IVYRSLP---YDDKLVYAEDYIMD-------------------------------------------------------------------------------------------------------------------------------------------------------------EEHSRLAGLLELYRAILQLV-RRY-KKL-K-VEKEEF--------------------VTLKA-LALA-NSDSMY-------------------------------------------IEDL--EAVQKLQDLLHEALQD-Y-----ELS---------QR---HE----------------------------EP------W--------RTG-KLLLTLPL-LRQTA-----------------AKA-VQHF--YSVKLQGKV--PMH--KLF-------LEM---");
112+
break;
113+
}
114+
seqNum++;
115+
}
116+
assertEquals(seqNum,200);
117+
118+
//Should have 83 sequences
119+
proteinSequences = fastaReader.process(200);
120+
assertEquals(proteinSequences.size() , 83 );
121+
seqNum = 0;
122+
for(String id:proteinSequences.keySet()) {
123+
ProteinSequence proteinSequence = proteinSequences.get(id);
124+
switch(seqNum) {
125+
case 0:
126+
assertEquals(proteinSequence.getAccession().getID(),"RARA_CANFA/233-413");
127+
assertEquals(proteinSequence.getSequenceAsString(), "TKCIIKTVEFAKQL-PGFT-T----LT-IAD----------------Q----ITL--LKA-----------ACL----------------------DI-LILR------ICTRYTP---EQDTMTFSEGLTLN-------------------------------------------------------------------------------------------------------------------------------------------------------------RTQMHKAGFGPLTDLVFAFA-NQL-LPL-E-MDDAET--------------------GLLSA-ICLI-CGDRQD-------------------------------------------LEQP--DRVDMLQEPLLEALKV-Y-----VRK---------RR---PS----------------------------RP------H--------MFP-KMLMKITD-LRSIS-----------------AKG-AERV--ITLKMEIPG--SMP--PLI-------QEM---");
128+
break;
129+
case 81:
130+
//System.out.println(proteinSequence.getAccession());
131+
//System.out.println(proteinSequence.getSequenceAsString());
132+
assertEquals(proteinSequence.getAccession().getID(),"Q9PU76_CRONI/141-323");
133+
assertEquals(proteinSequence.getSequenceAsString(),"VETVTELTEFAKSI-PGFS-N----LD-LND----------------Q----VTL--LKY-----------GVY----------------------EA-IFAM------LASVMNK---DGMPVAYGNGFITRE------------------------------------------------------------------------------------------------------------------------------------------------------------FLKSLRKPFCDIMEPKFDFA-MKF-NSL-E-LDDSDI--------------------SLFVA-AIIC-CGDRPG-------------------------------------------LVNV--GHIEKMQESIVHVLKL-H-----LQN---------NH---PD----------------------------DI------F--------LFP-KLLQKMAD-LRQLV-----------------TEH-AQLV--QIIKK---TESDAHLHPLL-------QEI---");
134+
break;
135+
case 82:
136+
assertEquals(proteinSequence.getAccession().getID(),"Q98SJ1_CHICK/15-61");
137+
assertEquals(proteinSequence.getSequenceAsString(),"---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------Q-----------------NW------Q--------RFY-QLTKLLDS-MHDVV-----------------ENL-LSFC--FQTFLDKSM--SIEFPEML-------AEI---");
138+
break;
139+
}
140+
seqNum++;
141+
}
142+
assertEquals(seqNum,83);
143+
fastaReader.close();
144+
inStream.close();
145+
}
89146
}

0 commit comments

Comments
 (0)