2727import java .io .IOException ;
2828import java .io .InputStream ;
2929import java .io .InputStreamReader ;
30+ import java .util .HashMap ;
3031import java .util .LinkedHashMap ;
3132
3233import org .biojava3 .core .sequence .ProteinSequence ;
4041/**
4142 * See FastaReaderHelper for an example of how to use this class; FastaReaderHelper should be the
4243 * primary class used to read Fasta files.
43- * @author Scooter Willis < willishf at gmail dot com>
44+ * @author Scooter Willis &lt;willishf at gmail dot com&gt;
4445 */
4546public class FastaReader <S extends Sequence <?>, C extends Compound > {
4647
@@ -49,7 +50,11 @@ public class FastaReader<S extends Sequence<?>, C extends Compound> {
4950 BufferedReaderBytesRead br ;
5051 InputStreamReader isr ;
5152 FileInputStream fi = null ;
52-
53+ long fileIndex = 0 ;
54+ long sequenceIndex = 0 ;
55+ String line = "" ;
56+ String header = "" ;
57+
5358 /**
5459 * If you are going to use FileProxyProteinSequenceCreator then do not use this constructor because we need details about
5560 * local file offsets for quick reads. An InputStream does not give you the name of the stream to access quickly via file seek. A seek in
@@ -67,7 +72,8 @@ public FastaReader(InputStream is, FastaHeaderParserInterface<S,C> headerParser,
6772 }
6873
6974 /**
70- * If you are going to use the FileProxyProteinSequenceCreator then you need to use this constructor because we need details about
75+ * If you are going to use the FileProxyProteinSequenceCreator then you
76+ * need to use this constructor because we need details about
7177 * the location of the file.
7278 * @param file
7379 * @param headerParser
@@ -88,35 +94,74 @@ public FastaReader(File file, FastaHeaderParserInterface<S,C> headerParser,
8894 }
8995
9096 /**
91- * The parsing is done in this method
92- * @return
97+ * The parsing is done in this method.<br>
98+ * This method tries to process all the available fasta records
99+ * in the File or InputStream, closes the underlying resource,
100+ * and returns the results in a {@link LinkedHashMap}.<br>
101+ * You don't need to call {@link #close()} after calling this method.
102+ * @see #process(int)
103+ * @return {@link LinkedHashMap} containing all the parsed fasta records
104+ * present, starting from the current fileIndex onwards.
93105 * @throws IOException if an error occurs reading the input file
94106 */
95- @ SuppressWarnings ("unchecked" )
96107 public LinkedHashMap <String ,S > process () throws IOException {
97- LinkedHashMap <String ,S > sequences = new LinkedHashMap <String ,S >();
108+ LinkedHashMap <String ,S > sequences = process (-1 ); // -1 disables the record limit, so parsing runs until end of input
109+ close (); // release the reader and streams here so the caller does not have to call close()
110+ return sequences ;
111+ }
98112
113+ /**
114+ * This method tries to parse at most <code>max</code> records from
115+ * the open File or InputStream, and leaves the underlying resource open.<br>
116+ * Subsequent calls to the same method continue parsing the rest of the file.<br>
117+ * This is particularly useful when dealing with very big data files
118+ * (e.g. NCBI nr database), which can't fit into memory and would take a long
119+ * time before the first result is available.<br>
120+ * <b>N.B.</b>
121+ * <ul>
122+ * <li>This method can't be called after calling its NO-ARGUMENT twin.</li>
123+ * <li>Remember to close the underlying resource when you are done.</li>
124+ * </ul>
125+ * @see #process()
126+ * @author Amr AL-Hossary
127+ * @since 3.0.6
128+ * @param max maximum number of records to return, <code>-1</code> for infinity.
129+ * @return {@link LinkedHashMap} containing at most <code>max</code> parsed fasta records
130+ * present, starting from the current fileIndex onwards.
131+ * @throws IOException if an error occurs reading the input file
132+ */
133+ public LinkedHashMap <String ,S > process (int max ) throws IOException {
134+ LinkedHashMap <String ,S > sequences = new LinkedHashMap <String ,S >();
99135
100136 String line = "" ;
137+ if (this .line != null && this .line .length () > 0 ){
138+ line =this .line ;
139+ }
101140 String header = "" ;
141+ if (this .header != null && this .header .length () > 0 ){
142+ header =this .header ;
143+ }
144+
102145 StringBuilder sb = new StringBuilder ();
103- int maxSequenceLength = -1 ;
104- long fileIndex = 0 ;
105- long sequenceIndex = 0 ;
146+ int processedSequences =0 ;
106147 boolean keepGoing = true ;
148+
107149 do {
108150 line = line .trim (); // nice to have but probably not needed
109151 if (line .length () != 0 ) {
110- if (line .startsWith (">" )) {
111- if (sb .length () > 0 ) {
152+ if (line .startsWith (">" )) {//start of new fasta record
153+ if (sb .length () > 0 ) {//i.e. if there is already a sequence before
112154 // System.out.println("Sequence index=" + sequenceIndex);
155+ @ SuppressWarnings ("unchecked" )
113156 S sequence = (S )sequenceCreator .getSequence (sb .toString (), sequenceIndex );
114157 headerParser .parseHeader (header , sequence );
115158 sequences .put (sequence .getAccession ().getID (),sequence );
116- if (maxSequenceLength < sb .length ()) {
117- maxSequenceLength = sb .length ();
118- }
119- sb = new StringBuilder (maxSequenceLength );
159+ processedSequences ++;
160+ // if (maxSequenceLength < sb.length()) {
161+ // maxSequenceLength = sb.length();
162+ // }
163+ // sb = new StringBuilder(maxSequenceLength);
164+ sb .setLength (0 ); //this is faster, better memory utilization (same buffer)
120165 }
121166 header = line .substring (1 );
122167 } else if (line .startsWith (";" )) {
@@ -130,22 +175,33 @@ public LinkedHashMap<String,S> process() throws IOException {
130175 }
131176 fileIndex = br .getBytesRead ();
132177 line = br .readLine ();
133- if (line == null ) {
134- // System.out.println("Sequence index=" + sequenceIndex + " " + fileIndex );
178+ if (line == null ) {//i.e. EOF
179+ @ SuppressWarnings ("unchecked" )
180+ // System.out.println("Sequence index=" + sequenceIndex + " " + fileIndex );
135181 S sequence = (S )sequenceCreator .getSequence (sb .toString (), sequenceIndex );
136182 headerParser .parseHeader (header , sequence );
137183 sequences .put (sequence .getAccession ().getID (),sequence );
184+ processedSequences ++;
138185 keepGoing = false ;
139186 }
187+ if (max > -1 && processedSequences >=max ) {
188+ keepGoing =false ;
189+ }
140190 } while (keepGoing );
141- br .close ();
191+ this .line = line ;
192+ this .header = header ;
193+ return sequences ;
194+ }
195+
196+ public void close () throws IOException { // closes the reader and streams held by this FastaReader and clears saved parse state
197+ br .close (); // NOTE(review): no try/finally here — if this throws, isr and fi below are never closed
142198 isr .close ();
143199 //If stream was created from File object then we need to close it
144200 if (fi != null ) {
145201 fi .close ();
146202 }
147- return sequences ;
148- }
203+ this . line = this . header = null ; // drop the pending line/header that process(int) saves between calls
204+ }
149205
150206 public static void main (String [] args ) {
151207 try {
0 commit comments