4343 */
4444public class FastaReader <S extends Sequence <?>, C extends Compound > {
4545
46- private final static Logger logger = LoggerFactory .getLogger (FastaReader .class );
46+ private final static Logger logger = LoggerFactory .getLogger (FastaReader .class );
4747
4848 SequenceCreatorInterface <C > sequenceCreator ;
4949 SequenceHeaderParserInterface <S ,C > headerParser ;
@@ -54,17 +54,17 @@ public class FastaReader<S extends Sequence<?>, C extends Compound> {
5454 long sequenceIndex = 0 ;
5555 String line = "" ;
5656 String header = "" ;
57-
57+
5858 /**
5959 * If you are going to use FileProxyProteinSequenceCreator then do not use this constructor because we need details about
6060 * local file offsets for quick reads. An InputStream does not give you the name of the stream to access quickly via file seek. A seek in
6161 * an InputStream is forced to read all the data, so you don't gain anything.
62- * @param br
62+ * @param is inputStream
6363 * @param headerParser
6464 * @param sequenceCreator
6565 */
6666 public FastaReader (InputStream is , SequenceHeaderParserInterface <S ,C > headerParser ,
67- SequenceCreatorInterface <C > sequenceCreator ) {
67+ SequenceCreatorInterface <C > sequenceCreator ) {
6868 this .headerParser = headerParser ;
6969 isr = new InputStreamReader (is );
7070 this .br = new BufferedReaderBytesRead (isr );
@@ -85,7 +85,7 @@ public FastaReader(InputStream is, SequenceHeaderParserInterface<S,C> headerPars
8585 * method denies read access to the file.
8686 */
8787 public FastaReader (File file , SequenceHeaderParserInterface <S ,C > headerParser ,
88- SequenceCreatorInterface <C > sequenceCreator ) throws FileNotFoundException {
88+ SequenceCreatorInterface <C > sequenceCreator ) throws FileNotFoundException {
8989 this .headerParser = headerParser ;
9090 fi = new FileInputStream (file );
9191 isr = new InputStreamReader (fi );
@@ -105,9 +105,10 @@ public FastaReader(File file, SequenceHeaderParserInterface<S,C> headerParser,
105105 * @throws IOException if an error occurs reading the input file
106106 */
107107 public LinkedHashMap <String ,S > process () throws IOException {
108- LinkedHashMap <String ,S > sequences = process (-1 );
109- close ();
110- return sequences ;
108+ LinkedHashMap <String ,S > sequences = process (-1 );
109+ close ();
110+
111+ return sequences ;
111112 }
112113
113114 /**
@@ -130,45 +131,47 @@ public LinkedHashMap<String,S> process() throws IOException {
130131 * present, starting current fileIndex onwards.
131132 * @throws IOException if an error occurs reading the input file
132133 */
133- public LinkedHashMap <String ,S > process (int max ) throws IOException {
134- LinkedHashMap < String , S > sequences = new LinkedHashMap < String , S >();
134+ public LinkedHashMap <String ,S > process (int max ) throws IOException {
135+
135136
136137 String line = "" ;
137138 if (this .line != null && this .line .length () > 0 ){
138- line =this .line ;
139+ line =this .line ;
139140 }
140141 String header = "" ;
141142 if (this .header != null && this .header .length () > 0 ){
142- header =this .header ;
143+ header =this .header ;
143144 }
144-
145+
145146 StringBuilder sb = new StringBuilder ();
146147 int processedSequences =0 ;
147148 boolean keepGoing = true ;
148149
150+
151+ LinkedHashMap <String ,S > sequences = new LinkedHashMap <String ,S >();
152+
149153 do {
150154 line = line .trim (); // nice to have but probably not needed
151155 if (line .length () != 0 ) {
152156 if (line .startsWith (">" )) {//start of new fasta record
153- if (sb .length () > 0 ) {//i.e. if there is already a sequence before
154- // logger.debug("Sequence index=" + sequenceIndex);
155-
156- try {
157- @ SuppressWarnings ("unchecked" )
158- S sequence = (S )sequenceCreator .getSequence (sb .toString (), sequenceIndex );
157+
158+ if (sb .length () > 0 ) {
159+ //i.e. if there is already a sequence before
160+ //logger.info("Sequence index=" + sequenceIndex);
161+
162+ try {
163+ @ SuppressWarnings ("unchecked" )
164+ S sequence = (S )sequenceCreator .getSequence (sb .toString (), sequenceIndex );
159165 headerParser .parseHeader (header , sequence );
160166 sequences .put (sequence .getAccession ().getID (),sequence );
161167 processedSequences ++;
162168
163- } catch (CompoundNotFoundException e ) {
164- logger .warn ("Sequence with header '{}' has unrecognised compounds ({}), it will be ignored" ,
165- header , e .getMessage ());
166- }
167- // if (maxSequenceLength < sb.length()) {
168- // maxSequenceLength = sb.length();
169- // }
170- // sb = new StringBuilder(maxSequenceLength);
171- sb .setLength (0 ); //this is faster, better memory utilization (same buffer)
169+ } catch (CompoundNotFoundException e ) {
170+ logger .warn ("Sequence with header '{}' has unrecognised compounds ({}), it will be ignored" ,
171+ header , e .getMessage ());
172+ }
173+
174+ sb .setLength (0 ); //this is faster than allocating new buffers, better memory utilization (same buffer)
172175 }
173176 header = line .substring (1 );
174177 } else if (line .startsWith (";" )) {
@@ -181,68 +184,85 @@ public LinkedHashMap<String,S> process(int max) throws IOException {
181184 }
182185 }
183186 fileIndex = br .getBytesRead ();
187+
184188 line = br .readLine ();
185- if (line == null ) {//i.e. EOF
189+
190+ if (line == null ) {
191+
192+
193+ // Fix for #282
194+ if ( sequences .size () == 0 && max != -1 ) {
195+ return null ;
196+ }
197+
198+ //i.e. EOF
186199 String seq = sb .toString ();
187200 if ( seq .length () == 0 ) {
188201 logger .warn ("Can't parse sequence {}. Got sequence of length 0!" , sequenceIndex );
189202 logger .warn ("header: {}" , header );
190203 }
191- // logger.debug ("Sequence index=" + sequenceIndex + " " + fileIndex );
204+ //logger.info ("Sequence index=" + sequenceIndex + " " + fileIndex );
192205 try {
193- @ SuppressWarnings ("unchecked" )
194- S sequence = (S )sequenceCreator .getSequence (seq , sequenceIndex );
195- headerParser .parseHeader (header , sequence );
196- sequences .put (sequence .getAccession ().getID (),sequence );
197- processedSequences ++;
206+ @ SuppressWarnings ("unchecked" )
207+ S sequence = (S )sequenceCreator .getSequence (seq , sequenceIndex );
208+ headerParser .parseHeader (header , sequence );
209+ sequences .put (sequence .getAccession ().getID (),sequence );
210+ processedSequences ++;
198211 } catch (CompoundNotFoundException e ) {
199- logger .warn ("Sequence with header '{}' has unrecognised compounds ({}), it will be ignored" ,
200- header , e .getMessage ());
201- }
212+ logger .warn ("Sequence with header '{}' has unrecognised compounds ({}), it will be ignored" ,
213+ header , e .getMessage ());
214+ }
202215 keepGoing = false ;
203216 }
204- if (max > -1 && processedSequences >=max ) {
205- keepGoing =false ;
206- }
217+ if (max > -1 && processedSequences >=max ) {
218+ keepGoing =false ;
219+ }
220+ if ( this .line == null )
221+ keepGoing = false ;
207222 } while (keepGoing );
223+
208224 this .line = line ;
209225 this .header = header ;
226+
210227 return sequences ;
211228 }
212229
213- public void close () throws IOException {
214- br .close ();
230+ public void close () throws IOException {
231+ br .close ();
215232 isr .close ();
216233 //If stream was created from File object then we need to close it
217234 if (fi != null ) {
218235 fi .close ();
219236 }
220237 this .line =this .header = null ;
221- }
238+ }
222239
223240 public static void main (String [] args ) {
224241 try {
225- String inputFile = "src/test/resources/PF00104_small.fasta" ;
226- FileInputStream is = new FileInputStream (inputFile );
242+ String inputFile = "/PF00104_small.fasta" ;
243+ InputStream is = FastaReader .class .getResourceAsStream (inputFile );
244+
227245
246+ if ( is == null )
247+ System .err .println ("Could not get input file " + inputFile );
228248 FastaReader <ProteinSequence , AminoAcidCompound > fastaReader = new FastaReader <ProteinSequence , AminoAcidCompound >(is , new GenericFastaHeaderParser <ProteinSequence ,AminoAcidCompound >(), new ProteinSequenceCreator (AminoAcidCompoundSet .getAminoAcidCompoundSet ()));
229249 LinkedHashMap <String ,ProteinSequence > proteinSequences = fastaReader .process ();
230250 is .close ();
231251
232252
233- logger .info ("Protein Sequences: {}" , proteinSequences );
253+ // logger.info("Protein Sequences: {}", proteinSequences);
234254
235255 File file = new File (inputFile );
236- FastaReader <ProteinSequence ,AminoAcidCompound > fastaProxyReader =
237- new FastaReader <ProteinSequence ,AminoAcidCompound >(
238- file ,
239- new GenericFastaHeaderParser <ProteinSequence ,AminoAcidCompound >(),
240- new FileProxyProteinSequenceCreator (
241- file ,
242- AminoAcidCompoundSet .getAminoAcidCompoundSet (),
243- new FastaSequenceParser ()
244- )
245- );
256+ FastaReader <ProteinSequence ,AminoAcidCompound > fastaProxyReader =
257+ new FastaReader <ProteinSequence ,AminoAcidCompound >(
258+ file ,
259+ new GenericFastaHeaderParser <ProteinSequence ,AminoAcidCompound >(),
260+ new FileProxyProteinSequenceCreator (
261+ file ,
262+ AminoAcidCompoundSet .getAminoAcidCompoundSet (),
263+ new FastaSequenceParser ()
264+ )
265+ );
246266 LinkedHashMap <String ,ProteinSequence > proteinProxySequences = fastaProxyReader .process ();
247267
248268 for (String key : proteinProxySequences .keySet ()){
0 commit comments