Skip to content

Commit c672d01

Browse files
authored
Merge pull request #1080 from hilbertglm/20240113-streaming
Added Java Streaming I/O for FASTA Files
2 parents c00ed2f + 84998b9 commit c672d01

File tree

19 files changed

+266
-31
lines changed

19 files changed

+266
-31
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,12 @@
11
BioJava Changelog
22
-----------------
33

4+
BioJava 7.1.0 - future release
5+
==============================
6+
### Added
7+
* Class `FastaStreamer` to read FASTA-formatted files using Java streams
8+
9+
410
BioJava 7.0.2
511
==============================
612
### Added

biojava-aa-prop/pom.xml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
<parent>
33
<artifactId>biojava</artifactId>
44
<groupId>org.biojava</groupId>
5-
<version>7.0.3-SNAPSHOT</version>
5+
<version>7.1.0-SNAPSHOT</version>
66
</parent>
77
<modelVersion>4.0.0</modelVersion>
88
<artifactId>biojava-aa-prop</artifactId>
@@ -70,12 +70,12 @@
7070
<dependency>
7171
<groupId>org.biojava</groupId>
7272
<artifactId>biojava-core</artifactId>
73-
<version>7.0.3-SNAPSHOT</version>
73+
<version>7.1.0-SNAPSHOT</version>
7474
</dependency>
7575
<dependency>
7676
<groupId>org.biojava</groupId>
7777
<artifactId>biojava-structure</artifactId>
78-
<version>7.0.3-SNAPSHOT</version>
78+
<version>7.1.0-SNAPSHOT</version>
7979
</dependency>
8080

8181
<!-- logging dependencies (managed by parent pom, don't set versions or scopes here) -->

biojava-alignment/pom.xml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
<parent>
55
<artifactId>biojava</artifactId>
66
<groupId>org.biojava</groupId>
7-
<version>7.0.3-SNAPSHOT</version>
7+
<version>7.1.0-SNAPSHOT</version>
88
</parent>
99
<artifactId>biojava-alignment</artifactId>
1010
<name>biojava-alignment</name>
@@ -47,7 +47,7 @@
4747
<dependency>
4848
<groupId>org.biojava</groupId>
4949
<artifactId>biojava-core</artifactId>
50-
<version>7.0.3-SNAPSHOT</version>
50+
<version>7.1.0-SNAPSHOT</version>
5151
<scope>compile</scope>
5252
</dependency>
5353
<dependency>

biojava-core/pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
<parent>
44
<artifactId>biojava</artifactId>
55
<groupId>org.biojava</groupId>
6-
<version>7.0.3-SNAPSHOT</version>
6+
<version>7.1.0-SNAPSHOT</version>
77
</parent>
88
<modelVersion>4.0.0</modelVersion>
99
<artifactId>biojava-core</artifactId>
Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
package org.biojava.nbio.core.sequence.io;
2+
3+
import org.biojava.nbio.core.sequence.ProteinSequence;
4+
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
5+
import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
6+
import org.biojava.nbio.core.sequence.io.template.SequenceCreatorInterface;
7+
import org.biojava.nbio.core.sequence.io.template.SequenceHeaderParserInterface;
8+
import org.biojava.nbio.core.util.InputStreamProvider;
9+
10+
import java.io.File;
11+
import java.io.IOException;
12+
import java.io.InputStream;
13+
import java.io.UncheckedIOException;
14+
import java.nio.file.Path;
15+
import java.util.Collections;
16+
import java.util.Iterator;
17+
import java.util.LinkedHashMap;
18+
import java.util.Map;
19+
import java.util.Optional;
20+
import java.util.Spliterator;
21+
import java.util.Spliterators;
22+
import java.util.function.Consumer;
23+
import java.util.stream.Stream;
24+
import java.util.stream.StreamSupport;
25+
26+
/**
27+
* Read from a FASTA file (or gzipped FASTA file) and create a Java stream of {@link ProteinSequence} objects
28+
* for use in a functional programming paradigm.
29+
*
30+
* @author Gary Murphy
31+
* @since 7.1.0
32+
*/
33+
public class FastaStreamer {
34+
35+
private final Path path;
36+
private int batchSize = 1_000;
37+
private SequenceHeaderParserInterface<ProteinSequence, AminoAcidCompound> headerParser;
38+
private SequenceCreatorInterface<AminoAcidCompound> sequenceCreator;
39+
private LinkedHashMap<String, ProteinSequence> chunk = new LinkedHashMap<>();
40+
private Iterator<Map.Entry<String, ProteinSequence>> iterator = Collections.emptyIterator();
41+
private boolean closed = false;
42+
43+
/**
44+
* The constructor is private. Created via the <tt>from(...)</tt> static factory method
45+
*
46+
* @param path the path to the file containing the FASTA content (possibly GZipped)
47+
*/
48+
private FastaStreamer(final Path path) {
49+
this.path = path;
50+
}
51+
52+
public static FastaStreamer from(final Path path) {
53+
return new FastaStreamer(path);
54+
}
55+
56+
public static FastaStreamer from(File file) {
57+
return from(file.toPath());
58+
}
59+
60+
public FastaStreamer withHeaderParser(SequenceHeaderParserInterface<ProteinSequence, AminoAcidCompound> headerParser) {
61+
this.headerParser = headerParser;
62+
return this;
63+
}
64+
65+
public FastaStreamer withSequenceCreator(SequenceCreatorInterface<AminoAcidCompound> sequenceCreator) {
66+
this.sequenceCreator = sequenceCreator;
67+
return this;
68+
}
69+
70+
public FastaStreamer batchSize(int size) {
71+
this.batchSize = size;
72+
return this;
73+
}
74+
75+
/**
76+
* Enable iteration through the proteins in the file using syntax such as:
77+
* <pre>
78+
* for(ProteinSequence sequence : FastaStreamer.from(path).each()) {
79+
* .
80+
* .
81+
* .
82+
* }
83+
* </pre>
84+
*
85+
* @return an iterable suitable for an iteration loop
86+
*/
87+
public Iterable<ProteinSequence> each() {
88+
return () -> stream().iterator();
89+
}
90+
91+
/**
92+
* Create a stream of protein sequences from the contents of the path
93+
* @return the stream
94+
*/
95+
public Stream<ProteinSequence> stream() {
96+
InputStreamProvider provider = new InputStreamProvider();
97+
InputStream input;
98+
try {
99+
input = provider.getInputStream(getPath().toFile());
100+
} catch (IOException exception) {
101+
throw new UncheckedIOException(exception);
102+
}
103+
FastaReader<ProteinSequence, AminoAcidCompound> reader = new FastaReader<>(input, getHeaderParser(), getSequenceCreator());
104+
Spliterator<ProteinSequence> source = new Spliterators.AbstractSpliterator<>(Integer.MAX_VALUE, Spliterator.IMMUTABLE | Spliterator.NONNULL) {
105+
@Override
106+
public boolean tryAdvance(Consumer<? super ProteinSequence> action) {
107+
if (closed) {
108+
return false;
109+
}
110+
ProteinSequence protein = next(reader);
111+
if (null == protein) {
112+
return false;
113+
}
114+
action.accept(protein);
115+
return true;
116+
}
117+
118+
/**
119+
* Fetch the next header/protein tuple from the cache. If the cache is empty, fetch another
120+
* batch from the source file
121+
*
122+
* @param reader
123+
* the input stream from which the FASTA content is read
124+
* @return the protein sequence
125+
*/
126+
private ProteinSequence next(FastaReader<ProteinSequence, AminoAcidCompound> reader) {
127+
try {
128+
if (!iterator.hasNext()) {
129+
chunk = reader.process(getBatchSize());
130+
if (null == chunk) {
131+
closed = true;
132+
reader.close();
133+
return null;
134+
}
135+
iterator = chunk.entrySet().iterator();
136+
}
137+
if (iterator.hasNext()) {
138+
Map.Entry<String, ProteinSequence> entry = iterator.next();
139+
return createSequence(entry.getValue());
140+
}
141+
closed = true;
142+
reader.close();
143+
} catch (IOException exception) {
144+
throw new UncheckedIOException(String.format("I/O error reading the FASTA file from '%s'", getPath()), exception);
145+
}
146+
return null;
147+
}
148+
}; // Spliterator
149+
return StreamSupport.stream(source, false);
150+
}
151+
152+
/**
153+
* Create the sequence with the information from the header. This implementation return the sequence as-is, but
154+
* this is an opportunity for the implementer to build specific information into the user collection space
155+
* of the sequence
156+
*
157+
* @param sequence the protein sequence
158+
* @return the sequence
159+
*/
160+
protected ProteinSequence createSequence(ProteinSequence sequence) {
161+
return sequence;
162+
}
163+
164+
protected Path getPath() {
165+
return path;
166+
}
167+
168+
protected int getBatchSize() {
169+
return batchSize;
170+
}
171+
172+
protected SequenceHeaderParserInterface<ProteinSequence, AminoAcidCompound> getHeaderParser() {
173+
return Optional.ofNullable(headerParser).orElse(new GenericFastaHeaderParser<>());
174+
}
175+
176+
public SequenceCreatorInterface<AminoAcidCompound> getSequenceCreator() {
177+
return Optional.ofNullable(sequenceCreator).orElse(new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet()));
178+
}
179+
}
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
package org.biojava.nbio.core.sequence.io;
2+
3+
import org.biojava.nbio.core.sequence.ProteinSequence;
4+
import org.junit.Assert;
5+
import org.junit.Test;
6+
7+
import java.io.IOException;
8+
import java.nio.file.Path;
9+
import java.nio.file.Paths;
10+
import java.util.List;
11+
import java.util.stream.Collectors;
12+
13+
/**
14+
* Test the functionality of the {@link FastaStreamer} code
15+
*/
16+
public class FastaStreamerTest {
17+
18+
@Test
19+
public void stream() throws IOException {
20+
String file = this.getClass().getResource("PF00104_small.fasta.gz").getFile();
21+
Path path = Paths.get(file);
22+
List<ProteinSequence> sequences;
23+
24+
sequences = FastaStreamer.from(path).stream().collect(Collectors.toList());
25+
Assert.assertEquals("Count", 283, sequences.size());
26+
27+
ProteinSequence sequence;
28+
sequence = sequences.get(0);
29+
Assert.assertEquals("A2D504_ATEGE/1-46", sequence.getOriginalHeader());
30+
sequence = sequences.get(sequences.size()-1);
31+
Assert.assertEquals("Q98SJ1_CHICK/15-61", sequence.getOriginalHeader());
32+
33+
sequences = FastaStreamer.from(path)
34+
.batchSize(2) // Ensure there isn't an edge condition loading the next buffer
35+
.stream()
36+
.collect(Collectors.toList());
37+
Assert.assertEquals("Count", 283, sequences.size());
38+
}
39+
40+
@Test
41+
public void iterate() {
42+
String file = this.getClass().getResource("PF00104_small.fasta.gz").getFile();
43+
Path path = Paths.get(file);
44+
int count = 0;
45+
for (ProteinSequence sequence : FastaStreamer.from(path).each()) {
46+
count++;
47+
}
48+
Assert.assertEquals("Count", 283, count);
49+
}
50+
}
Binary file not shown.
Binary file not shown.

biojava-genome/pom.xml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
<parent>
44
<artifactId>biojava</artifactId>
55
<groupId>org.biojava</groupId>
6-
<version>7.0.3-SNAPSHOT</version>
6+
<version>7.1.0-SNAPSHOT</version>
77
</parent>
88
<modelVersion>4.0.0</modelVersion>
99
<artifactId>biojava-genome</artifactId>
@@ -70,13 +70,13 @@
7070
<dependency>
7171
<groupId>org.biojava</groupId>
7272
<artifactId>biojava-core</artifactId>
73-
<version>7.0.3-SNAPSHOT</version>
73+
<version>7.1.0-SNAPSHOT</version>
7474
<scope>compile</scope>
7575
</dependency>
7676
<dependency>
7777
<groupId>org.biojava</groupId>
7878
<artifactId>biojava-alignment</artifactId>
79-
<version>7.0.3-SNAPSHOT</version>
79+
<version>7.1.0-SNAPSHOT</version>
8080
<scope>compile</scope>
8181
</dependency>
8282
<dependency>

biojava-integrationtest/pom.xml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
<parent>
55
<artifactId>biojava</artifactId>
66
<groupId>org.biojava</groupId>
7-
<version>7.0.3-SNAPSHOT</version>
7+
<version>7.1.0-SNAPSHOT</version>
88
</parent>
99
<artifactId>biojava-integrationtest</artifactId>
1010
<packaging>jar</packaging>
@@ -40,7 +40,7 @@
4040
<dependency>
4141
<groupId>org.biojava</groupId>
4242
<artifactId>biojava-structure</artifactId>
43-
<version>7.0.3-SNAPSHOT</version>
43+
<version>7.1.0-SNAPSHOT</version>
4444
</dependency>
4545
<!-- logging dependencies (managed by parent pom, don't set versions or scopes here) -->
4646
<dependency>

0 commit comments

Comments
 (0)