Skip to content

Commit 96c8126

Browse files
committed
Before finding the InputStreamProvider
1 parent f45e018 commit 96c8126

File tree

5 files changed

+255
-0
lines changed

5 files changed

+255
-0
lines changed
Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
package org.biojava.nbio.core.sequence.io;
2+
3+
import org.biojava.nbio.core.sequence.ProteinSequence;
4+
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
5+
import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
6+
import org.biojava.nbio.core.sequence.io.template.SequenceCreatorInterface;
7+
import org.biojava.nbio.core.sequence.io.template.SequenceHeaderParserInterface;
8+
import org.biojava.nbio.core.util.MagicNumber;
9+
10+
import java.io.File;
11+
import java.io.IOException;
12+
import java.io.InputStream;
13+
import java.nio.file.Files;
14+
import java.nio.file.Path;
15+
import java.nio.file.StandardOpenOption;
16+
import java.util.Collections;
17+
import java.util.Iterator;
18+
import java.util.LinkedHashMap;
19+
import java.util.Map;
20+
import java.util.Optional;
21+
import java.util.Spliterator;
22+
import java.util.Spliterators;
23+
import java.util.function.Consumer;
24+
import java.util.stream.Stream;
25+
import java.util.stream.StreamSupport;
26+
import java.util.zip.GZIPInputStream;
27+
28+
public class FastaStreamer {
29+
30+
private final Path path;
31+
private int batchSize = 1_000;
32+
private SequenceHeaderParserInterface<ProteinSequence, AminoAcidCompound> headerParser;
33+
private SequenceCreatorInterface<AminoAcidCompound> sequenceCreator;
34+
private LinkedHashMap<String, ProteinSequence> chunk = new LinkedHashMap<>();
35+
private Iterator<Map.Entry<String, ProteinSequence>> iterator = Collections.emptyIterator();
36+
private boolean closed = false;
37+
38+
/**
39+
* The constructor is private. Created via the <tt>from(...)</tt> static factory method
40+
*
41+
* @param path the path to the file containing the FASTA content (possibly GZipped)
42+
*/
43+
private FastaStreamer(final Path path) {
44+
this.path = path;
45+
}
46+
47+
public static FastaStreamer from(final Path path) {
48+
return new FastaStreamer(path);
49+
}
50+
51+
public static FastaStreamer from(File file) {
52+
return from(file.toPath());
53+
}
54+
55+
public FastaStreamer withHeaderParser(SequenceHeaderParserInterface<ProteinSequence, AminoAcidCompound> headerParser) {
56+
this.headerParser = headerParser;
57+
return this;
58+
}
59+
60+
public FastaStreamer withSequenceCreator(SequenceCreatorInterface<AminoAcidCompound> sequenceCreator) {
61+
this.sequenceCreator = sequenceCreator;
62+
return this;
63+
}
64+
65+
public FastaStreamer batchSize(int size) {
66+
this.batchSize = size;
67+
return this;
68+
}
69+
70+
/**
71+
* Create a stream of protein sequences from the contents of the path
72+
* @return the stream
73+
* @throws IOException if there is an error opening the file
74+
*/
75+
public Stream<ProteinSequence> stream() throws IOException {
76+
InputStream rawInput = Files.newInputStream(getPath(), StandardOpenOption.READ);
77+
InputStream input = MagicNumber.isGZIP(getPath()) ? new GZIPInputStream(rawInput) : rawInput;
78+
FastaReader<ProteinSequence, AminoAcidCompound> reader = new FastaReader<>(input, getHeaderParser(), getSequenceCreator());
79+
Spliterator<ProteinSequence> source = new Spliterators.AbstractSpliterator<>(Integer.MAX_VALUE, Spliterator.IMMUTABLE | Spliterator.NONNULL) {
80+
@Override
81+
public boolean tryAdvance(Consumer<? super ProteinSequence> action) {
82+
if (closed) {
83+
return false;
84+
}
85+
ProteinSequence protein = next(reader);
86+
if (null == protein) {
87+
return false;
88+
}
89+
action.accept(protein);
90+
return true;
91+
}
92+
93+
/**
94+
* Fetch the next header/protein tuple from the cache. If the cache is empty, fetch another
95+
* batch from the source file
96+
*
97+
* @param reader
98+
* the input stream from which the FASTA content is read
99+
* @return the protein sequence
100+
*/
101+
private ProteinSequence next(FastaReader<ProteinSequence, AminoAcidCompound> reader) {
102+
try {
103+
if (!iterator.hasNext()) {
104+
chunk = reader.process(getBatchSize());
105+
if (null == chunk) {
106+
closed = true;
107+
reader.close();
108+
return null;
109+
}
110+
iterator = chunk.entrySet().iterator();
111+
}
112+
if (iterator.hasNext()) {
113+
Map.Entry<String, ProteinSequence> entry = iterator.next();
114+
return createSequence(entry.getKey(), entry.getValue());
115+
}
116+
closed = true;
117+
reader.close();
118+
} catch (IOException exception) {
119+
throw new RuntimeException(String.format("I/O error reading the FASTA file from '%s'", getPath()));
120+
}
121+
return null;
122+
}
123+
}; // Spliterator
124+
return StreamSupport.stream(source, false);
125+
}
126+
127+
/**
128+
* Create the sequence with the information from the header. This implementation return the sequence as-is, but
129+
* this is an opportunity for the implementer to build specifc information into the user collection space
130+
* of the sequence
131+
*
132+
* @param header the original header
133+
* @param sequence the protein sequence
134+
* @return the sequence
135+
*/
136+
protected ProteinSequence createSequence(String header, ProteinSequence sequence) {
137+
return sequence;
138+
}
139+
140+
protected Path getPath() {
141+
return path;
142+
}
143+
144+
protected int getBatchSize() {
145+
return batchSize;
146+
}
147+
148+
protected SequenceHeaderParserInterface<ProteinSequence, AminoAcidCompound> getHeaderParser() {
149+
return Optional.ofNullable(headerParser).orElse(new GenericFastaHeaderParser<>());
150+
}
151+
152+
public SequenceCreatorInterface<AminoAcidCompound> getSequenceCreator() {
153+
return Optional.ofNullable(sequenceCreator).orElse(new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet()));
154+
}
155+
}
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
package org.biojava.nbio.core.util;
2+
3+
import java.io.IOException;
4+
import java.io.InputStream;
5+
import java.nio.file.Files;
6+
import java.nio.file.Path;
7+
import java.nio.file.StandardOpenOption;
8+
9+
/**
 * The 'magic number' is a sequence of bytes at the beginning of a file that can be used to determine the
 * file type
 *
 * @since 7.0.3
 * @author Gary Murphy
 */
public class MagicNumber {

    /** First byte of the gzip magic number. */
    private static final int GZIP_ID1 = 0x1f;

    /** Second byte of the gzip magic number. */
    private static final int GZIP_ID2 = 0x8b;

    /**
     * The magic number of a gzip file is 0x1F8B. (ref: https://en.wikipedia.org/wiki/Gzip)
     * <p>
     * A file shorter than two bytes is reported as not being gzip.
     *
     * @param path the path to the file
     * @return <code>true</code> if the file has the gzip magic number
     * @throws IOException if there is an error reading the start of the file
     */
    public static boolean isGZIP(Path path) throws IOException {
        try (InputStream input = Files.newInputStream(path, StandardOpenOption.READ)) {
            // Read byte by byte: a single read(byte[]) call is allowed to return fewer bytes
            // than requested, whereas read() yields the next byte or -1 at end of stream.
            if (input.read() != GZIP_ID1) {
                return false;
            }
            return input.read() == GZIP_ID2;
        }
    }
}
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
package org.biojava.nbio.core.sequence.io;
2+
3+
import org.biojava.nbio.core.sequence.ProteinSequence;
4+
import org.junit.Assert;
5+
import org.junit.Test;
6+
7+
import java.io.IOException;
8+
import java.nio.file.Path;
9+
import java.nio.file.Paths;
10+
import java.util.List;
11+
import java.util.stream.Collectors;
12+
13+
/**
14+
* Test the functionality of the {@link FastaStreamer} code
15+
*/
16+
public class FastaStreamerTest {
17+
18+
@Test
19+
public void stream() throws IOException {
20+
String file = this.getClass().getResource("PF00104_small.fasta.gz").getFile();
21+
Path path = Paths.get(file);
22+
List<ProteinSequence> sequences;
23+
24+
sequences = FastaStreamer.from(path).stream().collect(Collectors.toList());
25+
Assert.assertEquals("Count", 283, sequences.size());
26+
27+
ProteinSequence sequence;
28+
sequence = sequences.get(0);
29+
Assert.assertEquals("A2D504_ATEGE/1-46", sequence.getOriginalHeader());
30+
sequence = sequences.get(sequences.size()-1);
31+
Assert.assertEquals("Q98SJ1_CHICK/15-61", sequence.getOriginalHeader());
32+
33+
sequences = FastaStreamer.from(path)
34+
.batchSize(2) // Ensure there isn't an edge condition loading the next buffer
35+
.stream()
36+
.collect(Collectors.toList());
37+
Assert.assertEquals("Count", 283, sequences.size());
38+
}
39+
}
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
package org.biojava.nbio.core.util;
2+
3+
import org.junit.Assert;
4+
import org.junit.Test;
5+
6+
import java.io.IOException;
7+
import java.nio.file.Path;
8+
import java.nio.file.Paths;
9+
10+
public class MagicNumberTest {
11+
12+
@Test
13+
public void gzip() throws IOException {
14+
String file = this.getClass().getResource("example.gz").getFile();
15+
Path path = Paths.get(file);
16+
Assert.assertTrue("GZIP file", MagicNumber.isGZIP(path));
17+
18+
file = this.getClass().getResource("build.xml").getFile();
19+
path = Paths.get(file);
20+
Assert.assertFalse("Not a GZIP file", MagicNumber.isGZIP(path));
21+
}
22+
}
Binary file not shown.

0 commit comments

Comments
 (0)