Merge pull request #660 from heuermh/read-rna-sequences

josemduarte · web-flow · commit be509152ae02 · 2017-05-31T12:35:08.000-05:00
Add methods for reading RNA sequences.
diff --git a/biojava-core/src/main/java/org/biojava/nbio/core/sequence/io/FastaReaderHelper.java b/biojava-core/src/main/java/org/biojava/nbio/core/sequence/io/FastaReaderHelper.java
@@ -23,10 +23,12 @@
 
 import org.biojava.nbio.core.sequence.DNASequence;
 import org.biojava.nbio.core.sequence.ProteinSequence;
+import org.biojava.nbio.core.sequence.RNASequence;
 import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
 import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
 import org.biojava.nbio.core.sequence.compound.DNACompoundSet;
 import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
+import org.biojava.nbio.core.sequence.compound.RNACompoundSet;
 
 import java.io.File;
 import java.io.FileInputStream;
@@ -68,6 +70,34 @@ public static LinkedHashMap<String, DNASequence> readFastaDNASequence(File file,
 
 	}
 
+	/**
+	 * Reads RNA sequences from a FASTA file, eagerly or through file-backed proxies. With lazySequenceLoad=true the
+	 * reader is wired to a FileProxyRNASequenceCreator, whose sequence objects are meant to read their residues from the
+	 * disk later; this allows loading large fasta files when only one sequence, looked up by accession id, is needed.
+	 * @param file the FASTA file to parse
+	 * @param lazySequenceLoad false delegates to {@link #readFastaRNASequence(File)}; true uses the file proxy creator
+	 * @return the map returned by FastaReader.process(), or by the File overload when not lazy
+	 * @throws IOException if raised while building or running the reader, or by the File overload
+	 */
+	public static LinkedHashMap<String, RNASequence> readFastaRNASequence(File file, boolean lazySequenceLoad) throws IOException {
+		if (lazySequenceLoad) {
+			// Assemble the reader's collaborators one at a time, in the order the reader takes them.
+			GenericFastaHeaderParser<RNASequence, NucleotideCompound> headerParser =
+					new GenericFastaHeaderParser<RNASequence, NucleotideCompound>();
+
+			// This creator records the file offset and length of each sequence rather than its text.
+			FileProxyRNASequenceCreator lazyCreator = new FileProxyRNASequenceCreator(
+					file, RNACompoundSet.getRNACompoundSet(), new FastaSequenceParser());
+
+			FastaReader<RNASequence, NucleotideCompound> proxyReader =
+					new FastaReader<RNASequence, NucleotideCompound>(file, headerParser, lazyCreator);
+			return proxyReader.process();
+		}
+
+		// Not lazy: hand off to the plain File overload.
+		return readFastaRNASequence(file);
+	}
+
 	/**
 	 * Read a fasta file containing amino acids with setup that would handle most
 	 * cases.
@@ -130,6 +160,35 @@ public static LinkedHashMap<String, DNASequence> readFastaDNASequence(
 		return dnaSequences;
 	}
 
+	/**
+	 * Read fasta RNA sequences from an input stream.
+	 * @param inStream the stream handed to the FastaReader
+	 * @return the map returned by FastaReader.process()
+	 * @throws IOException if raised while building or running the reader
+	 */
+	public static LinkedHashMap<String, RNASequence> readFastaRNASequence(InputStream inStream) throws IOException {
+		GenericFastaHeaderParser<RNASequence, NucleotideCompound> headerParser =
+				new GenericFastaHeaderParser<RNASequence, NucleotideCompound>();
+		RNASequenceCreator sequenceCreator = new RNASequenceCreator(RNACompoundSet.getRNACompoundSet());
+		FastaReader<RNASequence, NucleotideCompound> rnaReader =
+				new FastaReader<RNASequence, NucleotideCompound>(inStream, headerParser, sequenceCreator);
+		return rnaReader.process();
+	}
+
+	/**
+	 * Read a fasta RNA sequence from a file. The stream opened here is closed on the failure path as well.
+	 * @param file the FASTA file to open and parse
+	 * @return the map returned by {@link #readFastaRNASequence(InputStream)}
+	 * @throws IOException if the file cannot be opened, or if parsing or closing the stream fails
+	 */
+	public static LinkedHashMap<String, RNASequence> readFastaRNASequence(
+			File file) throws IOException {
+		FileInputStream inStream = new FileInputStream(file);
+		try {
+			return readFastaRNASequence(inStream);
+		} finally { inStream.close(); } // also runs when parsing throws, so the descriptor is not leaked
+	}
+
 	public static void main(String[] args) throws Exception {
 
 		LinkedHashMap<String, DNASequence> dnaSequences = FastaReaderHelper.readFastaDNASequence(new File("fasta.fna"));
diff --git a/biojava-core/src/main/java/org/biojava/nbio/core/sequence/io/FileProxyRNASequenceCreator.java b/biojava-core/src/main/java/org/biojava/nbio/core/sequence/io/FileProxyRNASequenceCreator.java
@@ -0,0 +1,112 @@
+/*
+ *                    BioJava development code
+ *
+ * This code may be freely distributed and modified under the
+ * terms of the GNU Lesser General Public Licence.  This should
+ * be distributed with the code.  If you do not have a copy,
+ * see:
+ *
+ *      http://www.gnu.org/copyleft/lesser.html
+ *
+ * Copyright for this code is held jointly by the individual
+ * authors.  These should be listed in @author doc comments.
+ *
+ * For more information on the BioJava project and its aims,
+ * or to join the biojava-l mailing list, visit the home page
+ * at:
+ *
+ *      http://www.biojava.org/
+ *
+ * Created on 01-21-2010
+ */
+package org.biojava.nbio.core.sequence.io;
+
+import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
+import org.biojava.nbio.core.sequence.RNASequence;
+import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
+import org.biojava.nbio.core.sequence.io.template.SequenceCreatorInterface;
+import org.biojava.nbio.core.sequence.io.template.SequenceParserInterface;
+import org.biojava.nbio.core.sequence.loader.SequenceFileProxyLoader;
+import org.biojava.nbio.core.sequence.template.AbstractSequence;
+import org.biojava.nbio.core.sequence.template.CompoundSet;
+import org.biojava.nbio.core.sequence.template.ProxySequenceReader;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.List;
+
+/**
+ * This class is a good example of using the SequenceCreatorInterface where during parsing of the stream
+ * the sequence and the offset index are passed to create a Protein sequence that will be loaded in lazily.
+ * This way you can load very large fasta files and store accession id and delay loading the sequence to save
+ * memory. The index is the file stream offset so when a RNASequence has a call to getSequence() the
+ * SequenceFileProxyLoader will open the file and offset to the index and retrieve the sequence.
+ *
+ * Same approach can be used for genome sequence data stored in a local fasta file, in a database or via http
+ * interface to a remote server
+ *
+ * @author Scooter Willis <willishf at gmail dot com>
+ */
+public class FileProxyRNASequenceCreator implements
+		SequenceCreatorInterface<NucleotideCompound> {
+
+	CompoundSet<NucleotideCompound> compoundSet = null;
+	File file = null;
+	SequenceParserInterface sequenceParser;
+
+	/**
+	 * Need File so that we can store full path name in SequenceFileProxyLoader for Random File access as a quick read
+	 * @param fastaFile
+	 * @param compoundSet
+	 */
+	public FileProxyRNASequenceCreator(File file,
+			CompoundSet<NucleotideCompound> compoundSet,
+			SequenceParserInterface sequenceParser) {
+		this.compoundSet = compoundSet;
+		this.file = file;
+		this.sequenceParser = sequenceParser;
+	}
+
+	/**
+	 * Even though we are passing in the sequence we really only care about the length of the sequence and the offset
+	 * index in the fasta file.
+	 * @param sequence
+	 * @param index
+	 * @return
+	 * @throws CompoundNotFoundException
+	 * @throws IOException
+	 */
+	@Override
+	public AbstractSequence<NucleotideCompound> getSequence(String sequence, long index ) throws CompoundNotFoundException, IOException {
+		SequenceFileProxyLoader<NucleotideCompound> sequenceFileProxyLoader = new SequenceFileProxyLoader<NucleotideCompound>(
+				file,
+				sequenceParser,
+				index,
+				sequence.length(),
+				compoundSet);
+		return new RNASequence(sequenceFileProxyLoader, compoundSet);
+	}
+
+	/**
+	 * Should be able to extend the same concept to a remote URL call or database connection. Not supported yet
+	 * @param proxyLoader
+	 * @param index
+	 * @return
+	 */
+	@Override
+	public AbstractSequence<NucleotideCompound> getSequence(
+			ProxySequenceReader<NucleotideCompound> proxyLoader, long index) {
+		throw new UnsupportedOperationException("Not supported yet.");
+	}
+
+	/**
+	 * Not sure of use case and currently not supported
+	 * @param list
+	 * @return
+	 */
+	@Override
+	public AbstractSequence<NucleotideCompound> getSequence(
+			List<NucleotideCompound> list) {
+		throw new UnsupportedOperationException("Not supported yet.");
+	}
+}
diff --git a/biojava-core/src/main/java/org/biojava/nbio/core/sequence/io/GenbankReaderHelper.java b/biojava-core/src/main/java/org/biojava/nbio/core/sequence/io/GenbankReaderHelper.java
@@ -23,10 +23,12 @@
 
 import org.biojava.nbio.core.sequence.DNASequence;
 import org.biojava.nbio.core.sequence.ProteinSequence;
+import org.biojava.nbio.core.sequence.RNASequence;
 import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
 import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
 import org.biojava.nbio.core.sequence.compound.DNACompoundSet;
 import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
+import org.biojava.nbio.core.sequence.compound.RNACompoundSet;
 import org.biojava.nbio.core.sequence.template.AbstractSequence;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -99,6 +101,35 @@ public static LinkedHashMap<String, ProteinSequence> readGenbankProteinSequence(
 		return GenbankProxyReader.process();
 
 	}
+
+	/**
+	 * Reads RNA sequences from a Genbank file, eagerly or through file-backed proxies. With lazySequenceLoad=true the
+	 * reader is wired to a FileProxyRNASequenceCreator, whose sequence objects are meant to read their residues from the
+	 * disk later; this allows loading large Genbank files when only one sequence, looked up by accession id, is needed.
+	 * @param file the Genbank file to parse
+	 * @param lazySequenceLoad false delegates to {@link #readGenbankRNASequence(File)}; true uses the file proxy creator
+	 * @return the map returned by GenbankReader.process(), or by the File overload when not lazy
+	 * @throws Exception if raised while building or running the reader, or by the File overload
+	 */
+	public static LinkedHashMap<String, RNASequence> readGenbankRNASequence(File file, boolean lazySequenceLoad) throws Exception {
+		if (lazySequenceLoad) {
+			// Assemble the reader's collaborators one at a time, in the order the reader takes them.
+			GenericGenbankHeaderParser<RNASequence, NucleotideCompound> headerParser =
+					new GenericGenbankHeaderParser<RNASequence, NucleotideCompound>();
+
+			// This creator records the file offset and length of each sequence rather than its text.
+			FileProxyRNASequenceCreator lazyCreator = new FileProxyRNASequenceCreator(
+					file, RNACompoundSet.getRNACompoundSet(), new GenbankSequenceParser<AbstractSequence<NucleotideCompound>, NucleotideCompound>());
+
+			GenbankReader<RNASequence, NucleotideCompound> proxyReader =
+					new GenbankReader<RNASequence, NucleotideCompound>(file, headerParser, lazyCreator);
+			return proxyReader.process();
+		}
+
+		// Not lazy: hand off to the plain File overload.
+		return readGenbankRNASequence(file);
+	}
+
 	/**
 	 * Read a Genbank file containing amino acids with setup that would handle most
 	 * cases.
@@ -160,6 +191,34 @@ public static LinkedHashMap<String, DNASequence> readGenbankDNASequence(
 		inStream.close();
 		return dnaSequences;
 	}
+	/**
+	 * Read Genbank RNA sequences from an input stream.
+	 * @param inStream the stream handed to the GenbankReader
+	 * @return the map returned by GenbankReader.process()
+	 * @throws Exception if raised while building or running the reader
+	 */
+	public static LinkedHashMap<String, RNASequence> readGenbankRNASequence(InputStream inStream) throws Exception {
+		GenericGenbankHeaderParser<RNASequence, NucleotideCompound> headerParser =
+				new GenericGenbankHeaderParser<RNASequence, NucleotideCompound>();
+		RNASequenceCreator sequenceCreator = new RNASequenceCreator(RNACompoundSet.getRNACompoundSet());
+		GenbankReader<RNASequence, NucleotideCompound> rnaReader =
+				new GenbankReader<RNASequence, NucleotideCompound>(inStream, headerParser, sequenceCreator);
+		return rnaReader.process();
+	}
+
+	/**
+	 * Read a Genbank RNA sequence from a file. The stream opened here is closed on the failure path as well.
+	 * @param file the Genbank file to open and parse
+	 * @return the map returned by {@link #readGenbankRNASequence(InputStream)}
+	 * @throws Exception if the file cannot be opened, or if parsing or closing the stream fails
+	 */
+	public static LinkedHashMap<String, RNASequence> readGenbankRNASequence(
+			File file) throws Exception {
+		FileInputStream inStream = new FileInputStream(file);
+		try {
+			return readGenbankRNASequence(inStream);
+		} finally { inStream.close(); } // also runs when parsing throws, so the descriptor is not leaked
+	}
 
 	public static void main(String[] args) throws Exception {