diff --git a/.gitattributes b/.gitattributes index a90ac4bbb2..9e293e1595 100644 --- a/.gitattributes +++ b/.gitattributes @@ -37,7 +37,7 @@ *.sto text *.tsv text *.txt text -*.xml text +*.xml text eol=lf #Causing decompression test to fail when line endings in org/biojava/nbio/core/util/build.xml are crlf *.xsd text *.yml text diff --git a/biojava-core/src/main/java/org/biojava/nbio/core/sequence/io/GenbankReader.java b/biojava-core/src/main/java/org/biojava/nbio/core/sequence/io/GenbankReader.java index fb575f8ab1..951cce40c0 100644 --- a/biojava-core/src/main/java/org/biojava/nbio/core/sequence/io/GenbankReader.java +++ b/biojava-core/src/main/java/org/biojava/nbio/core/sequence/io/GenbankReader.java @@ -26,15 +26,8 @@ package org.biojava.nbio.core.sequence.io; import org.biojava.nbio.core.exceptions.CompoundNotFoundException; -import org.biojava.nbio.core.sequence.DNASequence; import org.biojava.nbio.core.sequence.DataSource; -import org.biojava.nbio.core.sequence.ProteinSequence; import org.biojava.nbio.core.sequence.TaxonomyID; -import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; -import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet; -import org.biojava.nbio.core.sequence.compound.DNACompoundSet; -import org.biojava.nbio.core.sequence.compound.NucleotideCompound; -import org.biojava.nbio.core.sequence.features.AbstractFeature; import org.biojava.nbio.core.sequence.features.DBReferenceInfo; import org.biojava.nbio.core.sequence.io.template.SequenceCreatorInterface; import org.biojava.nbio.core.sequence.io.template.SequenceHeaderParserInterface; @@ -43,13 +36,20 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.*; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; import java.util.ArrayList; import java.util.HashMap; import 
java.util.LinkedHashMap; +import java.util.List; /** - * Use GenbankReaderHelper as an example of how to use this class where GenbankReaderHelper should be the + * Use {@link GenbankReaderHelper} as an example of how to use this class where {@link GenbankReaderHelper} should be the * primary class used to read Genbank files * */ @@ -66,9 +66,9 @@ public boolean isClosed() { } /** - * If you are going to use FileProxyProteinSequenceCreator then do not use this constructor because we need details about - * local file offsets for quick reads. InputStreams does not give you the name of the stream to access quickly via file seek. A seek in - * an inputstream is forced to read all the data so you don't gain anything. + * If you are going to use {@link FileProxyProteinSequenceCreator} then do not use this constructor because we need details about + * local file offsets for quick reads. {@link InputStream} does not give you the name of the stream to access quickly via file seek. A seek in + * an {@link InputStream} is forced to read all the data so you don't gain anything. * @param is * @param headerParser * @param sequenceCreator @@ -107,18 +107,21 @@ public GenbankReader( /** * The parsing is done in this method.
- * This method tries to process all the available Genbank records + * This method reads all the available Genbank records * in the File or InputStream, closes the underlying resource, * and returns the results in {@link LinkedHashMap}.
- * You don't need to call {@link #close()} after calling this method. + * You don't need to call {@link GenbankReader#close()} after calling this method. * @see #process(int) * @return {@link HashMap} containing all the parsed Genbank records * present, starting current fileIndex onwards. * @throws IOException * @throws CompoundNotFoundException + * @throws OutOfMemoryError if the input resource is larger than the allocated heap. */ public LinkedHashMap process() throws IOException, CompoundNotFoundException { - return process(-1); + LinkedHashMap result = process(-1); + close(); + return result; } /** @@ -137,13 +140,18 @@ public LinkedHashMap process() throws IOException, CompoundNotFoundExc * @see #process() * @author Amr AL-Hossary * @since 3.0.6 - * @param max maximum number of records to return, -1 for infinity. + * @param max maximum number of records to return. * @return {@link HashMap} containing maximum max parsed Genbank records * present, starting current fileIndex onwards. * @throws IOException * @throws CompoundNotFoundException */ public LinkedHashMap process(final int max) throws IOException, CompoundNotFoundException { + + if(closed){ + throw new IOException("Cannot perform action: resource has been closed."); + } + LinkedHashMap sequences = new LinkedHashMap<>(); @SuppressWarnings("unchecked") int i=0; @@ -158,12 +166,9 @@ public LinkedHashMap process(final int max) throws IOException, Compou genbankParser.getSequenceHeaderParser().parseHeader(genbankParser.getHeader(), sequence); // add features to new sequence - for (String k: genbankParser.getFeatures().keySet()){ - for (AbstractFeature f: genbankParser.getFeatures(k)){ - //f.getLocations().setSequence(sequence); // can't set proper sequence source to features. It is actually needed? Don't think so... 
- sequence.addFeature(f); - } - } + genbankParser.getFeatures().values().stream() + .flatMap(List::stream) + .forEach(sequence::addFeature); // add taxonomy ID to new sequence ArrayList dbQualifier = genbankParser.getDatabaseReferences().get("db_xref"); @@ -175,10 +180,6 @@ public LinkedHashMap process(final int max) throws IOException, Compou sequences.put(sequence.getAccession().getID(), sequence); } - if (max < 0) { - close(); - } - return sequences; } @@ -187,33 +188,9 @@ public void close() { bufferedReader.close(); this.closed = true; } catch (IOException e) { - logger.error("Couldn't close the reader. {}", e.getMessage()); + logger.error("Couldn't close the reader.", e); this.closed = false; } } - - public static void main(String[] args) throws Exception { - String proteinFile = "src/test/resources/BondFeature.gb"; - FileInputStream is = new FileInputStream(proteinFile); - - GenbankReader proteinReader = new GenbankReader<>(is, new GenericGenbankHeaderParser<>(), new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet())); - LinkedHashMap proteinSequences = proteinReader.process(); - System.out.println(proteinSequences); - - String inputFile = "src/test/resources/NM_000266.gb"; - is = new FileInputStream(inputFile); - GenbankReader dnaReader = new GenbankReader<>(is, new GenericGenbankHeaderParser<>(), new DNASequenceCreator(DNACompoundSet.getDNACompoundSet())); - LinkedHashMap dnaSequences = dnaReader.process(); - System.out.println(dnaSequences); - - String crazyFile = "src/test/resources/CraftedFeature.gb"; - is = new FileInputStream(crazyFile); - GenbankReader crazyReader = new GenbankReader<>(is, new GenericGenbankHeaderParser<>(), new DNASequenceCreator(DNACompoundSet.getDNACompoundSet())); - LinkedHashMap crazyAnnotatedSequences = crazyReader.process(); - - is.close(); - System.out.println(crazyAnnotatedSequences); - } - } diff --git a/biojava-core/src/main/java/org/biojava/nbio/core/sequence/loader/GenbankProxySequenceReader.java 
b/biojava-core/src/main/java/org/biojava/nbio/core/sequence/loader/GenbankProxySequenceReader.java index 045900263a..2b74f9cacd 100644 --- a/biojava-core/src/main/java/org/biojava/nbio/core/sequence/loader/GenbankProxySequenceReader.java +++ b/biojava-core/src/main/java/org/biojava/nbio/core/sequence/loader/GenbankProxySequenceReader.java @@ -32,7 +32,11 @@ import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet; import org.biojava.nbio.core.sequence.compound.DNACompoundSet; import org.biojava.nbio.core.sequence.compound.NucleotideCompound; -import org.biojava.nbio.core.sequence.features.*; +import org.biojava.nbio.core.sequence.features.AbstractFeature; +import org.biojava.nbio.core.sequence.features.DBReferenceInfo; +import org.biojava.nbio.core.sequence.features.DatabaseReferenceInterface; +import org.biojava.nbio.core.sequence.features.FeatureRetriever; +import org.biojava.nbio.core.sequence.features.FeaturesKeyWordInterface; import org.biojava.nbio.core.sequence.io.GenbankSequenceParser; import org.biojava.nbio.core.sequence.io.GenericGenbankHeaderParser; import org.biojava.nbio.core.sequence.template.AbstractSequence; @@ -41,7 +45,14 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.*; +import java.io.BufferedInputStream; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; import java.net.URL; import java.net.URLConnection; import java.util.ArrayList; @@ -54,7 +65,7 @@ */ public class GenbankProxySequenceReader extends StringProxySequenceReader implements FeaturesKeyWordInterface, DatabaseReferenceInterface, FeatureRetriever { - private final static Logger logger = LoggerFactory.getLogger(GenbankProxySequenceReader.class); + private static final Logger logger = LoggerFactory.getLogger(GenbankProxySequenceReader.class); private static final String eutilBaseURL = 
"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"; // private String genbankDirectoryCache = null; diff --git a/biojava-core/src/test/java/org/biojava/nbio/core/sequence/io/GenbankReaderTest.java b/biojava-core/src/test/java/org/biojava/nbio/core/sequence/io/GenbankReaderTest.java index c9f3a0cd8a..4845da4cb1 100644 --- a/biojava-core/src/test/java/org/biojava/nbio/core/sequence/io/GenbankReaderTest.java +++ b/biojava-core/src/test/java/org/biojava/nbio/core/sequence/io/GenbankReaderTest.java @@ -20,13 +20,6 @@ */ package org.biojava.nbio.core.sequence.io; -import java.io.IOException; -import java.io.InputStream; -import java.util.ArrayList; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; - import org.biojava.nbio.core.exceptions.CompoundNotFoundException; import org.biojava.nbio.core.sequence.DNASequence; import org.biojava.nbio.core.sequence.ProteinSequence; @@ -46,8 +39,20 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.BufferedInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; + import static org.hamcrest.CoreMatchers.is; -import static org.junit.Assert.*; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertThat; +import static org.junit.Assert.assertTrue; /** * @@ -161,7 +166,7 @@ public void testProcess() throws Exception { */ @Test public void testPartialProcess() throws IOException, CompoundNotFoundException, NoSuchFieldException { - InputStream inStream = this.getClass().getResourceAsStream("/two-dnaseqs.gb"); + CheckableInputStream inStream = new CheckableInputStream(this.getClass().getResourceAsStream("/two-dnaseqs.gb")); GenbankReader genbankDNA = new GenbankReader<>( @@ -173,12 +178,14 @@ public void testPartialProcess() throws IOException, 
CompoundNotFoundException, // First call to process(1) returns the first sequence LinkedHashMap dnaSequences = genbankDNA.process(1); + assertFalse(inStream.isclosed()); assertNotNull(dnaSequences); assertEquals(1, dnaSequences.size()); assertNotNull(dnaSequences.get("vPetite")); // Second call to process(1) returns the second sequence dnaSequences = genbankDNA.process(1); + assertFalse(inStream.isclosed()); assertNotNull(dnaSequences); assertEquals(1, dnaSequences.size()); assertNotNull(dnaSequences.get("sbFDR")); @@ -186,14 +193,14 @@ public void testPartialProcess() throws IOException, CompoundNotFoundException, assertFalse(genbankDNA.isClosed()); genbankDNA.close(); assertTrue(genbankDNA.isClosed()); - + assertTrue(inStream.isclosed()); } @Test public void CDStest() throws Exception { logger.info("CDS Test"); - InputStream inStream = this.getClass().getResourceAsStream("/BondFeature.gb"); + CheckableInputStream inStream = new CheckableInputStream(this.getClass().getResourceAsStream("/BondFeature.gb")); assertNotNull(inStream); GenbankReader GenbankProtein @@ -203,7 +210,7 @@ public void CDStest() throws Exception { new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet()) ); LinkedHashMap proteinSequences = GenbankProtein.process(); - inStream.close(); + assertTrue(inStream.isclosed()); Assert.assertTrue(proteinSequences.size() == 1); @@ -260,4 +267,27 @@ public void testNcbiExpandedAccessionFormats() throws Exception { DNASequence header2 = readGenbankResource("/empty_header2.gb"); assertEquals("AZZZAA02123456789 10000000000 bp DNA linear PRI 15-OCT-2018", header2.getOriginalHeader()); } + + /** + * Helper class to be able to verify the closed state of the input stream. 
+ */ + private class CheckableInputStream extends BufferedInputStream { + + private boolean closed; + + CheckableInputStream(InputStream in) { + super(in); + closed = false; + } + + @Override + public void close() throws IOException { + super.close(); + closed = true; + } + + boolean isclosed() { + return closed; + } + } } diff --git a/biojava-core/src/test/java/org/biojava/nbio/core/sequence/loader/GenbankProxySequenceReaderTest.java b/biojava-core/src/test/java/org/biojava/nbio/core/sequence/loader/GenbankProxySequenceReaderTest.java index 6d2180a641..2205de4b27 100644 --- a/biojava-core/src/test/java/org/biojava/nbio/core/sequence/loader/GenbankProxySequenceReaderTest.java +++ b/biojava-core/src/test/java/org/biojava/nbio/core/sequence/loader/GenbankProxySequenceReaderTest.java @@ -24,21 +24,27 @@ import org.biojava.nbio.core.sequence.ProteinSequence; import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet; +import org.biojava.nbio.core.sequence.features.AbstractFeature; import org.biojava.nbio.core.sequence.features.FeatureInterface; +import org.biojava.nbio.core.sequence.features.Qualifier; import org.biojava.nbio.core.sequence.template.AbstractSequence; import org.junit.Assert; +import org.junit.Before; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.Parameterized; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.File; +import java.io.FileOutputStream; import java.io.IOException; +import java.io.InputStream; +import java.nio.channels.Channels; +import java.nio.channels.ReadableByteChannel; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; -import org.biojava.nbio.core.sequence.features.AbstractFeature; -import org.biojava.nbio.core.sequence.features.Qualifier; /** * Testing example for issue #834 @@ -58,7 +64,7 @@ public GenbankProxySequenceReaderTest(String gi) { @Parameterized.Parameters 
public static Collection getExamples() { - String[][] out = new String[][]{ + String[][] accessorIds = new String[][]{ {"399235158"}, {"7525057"}, {"379015144"}, @@ -69,9 +75,52 @@ public static Collection getExamples() { {"254839678"} }; - return Arrays.asList(out); + return Arrays.asList(accessorIds); + } + + /** + * In {@link GenbankProxySequenceReader} there is a check to see if the requested files are already in the temp + * directory before attempting to retrieve them from the remote server, so simply copying the test files to the temp + * directory avoids calling out to the server and hitting a 429 status code from the server which fails the build. + * @throws IOException if a test file cannot be copied + */ + @Before + public void copyTestFiles() throws IOException { + Collection accessorIds = getExamples(); + for (String[] arr: accessorIds) { + copyTestFileToWorkingDirectory(arr[0]+".gb"); + } } + /** + * Convenience method for {@link GenbankProxySequenceReaderTest#copyTestFiles()} + * @param filename name of the file to copy from the resource folder + * @throws IOException when something goes wrong with copying the files. + */ + private void copyTestFileToWorkingDirectory(String filename) throws IOException { + String destRoot = System.getProperty("java.io.tmpdir"); + + //if the directory does not end with a slash or backslash then add one + if(!(destRoot.endsWith("/") || destRoot.endsWith("\\"))){ + destRoot += destRoot.contains("/")? 
"/" : "\\"; + } + + String dest = destRoot + filename; + String src = "org/biojava/nbio/core/sequence/loader/" + filename; + + //Remove any pre-existing files + File d = new File(dest); + d.delete(); + + try(FileOutputStream destination = new FileOutputStream(d); + InputStream is = this.getClass().getClassLoader().getResourceAsStream(src); + ReadableByteChannel source = Channels.newChannel(is)) { + + destination.getChannel().transferFrom(source, 0, Long.MAX_VALUE); + } + } + + @Test public void testFeatures() throws IOException, InterruptedException, CompoundNotFoundException { logger.info("run test for protein: {}", gi); @@ -120,9 +169,6 @@ so it should be done here (manualy). Assert.assertTrue(!codedBy.isEmpty()); logger.info("\t\tcoded_by: {}", codedBy); } - - // genbank has limits on requests per second, we need to give it some time for next test or otherwise we get 429 http error codes - JD 2018-12-14 - Thread.sleep(500); } @Test @@ -161,9 +207,5 @@ public void testProteinSequenceFactoring() throws Exception { } else { logger.info("target {} has no CDS", gi); } - - // genbank has limits on requests per second, we need to give it some time for next test or otherwise we get 429 http error codes - JD 2018-12-14 - Thread.sleep(500); - } } diff --git a/biojava-core/src/test/resources/org/biojava/nbio/core/sequence/loader/152970917.gb b/biojava-core/src/test/resources/org/biojava/nbio/core/sequence/loader/152970917.gb new file mode 100644 index 0000000000..70d24fa039 --- /dev/null +++ b/biojava-core/src/test/resources/org/biojava/nbio/core/sequence/loader/152970917.gb @@ -0,0 +1,70 @@ +LOCUS YP_001336026 324 aa linear CON 16-DEC-2014 +DEFINITION lipid A biosynthesis (KDO)2-(lauroyl)-lipid IVA acyltransferase + [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]. +ACCESSION YP_001336026 +VERSION YP_001336026.1 +DBLINK BioProject: PRJNA57619 +DBSOURCE REFSEQ: accession NC_009648.1 +KEYWORDS RefSeq. +SOURCE Klebsiella pneumoniae subsp. 
pneumoniae MGH 78578 + ORGANISM Klebsiella pneumoniae subsp. pneumoniae MGH 78578 + Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales; + Enterobacteriaceae; Klebsiella. +REFERENCE 1 (residues 1 to 324) + CONSRTM NCBI Genome Project + TITLE Direct Submission + JOURNAL Submitted (09-JUL-2007) National Center for Biotechnology + Information, NIH, Bethesda, MD 20894, USA +REFERENCE 2 (residues 1 to 324) + AUTHORS McClelland,M., Sanderson,E.K., Spieth,J., Clifton,W.S., + Latreille,P., Sabo,A., Pepin,K., Bhonagiri,V., Porwollik,S., Ali,J. + and Wilson,R.K. + CONSRTM The Klebsiella pneumonia Genome Sequencing Project + TITLE Direct Submission + JOURNAL Submitted (06-SEP-2006) Genetics, Genome Sequencing Center, 4444 + Forest Park Parkway, St. Louis, MO 63108, USA +COMMENT VALIDATED REFSEQ: This record has undergone validation or + preliminary review. The reference sequence was derived from + ABR77796. + Method: conceptual translation. +FEATURES Location/Qualifiers + source 1..324 + /organism="Klebsiella pneumoniae subsp. 
pneumoniae MGH + 78578" + /strain="ATCC 700721; MGH 78578" + /sub_species="pneumoniae" + /db_xref="ATCC:700721" + /db_xref="taxon:272620" + Protein 1..324 + /product="lipid A biosynthesis (KDO)2-(lauroyl)-lipid IVA + acyltransferase" + /calculated_mol_wt=37353 + Region 1..310 + /region_name="PRK08943" + /note="lipid A biosynthesis (KDO)2-(lauroyl)-lipid IVA + acyltransferase; Validated" + /db_xref="CDD:236355" + Site order(139,142,144,161..164,210..212) + /site_type="other" + /note="putative acyl-acceptor binding pocket" + /db_xref="CDD:153246" + CDS 1..324 + /gene="msbB" + /locus_tag="KPN_02370" + /coded_by="complement(NC_009648.1:2595658..2596632)" + /inference="ab initio prediction:Genemark:2.0" + /inference="protein motif:Pfam:IPR004960" + /note="Transfers myristate or laurate, activated on ACP, + to the lipid IVA moiety of (KDO)2-(lauroyl)-lipid IVA" + /transl_table=11 + /db_xref="GeneID:5340071" +CONTIG join(WP_002911442.1:1..324) +ORIGIN + 1 metkknnief ipkfeksfll prywgawlgv fafagialtp psfrdpllgk lgrlvgrlak + 61 ssrrraqinl lycfpeksey ereaiidamy asapqamvmm aelglrdpqk ilarvdwqgk + 121 aiidemqrnn ekviflvpha wgvdipamlm asggqkmaam fhnqgnpvfd yvwntvrrrf + 181 ggrmharndg ikpfiqsvrq gywgyylpdq dhgaehsefv dffatykatl paigrlmkvc + 241 rarvvplfpv ydskthrltv lvrppmddll daddttiarr mneevevfvk phteqytwil + 301 kllktrkpge iepykrkelf pkkk +// + diff --git a/biojava-core/src/test/resources/org/biojava/nbio/core/sequence/loader/254839678.gb b/biojava-core/src/test/resources/org/biojava/nbio/core/sequence/loader/254839678.gb new file mode 100644 index 0000000000..6a3022cbe7 --- /dev/null +++ b/biojava-core/src/test/resources/org/biojava/nbio/core/sequence/loader/254839678.gb @@ -0,0 +1,114 @@ +LOCUS 3IAN_A 321 aa linear BCT 24-NOV-2018 +DEFINITION Chain A, Chitinase. +ACCESSION 3IAN_A +VERSION 3IAN_A +DBSOURCE pdb: molecule 3IAN, chain 65, release Nov 21, 2018; + deposition: Jul 14, 2009; + class: HYDROLASE; + source: Mmdb_id: 999999, Pdb_id 1: 3IAN; + Exp. 
method: X-ray Diffraction. +KEYWORDS . +SOURCE Lactococcus lactis subsp. lactis + ORGANISM Lactococcus lactis subsp. lactis + Bacteria; Firmicutes; Bacilli; Lactobacillales; Streptococcaceae; + Lactococcus. +REFERENCE 1 (residues 1 to 321) + AUTHORS Bonanno,J.B., Rutter,M., Bain,K.T., Miller,S., Ozyurt,S., + Sauder,J.M., Burley,S.K. and Almo,S.C. + TITLE Crystal structure of a chitinase from Lactococcus lactis subsp. + lactis + JOURNAL Unpublished +REFERENCE 2 (residues 1 to 321) + AUTHORS Bonanno,J.B., Rutter,M., Bain,K.T., Miller,S., Ozyurt,S., + Sauder,J.M., Burley,S.K., Almo,S.C. and New York SGX Research + Center for Structural Genomics (NYSGXRC). + TITLE Direct Submission + JOURNAL Submitted (14-JUL-2009) +COMMENT Crystal structure of a chitinase from Lactococcus lactis subsp. + lactis. +FEATURES Location/Qualifiers + source 1..321 + /organism="Lactococcus lactis subsp. lactis" + /sub_species="lactis" + /db_xref="taxon:1360" + Het join(bond(115),bond(117),bond(76)) + /heterogen="(NA,2572)" + Region 4..313 + /region_name="Chi1" + /note="Chitinase [Carbohydrate transport and metabolism]; + COG3469" + /db_xref="CDD:226000" + Region 5..288 + /region_name="Glyco_hydro_18" + /note="Glycosyl hydrolases family 18; pfam00704" + /db_xref="CDD:279094" + SecStr 5..12 + /sec_str_type="sheet" + /note="strand 1" + Site order(10,46,122,124,189,191,283) + /site_type="active" + /note="putative active site [active]" + /db_xref="CDD:119350" + SecStr 24..28 + /sec_str_type="sheet" + /note="strand 2" + SecStr 40..45 + /sec_str_type="sheet" + /note="strand 3" + SecStr 65..78 + /sec_str_type="helix" + /note="helix 1" + SecStr 80..89 + /sec_str_type="sheet" + /note="strand 4" + SecStr 100..114 + /sec_str_type="helix" + /note="helix 2" + SecStr 117..124 + /sec_str_type="sheet" + /note="strand 5" + SecStr 133..151 + /sec_str_type="helix" + /note="helix 3" + SecStr 155..163 + /sec_str_type="sheet" + /note="strand 6" + SecStr 172..180 + /sec_str_type="helix" + /note="helix 4" + SecStr 
184..190 + /sec_str_type="sheet" + /note="strand 7" + SecStr 196..201 + /sec_str_type="sheet" + /note="strand 8" + SecStr 204..209 + /sec_str_type="sheet" + /note="strand 9" + SecStr 215..228 + /sec_str_type="helix" + /note="helix 5" + SecStr 240..246 + /sec_str_type="sheet" + /note="strand 10" + SecStr 261..273 + /sec_str_type="helix" + /note="helix 6" + SecStr 278..283 + /sec_str_type="sheet" + /note="strand 11" + SecStr 289..293 + /sec_str_type="sheet" + /note="strand 12" + SecStr 300..307 + /sec_str_type="helix" + /note="helix 7" +ORIGIN + 1 msldkvlvgy whnwkstgkd gykggssadf nlsstqegyn vinvsfmktp egqtlptfkp + 61 ynktdtefra eisklnaegk svlialggad ahielkksqe sdfvneiirl vdtygfdgld + 121 idleqaaiea adnqtvipsa lkkvkdhyrk dgknfmitma pefpyltssg kyapyinnld + 181 syydfinpqy ynqggdgfwd sdlnmwisqs ndekkedfly gltqrlvtgt dgfikipask + 241 fviglpsnnd aaatgyvkdp navknalnrl kasgneikgl mtwsvnwdag tnsngekynn + 301 tfvntyapml fnneghhhhh h +// + diff --git a/biojava-core/src/test/resources/org/biojava/nbio/core/sequence/loader/379015144.gb b/biojava-core/src/test/resources/org/biojava/nbio/core/sequence/loader/379015144.gb new file mode 100644 index 0000000000..667440c7a7 --- /dev/null +++ b/biojava-core/src/test/resources/org/biojava/nbio/core/sequence/loader/379015144.gb @@ -0,0 +1,66 @@ +LOCUS YP_005291380 338 aa linear CON 17-DEC-2014 +DEFINITION leukocidin/hemolysin toxin family protein [Staphylococcus aureus + subsp. aureus VC40]. +ACCESSION YP_005291380 +VERSION YP_005291380.1 +DBLINK BioProject: PRJNA88071 +DBSOURCE REFSEQ: accession NC_016912.1 +KEYWORDS RefSeq. +SOURCE Staphylococcus aureus subsp. aureus VC40 + ORGANISM Staphylococcus aureus subsp. aureus VC40 + Bacteria; Firmicutes; Bacilli; Bacillales; Staphylococcus. +REFERENCE 1 (residues 1 to 338) + AUTHORS Sass,P., Berscheid,A., Jansen,A., Oedenkoven,M., Szekat,C., + Strittmatter,A., Gottschalk,G. and Bierbaum,G. 
+ TITLE Genome sequence of Staphylococcus aureus VC40, a vancomycin- and + daptomycin-resistant strain, to study the genetics of development + of resistance to currently applied last-resort antibiotics + JOURNAL J. Bacteriol. 194 (8), 2107-2108 (2012) + PUBMED 22461548 +REFERENCE 2 (residues 1 to 338) + CONSRTM NCBI Genome Project + TITLE Direct Submission + JOURNAL Submitted (21-FEB-2012) National Center for Biotechnology + Information, NIH, Bethesda, MD 20894, USA +REFERENCE 3 (residues 1 to 338) + AUTHORS Sass,P., Berscheid,A., Jansen,A., Oedenkoven,M., Szekat,C., + Strittmatter,A., Gottschalk,G. and Bierbaum,G. + TITLE Direct Submission + JOURNAL Submitted (25-AUG-2011) Institute of Medical Microbiology, + Immunology and Parasitology, University of Bonn, Sigmund-Freud-Str. + 25, Bonn 53105, Germany +COMMENT PROVISIONAL REFSEQ: This record has not yet been subject to final + NCBI review. The reference sequence is identical to AEZ37946. + Method: conceptual translation. +FEATURES Location/Qualifiers + source 1..338 + /organism="Staphylococcus aureus subsp. 
aureus VC40" + /strain="VC40" + /sub_species="aureus" + /db_xref="taxon:1028799" + /country="Germany" + /collection_date="2002" + /note="laboratory mutant selected for 60 microgram per ml + vancomycin resistance" + Protein 1..338 + /product="leukocidin/hemolysin toxin family protein" + /calculated_mol_wt=38555 + Region 65..323 + /region_name="Leukocidin" + /note="Leukocidin/Hemolysin toxin family; pfam07968" + /db_xref="CDD:311773" + CDS 1..338 + /locus_tag="SAVC_08965" + /coded_by="complement(NC_016912.1:1946987..1948003)" + /transl_table=11 + /db_xref="GeneID:11869971" +CONTIG join(WP_000595324.1:1..338) +ORIGIN + 1 mikqlcknit ictlalsttf tvlpatsfak inseikqvse knldgdtkmy trtattsdsq + 61 knitqslqfn fltepnydke tvfikakgti gsglrildpn gywnstlrwp gsysvsiqnv + 121 ddnnntnvtd fapknqdesr evkytygykt ggdfsinrgg ltgnitkesn ysetisyqqp + 181 syrtlldqst shkgvgwkve ahlinnmghd htrqltndsd nrtkseifsl trngnlwakd + 241 nftpkdkmpv tvsegfnpef lavmshdkkd kgksqfvvhy krsmdefkid wnrhgfwgyw + 301 sgenhvdkke eklsalyevd wkthnvkfvk vlndnekk +// + diff --git a/biojava-core/src/test/resources/org/biojava/nbio/core/sequence/loader/381353147.gb b/biojava-core/src/test/resources/org/biojava/nbio/core/sequence/loader/381353147.gb new file mode 100644 index 0000000000..b965e249c5 --- /dev/null +++ b/biojava-core/src/test/resources/org/biojava/nbio/core/sequence/loader/381353147.gb @@ -0,0 +1,186 @@ +LOCUS 4AE0_A 535 aa linear BCT 10-OCT-2012 +DEFINITION Chain A, Diphtheria Toxin. +ACCESSION 4AE0_A +VERSION 4AE0_A +DBSOURCE pdb: molecule 4AE0, chain 65, release Apr 18, 2012; + deposition: Jan 4, 2012; + class: Toxin; + source: Mmdb_id: 98377, Pdb_id 1: 4AE0; + Exp. method: X-Ray Diffraction. +KEYWORDS . +SOURCE Corynebacterium diphtheriae + ORGANISM Corynebacterium diphtheriae + Bacteria; Actinobacteria; Corynebacteriales; Corynebacteriaceae; + Corynebacterium. 
+REFERENCE 1 (residues 1 to 535) + AUTHORS Malito,E., Bursulaya,B., Chen,C., Surdo,P.L., Picchianti,M., + Balducci,E., Biancucci,M., Brock,A., Berti,F., Bottomley,M.J., + Nissum,M., Costantino,P., Rappuoli,R. and Spraggon,G. + TITLE Structural basis for lack of toxicity of the diphtheria toxin + mutant CRM197 + JOURNAL Proc. Natl. Acad. Sci. U.S.A. 109 (14), 5229-5234 (2012) + PUBMED 22431623 +REFERENCE 2 (residues 1 to 535) + AUTHORS Malito,E. and Spraggon,G. + TITLE Direct Submission + JOURNAL Submitted (04-JAN-2012) +COMMENT Crystal Structure Of Diphtheria Toxin Mutant Crm197. +FEATURES Location/Qualifiers + source 1..535 + /organism="Corynebacterium diphtheriae" + /db_xref="taxon:1717" + Region 1..187 + /region_name="Diphtheria_C" + /note="Diphtheria toxin, C domain; pfam02763" + /db_xref="CDD:280859" + Region 1..171 + /region_name="Domain 1" + /note="NCBI Domains" + SecStr 10..15 + /sec_str_type="sheet" + /note="strand 1" + SecStr 16..22 + /sec_str_type="sheet" + /note="strand 2" + Site order(20..24,27,31,34..36,38,53..55,65,148) + /site_type="other" + /note="nad+ binding pocket [chemical binding]" + /db_xref="CDD:238651" + SecStr 52..58 + /sec_str_type="sheet" + /note="strand 3" + SecStr 65..68 + /sec_str_type="sheet" + /note="strand 4" + SecStr 78..86 + /sec_str_type="sheet" + /note="strand 5" + SecStr 87..94 + /sec_str_type="sheet" + /note="strand 6" + SecStr 99..106 + /sec_str_type="helix" + /note="helix 1" + SecStr 120..127 + /sec_str_type="helix" + /note="helix 2" + SecStr 132..139 + /sec_str_type="sheet" + /note="strand 7" + SecStr 146..152 + /sec_str_type="sheet" + /note="strand 8" + SecStr 159..167 + /sec_str_type="sheet" + /note="strand 9" + Region 172..382 + /region_name="Domain 2" + /note="NCBI Domains" + SecStr 176..183 + /sec_str_type="helix" + /note="helix 3" + Bond bond(186,201) + /bond_type="disulfide" + Region 200..379 + /region_name="Diphtheria_T" + /note="Diphtheria toxin, T domain; pfam02764" + /db_xref="CDD:280860" + SecStr 206..223 + 
/sec_str_type="helix" + /note="helix 4" + SecStr 224..232 + /sec_str_type="helix" + /note="helix 5" + SecStr 240..255 + /sec_str_type="helix" + /note="helix 6" + SecStr 260..268 + /sec_str_type="helix" + /note="helix 7" + SecStr 275..288 + /sec_str_type="helix" + /note="helix 8" + SecStr 297..304 + /sec_str_type="helix" + /note="helix 9" + SecStr 315..318 + /sec_str_type="sheet" + /note="strand 10" + SecStr 319..322 + /sec_str_type="sheet" + /note="strand 11" + SecStr 326..343 + /sec_str_type="helix" + /note="helix 10" + SecStr 359..376 + /sec_str_type="helix" + /note="helix 11" + Region 381..534 + /region_name="Diphtheria_R" + /note="Diphtheria toxin, R domain; pfam01324" + /db_xref="CDD:279642" + Region 383..535 + /region_name="Domain 3" + /note="NCBI Domains" + SecStr 388..392 + /sec_str_type="sheet" + /note="strand 12" + SecStr 393..399 + /sec_str_type="sheet" + /note="strand 13" + SecStr 404..408 + /sec_str_type="sheet" + /note="strand 14" + SecStr 409..413 + /sec_str_type="sheet" + /note="strand 15" + SecStr 414..423 + /sec_str_type="sheet" + /note="strand 16" + SecStr 424..427 + /sec_str_type="sheet" + /note="strand 17" + SecStr 428..436 + /sec_str_type="sheet" + /note="strand 18" + SecStr 440..444 + /sec_str_type="sheet" + /note="strand 19" + SecStr 447..453 + /sec_str_type="sheet" + /note="strand 20" + SecStr 455..465 + /sec_str_type="sheet" + /note="strand 21" + Bond bond(461,471) + /bond_type="disulfide" + SecStr 467..475 + /sec_str_type="sheet" + /note="strand 22" + SecStr 478..481 + /sec_str_type="sheet" + /note="strand 23" + SecStr 484..494 + /sec_str_type="sheet" + /note="strand 24" + SecStr 495..498 + /sec_str_type="sheet" + /note="strand 25" + SecStr 507..514 + /sec_str_type="sheet" + /note="strand 26" + SecStr 524..535 + /sec_str_type="sheet" + /note="strand 27" +ORIGIN + 1 gaddvvdssk sfvmenfssy hgtkpgyvds iqkgiqkpks gtqgnydddw kefystdnky + 61 daagysvdne nplsgkaggv vkvtypgltk vlalkvdnae tikkelglsl teplmeqvgt + 121 eefikrfgdg asrvvlslpf aegsssveyi 
nnweqakals veleinfetr gkrgqdamye + 181 ymaqacagnr vrrsvgssls cinldwdvir dktktkiesl kehgpiknkm sespnktvse + 241 ekakqyleef hqtalehpel selktvtgtn pvfaganyaa wavnvaqvid setadnlekt + 301 taalsilpgi gsvmgiadga vhhnteeiva qsialsslmv aqaiplvgel vdigfaaynf + 361 vesiinlfqv vhnsynrpay spghktqpfl hdgyavswnt vedsiirtgf qgesghdiki + 421 taentplpia gvllptipgk ldvnkskthi svngrkirmr craidgdvtf crpkspvyvg + 481 ngvhanlhva fhrsssekih sneissdsig vlgyqktvdh tkvnsklslf feiks +// + diff --git a/biojava-core/src/test/resources/org/biojava/nbio/core/sequence/loader/381353148.gb b/biojava-core/src/test/resources/org/biojava/nbio/core/sequence/loader/381353148.gb new file mode 100644 index 0000000000..f1340266aa --- /dev/null +++ b/biojava-core/src/test/resources/org/biojava/nbio/core/sequence/loader/381353148.gb @@ -0,0 +1,184 @@ +LOCUS 4AE1_A 535 aa linear BCT 10-OCT-2012 +DEFINITION Chain A, Diphtheria Toxin. +ACCESSION 4AE1_A +VERSION 4AE1_A +DBSOURCE pdb: molecule 4AE1, chain 65, release Apr 18, 2012; + deposition: Jan 4, 2012; + class: Toxin; + source: Mmdb_id: 98378, Pdb_id 1: 4AE1; + Exp. method: X-Ray Diffraction. +KEYWORDS . +SOURCE Corynebacterium diphtheriae + ORGANISM Corynebacterium diphtheriae + Bacteria; Actinobacteria; Corynebacteriales; Corynebacteriaceae; + Corynebacterium. +REFERENCE 1 (residues 1 to 535) + AUTHORS Malito,E., Bursulaya,B., Chen,C., Surdo,P.L., Picchianti,M., + Balducci,E., Biancucci,M., Brock,A., Berti,F., Bottomley,M.J., + Nissum,M., Costantino,P., Rappuoli,R. and Spraggon,G. + TITLE Structural basis for lack of toxicity of the diphtheria toxin + mutant CRM197 + JOURNAL Proc. Natl. Acad. Sci. U.S.A. 109 (14), 5229-5234 (2012) + PUBMED 22431623 +REFERENCE 2 (residues 1 to 535) + AUTHORS Malito,E. and Spraggon,G. + TITLE Direct Submission + JOURNAL Submitted (04-JAN-2012) +COMMENT Crystal Structure Of Diphtheria Toxin Mutant Crm197 In Complex With + Nicotinamide. 
+FEATURES Location/Qualifiers + source 1..535 + /organism="Corynebacterium diphtheriae" + /db_xref="taxon:1717" + Region 1..187 + /region_name="Diphtheria_C" + /note="Diphtheria toxin, C domain; pfam02763" + /db_xref="CDD:280859" + Region 1..171 + /region_name="Domain 1" + /note="NCBI Domains" + SecStr 10..15 + /sec_str_type="sheet" + /note="strand 1" + SecStr 16..25 + /sec_str_type="sheet" + /note="strand 2" + Site order(20..24,27,31,34..36,38,53..55,65,148) + /site_type="other" + /note="nad+ binding pocket [chemical binding]" + /db_xref="CDD:238651" + SecStr 52..57 + /sec_str_type="sheet" + /note="strand 3" + SecStr 65..68 + /sec_str_type="sheet" + /note="strand 4" + SecStr 78..86 + /sec_str_type="sheet" + /note="strand 5" + SecStr 87..94 + /sec_str_type="sheet" + /note="strand 6" + SecStr 99..106 + /sec_str_type="helix" + /note="helix 1" + SecStr 120..127 + /sec_str_type="helix" + /note="helix 2" + SecStr 132..139 + /sec_str_type="sheet" + /note="strand 7" + SecStr 147..152 + /sec_str_type="sheet" + /note="strand 8" + SecStr 159..167 + /sec_str_type="sheet" + /note="strand 9" + Region 172..382 + /region_name="Domain 2" + /note="NCBI Domains" + SecStr 176..183 + /sec_str_type="helix" + /note="helix 3" + Bond bond(186,201) + /bond_type="disulfide" + Region 200..379 + /region_name="Diphtheria_T" + /note="Diphtheria toxin, T domain; pfam02764" + /db_xref="CDD:280860" + SecStr 206..220 + /sec_str_type="helix" + /note="helix 4" + SecStr 240..254 + /sec_str_type="helix" + /note="helix 5" + SecStr 260..268 + /sec_str_type="helix" + /note="helix 6" + SecStr 275..288 + /sec_str_type="helix" + /note="helix 7" + SecStr 297..304 + /sec_str_type="helix" + /note="helix 8" + SecStr 315..318 + /sec_str_type="sheet" + /note="strand 10" + SecStr 319..322 + /sec_str_type="sheet" + /note="strand 11" + SecStr 326..343 + /sec_str_type="helix" + /note="helix 9" + SecStr 359..376 + /sec_str_type="helix" + /note="helix 10" + Region 381..534 + /region_name="Diphtheria_R" + 
/note="Diphtheria toxin, R domain; pfam01324" + /db_xref="CDD:279642" + Region 383..535 + /region_name="Domain 3" + /note="NCBI Domains" + SecStr 388..392 + /sec_str_type="sheet" + /note="strand 12" + SecStr 393..399 + /sec_str_type="sheet" + /note="strand 13" + SecStr 404..408 + /sec_str_type="sheet" + /note="strand 14" + SecStr 409..413 + /sec_str_type="sheet" + /note="strand 15" + SecStr 414..423 + /sec_str_type="sheet" + /note="strand 16" + SecStr 424..427 + /sec_str_type="sheet" + /note="strand 17" + SecStr 428..436 + /sec_str_type="sheet" + /note="strand 18" + SecStr 440..444 + /sec_str_type="sheet" + /note="strand 19" + SecStr 447..453 + /sec_str_type="sheet" + /note="strand 20" + SecStr 455..465 + /sec_str_type="sheet" + /note="strand 21" + Bond bond(461,471) + /bond_type="disulfide" + SecStr 467..475 + /sec_str_type="sheet" + /note="strand 22" + SecStr 478..481 + /sec_str_type="sheet" + /note="strand 23" + SecStr 484..494 + /sec_str_type="sheet" + /note="strand 24" + SecStr 495..498 + /sec_str_type="sheet" + /note="strand 25" + SecStr 507..514 + /sec_str_type="sheet" + /note="strand 26" + SecStr 524..535 + /sec_str_type="sheet" + /note="strand 27" +ORIGIN + 1 gaddvvdssk sfvmenfssy hgtkpgyvds iqkgiqkpks gtqgnydddw kefystdnky + 61 daagysvdne nplsgkaggv vkvtypgltk vlalkvdnae tikkelglsl teplmeqvgt + 121 eefikrfgdg asrvvlslpf aegsssveyi nnweqakals veleinfetr gkrgqdamye + 181 ymaqacagnr vrrsvgssls cinldwdvir dktktkiesl kehgpiknkm sespnktvse + 241 ekakqyleef hqtalehpel selktvtgtn pvfaganyaa wavnvaqvid setadnlekt + 301 taalsilpgi gsvmgiadga vhhnteeiva qsialsslmv aqaiplvgel vdigfaaynf + 361 vesiinlfqv vhnsynrpay spghktqpfl hdgyavswnt vedsiirtgf qgesghdiki + 421 taentplpia gvllptipgk ldvnkskthi svngrkirmr craidgdvtf crpkspvyvg + 481 ngvhanlhva fhrsssekih sneissdsig vlgyqktvdh tkvnsklslf feiks +// + diff --git a/biojava-core/src/test/resources/org/biojava/nbio/core/sequence/loader/381353149.gb 
b/biojava-core/src/test/resources/org/biojava/nbio/core/sequence/loader/381353149.gb new file mode 100644 index 0000000000..4237b720e4 --- /dev/null +++ b/biojava-core/src/test/resources/org/biojava/nbio/core/sequence/loader/381353149.gb @@ -0,0 +1,187 @@ +LOCUS 4AE1_B 535 aa linear BCT 10-OCT-2012 +DEFINITION Chain B, Diphtheria Toxin. +ACCESSION 4AE1_B +VERSION 4AE1_B +DBSOURCE pdb: molecule 4AE1, chain 66, release Apr 18, 2012; + deposition: Jan 4, 2012; + class: Toxin; + source: Mmdb_id: 98378, Pdb_id 1: 4AE1; + Exp. method: X-Ray Diffraction. +KEYWORDS . +SOURCE Corynebacterium diphtheriae + ORGANISM Corynebacterium diphtheriae + Bacteria; Actinobacteria; Corynebacteriales; Corynebacteriaceae; + Corynebacterium. +REFERENCE 1 (residues 1 to 535) + AUTHORS Malito,E., Bursulaya,B., Chen,C., Surdo,P.L., Picchianti,M., + Balducci,E., Biancucci,M., Brock,A., Berti,F., Bottomley,M.J., + Nissum,M., Costantino,P., Rappuoli,R. and Spraggon,G. + TITLE Structural basis for lack of toxicity of the diphtheria toxin + mutant CRM197 + JOURNAL Proc. Natl. Acad. Sci. U.S.A. 109 (14), 5229-5234 (2012) + PUBMED 22431623 +REFERENCE 2 (residues 1 to 535) + AUTHORS Malito,E. and Spraggon,G. + TITLE Direct Submission + JOURNAL Submitted (04-JAN-2012) +COMMENT Crystal Structure Of Diphtheria Toxin Mutant Crm197 In Complex With + Nicotinamide. 
+FEATURES Location/Qualifiers + source 1..535 + /organism="Corynebacterium diphtheriae" + /db_xref="taxon:1717" + Region 1..187 + /region_name="Diphtheria_C" + /note="Diphtheria toxin, C domain; pfam02763" + /db_xref="CDD:280859" + Region 1..171 + /region_name="Domain 4" + /note="NCBI Domains" + SecStr 10..15 + /sec_str_type="sheet" + /note="strand 28" + SecStr 16..22 + /sec_str_type="sheet" + /note="strand 29" + Site order(20..24,27,31,34..36,38,53..55,65,148) + /site_type="other" + /note="nad+ binding pocket [chemical binding]" + /db_xref="CDD:238651" + SecStr 54..57 + /sec_str_type="sheet" + /note="strand 30" + SecStr 65..68 + /sec_str_type="sheet" + /note="strand 31" + SecStr 78..86 + /sec_str_type="sheet" + /note="strand 32" + SecStr 87..94 + /sec_str_type="sheet" + /note="strand 33" + SecStr 99..106 + /sec_str_type="helix" + /note="helix 11" + SecStr 120..127 + /sec_str_type="helix" + /note="helix 12" + SecStr 132..139 + /sec_str_type="sheet" + /note="strand 34" + SecStr 147..152 + /sec_str_type="sheet" + /note="strand 35" + SecStr 159..167 + /sec_str_type="sheet" + /note="strand 36" + Region 172..382 + /region_name="Domain 5" + /note="NCBI Domains" + SecStr 176..183 + /sec_str_type="helix" + /note="helix 13" + Bond bond(186,201) + /bond_type="disulfide" + Region 200..379 + /region_name="Diphtheria_T" + /note="Diphtheria toxin, T domain; pfam02764" + /db_xref="CDD:280860" + SecStr 206..223 + /sec_str_type="helix" + /note="helix 14" + SecStr 224..232 + /sec_str_type="helix" + /note="helix 15" + SecStr 240..254 + /sec_str_type="helix" + /note="helix 16" + SecStr 260..268 + /sec_str_type="helix" + /note="helix 17" + SecStr 275..288 + /sec_str_type="helix" + /note="helix 18" + SecStr 297..304 + /sec_str_type="helix" + /note="helix 19" + SecStr 315..318 + /sec_str_type="sheet" + /note="strand 37" + SecStr 319..322 + /sec_str_type="sheet" + /note="strand 38" + SecStr 326..343 + /sec_str_type="helix" + /note="helix 20" + SecStr 359..376 + /sec_str_type="helix" + 
/note="helix 21" + Region 381..534 + /region_name="Diphtheria_R" + /note="Diphtheria toxin, R domain; pfam01324" + /db_xref="CDD:279642" + Region 383..535 + /region_name="Domain 6" + /note="NCBI Domains" + SecStr 388..392 + /sec_str_type="sheet" + /note="strand 39" + SecStr 393..399 + /sec_str_type="sheet" + /note="strand 40" + SecStr 404..408 + /sec_str_type="sheet" + /note="strand 41" + SecStr 409..413 + /sec_str_type="sheet" + /note="strand 42" + SecStr 414..423 + /sec_str_type="sheet" + /note="strand 43" + SecStr 424..427 + /sec_str_type="sheet" + /note="strand 44" + SecStr 428..436 + /sec_str_type="sheet" + /note="strand 45" + SecStr 440..444 + /sec_str_type="sheet" + /note="strand 46" + SecStr 447..453 + /sec_str_type="sheet" + /note="strand 47" + SecStr 455..465 + /sec_str_type="sheet" + /note="strand 48" + Bond bond(461,471) + /bond_type="disulfide" + SecStr 467..475 + /sec_str_type="sheet" + /note="strand 49" + SecStr 478..481 + /sec_str_type="sheet" + /note="strand 50" + SecStr 484..494 + /sec_str_type="sheet" + /note="strand 51" + SecStr 495..498 + /sec_str_type="sheet" + /note="strand 52" + SecStr 507..514 + /sec_str_type="sheet" + /note="strand 53" + SecStr 524..535 + /sec_str_type="sheet" + /note="strand 54" +ORIGIN + 1 gaddvvdssk sfvmenfssy hgtkpgyvds iqkgiqkpks gtqgnydddw kefystdnky + 61 daagysvdne nplsgkaggv vkvtypgltk vlalkvdnae tikkelglsl teplmeqvgt + 121 eefikrfgdg asrvvlslpf aegsssveyi nnweqakals veleinfetr gkrgqdamye + 181 ymaqacagnr vrrsvgssls cinldwdvir dktktkiesl kehgpiknkm sespnktvse + 241 ekakqyleef hqtalehpel selktvtgtn pvfaganyaa wavnvaqvid setadnlekt + 301 taalsilpgi gsvmgiadga vhhnteeiva qsialsslmv aqaiplvgel vdigfaaynf + 361 vesiinlfqv vhnsynrpay spghktqpfl hdgyavswnt vedsiirtgf qgesghdiki + 421 taentplpia gvllptipgk ldvnkskthi svngrkirmr craidgdvtf crpkspvyvg + 481 ngvhanlhva fhrsssekih sneissdsig vlgyqktvdh tkvnsklslf feiks +// + diff --git a/biojava-core/src/test/resources/org/biojava/nbio/core/sequence/loader/399235158.gb 
b/biojava-core/src/test/resources/org/biojava/nbio/core/sequence/loader/399235158.gb new file mode 100644 index 0000000000..ceed26194e --- /dev/null +++ b/biojava-core/src/test/resources/org/biojava/nbio/core/sequence/loader/399235158.gb @@ -0,0 +1,83 @@ +LOCUS AFP42651 630 aa linear BCT 31-JAN-2014 +DEFINITION Fatty-acid-CoA ligase FadD32 [Mycolicibacterium smegmatis MC2 155]. +ACCESSION AFP42651 +VERSION AFP42651.1 +DBLINK BioProject: PRJNA38453 + BioSample: SAMN02603392 +DBSOURCE accession CP001663.1 +KEYWORDS . +SOURCE Mycolicibacterium smegmatis MC2 155 + ORGANISM Mycolicibacterium smegmatis MC2 155 + Bacteria; Actinobacteria; Corynebacteriales; Mycobacteriaceae; + Mycolicibacterium. +REFERENCE 1 (residues 1 to 630) + AUTHORS Perrodou,E., Deshayes,C., Muller,J., Schaeffer,C., Van + Dorsselaer,A., Ripp,R., Poch,O., Reyrat,J.M. and Lecompte,O. + TITLE ICDS database: interrupted CoDing sequences in prokaryotic genomes + JOURNAL Nucleic Acids Res. 34 (DATABASE ISSUE), D338-D343 (2006) + PUBMED 16381882 +REFERENCE 2 (residues 1 to 630) + AUTHORS Deshayes,C., Perrodou,E., Gallien,S., Euphrasie,D., Schaeffer,C., + Van-Dorsselaer,A., Poch,O., Lecompte,O. and Reyrat,J.M. + TITLE Interrupted coding sequences in Mycobacterium smegmatis: authentic + mutations or sequencing errors? + JOURNAL Genome Biol. 8 (2), R20 (2007) + PUBMED 17295914 + REMARK Publication Status: Online-Only +REFERENCE 3 (residues 1 to 630) + AUTHORS Gallien,S., Perrodou,E., Carapito,C., Deshayes,C., Reyrat,J.M., Van + Dorsselaer,A., Poch,O., Schaeffer,C. and Lecompte,O. + TITLE Ortho-proteogenomics: multiple proteomes investigation through + orthology and a new MS-based protocol + JOURNAL Genome Res. 19 (1), 128-135 (2009) + PUBMED 18955433 +REFERENCE 4 (residues 1 to 630) + AUTHORS Reyrat,J.M., Perrodou,E., Deshayes,C., Euphrasie,D., Gagniere,N., + Gallien,S., Jones,M., Kocincova,D., Poch,O., Quevillon,E., Ripp,R., + Schaeffer,C., Singh,A., Van Dorsselaer,A. and Lecompte,O. 
+ TITLE Re-annotation of the genome sequence of Mycobacterium smegmatis + JOURNAL Unpublished +REFERENCE 5 (residues 1 to 630) + AUTHORS Perrodou,E., Reyrat,J.M., Deshayes,C., Euphrasie,D., Gagniere,N., + Gallien,S., Jones,M., Kocincova,D., Poch,O., Quevillon,E., Ripp,R., + Schaeffer,C., Singh,A., Van Dorsselaer,A. and Lecompte,O. + TITLE Direct Submission + JOURNAL Submitted (22-JUN-2009) Laboratory of Integrative Bioinformatics + and Genomics, Institute of Genetics and Molecular and Cellular + Biology, 1 rue Laurent Fries BP 10142, Illkirch Cedex 67404, France +COMMENT Method: conceptual translation. +FEATURES Location/Qualifiers + source 1..630 + /organism="Mycolicibacterium smegmatis MC2 155" + /strain="MC2 155" + /db_xref="taxon:246196" + Protein 1..630 + /product="Fatty-acid-CoA ligase FadD32" + Region 1..630 + /region_name="PRK07769" + /note="long-chain-fatty-acid--CoA ligase; Validated" + /db_xref="CDD:181109" + CDS 1..630 + /gene="fadD32" + /locus_tag="MSMEI_6225" + /coded_by="complement(CP001663.1:6463934..6465826)" + /experiment="Nterminal peptide experimentally determined + by amino acid sequencing after protein digestion" + /note="GO_function: GO:0003824; + GO_process: GO:0008152" + /transl_table=11 + /db_xref="PFAM:PF00501" +ORIGIN + 1 mpfhnpfikd gqikfpdgss ivahverwak vrgdklayrf ldfsterdgv prdltwaqfs + 61 arnravaarl qqvtqpgdrv ailcpqnldy lvaffgalya griavplfdp sepghvgrlh + 121 avldnchpsa ilttteaaeg vrkffrtrpa nqrprviavd avpddvastw vnpdepdett + 181 iaylqytsgs triptgvqit hlnlatnvvq viealegeeg drglswlpff hdmglitall + 241 apmighyftf mtpaafvrrp erwirelark egdtggtisv apnfafdhaa argvpkpgsp + 301 pldlsnvkav lngsepisaa tvrrfneafg pfgfppkaik psyglaeatl fvsttpsaee + 361 pkiitvdrdq lnsgrivevd adspkavaqa sagkvgiaew avivdaesat elpdgqvgei + 421 wisgqnmgtg ywgkpeesva tfqnilksrt npshaegatd datwvrtgdy gafydgdlyi + 481 tgrvkdlvii dgrnhypqdl eysaqeaska irtgyvaafs vpanqlpdev fenahsgikr + 541 dpddtseqlv ivaerapgah kldigpitdd iraaiavrhg vtvrdvllta agaiprtssg + 601 kigrracraa 
yldgslragk vandfpdatd +// + diff --git a/biojava-core/src/test/resources/org/biojava/nbio/core/sequence/loader/7525057.gb b/biojava-core/src/test/resources/org/biojava/nbio/core/sequence/loader/7525057.gb new file mode 100644 index 0000000000..1eccc2480d --- /dev/null +++ b/biojava-core/src/test/resources/org/biojava/nbio/core/sequence/loader/7525057.gb @@ -0,0 +1,87 @@ +LOCUS NP_051038 123 aa linear PLN 26-MAR-2010 +DEFINITION ribosomal protein S12 (chloroplast) [Arabidopsis thaliana]. +ACCESSION NP_051038 +VERSION NP_051038.1 +DBLINK Project: 116 + BioProject: PRJNA116 +DBSOURCE REFSEQ: accession NC_000932.1 +KEYWORDS RefSeq. +SOURCE chloroplast Arabidopsis thaliana (thale cress) + ORGANISM Arabidopsis thaliana + Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta; + Spermatophyta; Magnoliophyta; eudicotyledons; Gunneridae; + Pentapetalae; rosids; malvids; Brassicales; Brassicaceae; + Camelineae; Arabidopsis. +REFERENCE 1 (residues 1 to 123) + AUTHORS Sato,S., Nakamura,Y., Kaneko,T., Asamizu,E. and Tabata,S. + TITLE Complete structure of the chloroplast genome of Arabidopsis + thaliana + JOURNAL DNA Res. 6 (5), 283-290 (1999) + PUBMED 10574454 +REFERENCE 2 (residues 1 to 123) + CONSRTM NCBI Genome Project + TITLE Direct Submission + JOURNAL Submitted (07-APR-2000) National Center for Biotechnology + Information, NIH, Bethesda, MD 20894, USA +REFERENCE 3 (residues 1 to 123) + AUTHORS Nakamura,Y. + TITLE Direct Submission + JOURNAL Submitted (09-SEP-1999) Laboratory of Gene Structure 2, Kazusa DNA + Research Institute, Yana 1532-3, Kisarazu, Chiba 292-0812, Japan +COMMENT REVIEWED REFSEQ: This record has been curated by NCBI staff. The + reference sequence was derived from BAA84409. + Method: conceptual translation. 
+FEATURES Location/Qualifiers + source 1..123 + /organism="Arabidopsis thaliana" + /organelle="plastid:chloroplast" + /db_xref="taxon:3702" + /ecotype="Columbia" + Protein 1..123 + /product="ribosomal protein S12" + /calculated_mol_wt=13633 + Region 1..123 + /region_name="rps12" + /note="ribosomal protein S12; CHL00051" + /db_xref="CDD:176992" + Site order(4..5,7..8,11..12) + /site_type="other" + /note="S17 interaction site [polypeptide binding]" + /db_xref="CDD:239466" + Site 4 + /site_type="other" + /note="S8 interaction site" + /db_xref="CDD:239466" + Site order(12..14,26,28..29,31,46..47,49..51,58,66,69..70, + 83..84,88..89,110) + /site_type="other" + /note="16S rRNA interaction site [nucleotide binding]" + /db_xref="CDD:239466" + Site order(43..44,88) + /site_type="other" + /note="streptomycin interaction site [chemical binding]" + /db_xref="CDD:239466" + Site 44..45 + /site_type="other" + /note="23S rRNA interaction site [nucleotide binding]" + /db_xref="CDD:239466" + Site order(45..50,70..78) + /site_type="other" + /note="aminoacyl-tRNA interaction site (A-site) + [nucleotide binding]" + /db_xref="CDD:239466" + CDS 1..123 + /gene="rps12" + /locus_tag="ArthCp047" + /coded_by="join(complement(NC_000932.1:69611..69724), + NC_000932.1:139856..140087,NC_000932.1:140625..140650)" + /trans_splicing + /note="trans-spliced" + /transl_table=11 + /db_xref="GeneID:844801" +ORIGIN + 1 mptikqlirn trqpirnvtk spalrgcpqr rgtctrvyti tpkkpnsalr kvarvrltsg + 61 feitayipgi ghnlqehsvv lvrggrvkdl pgvryhivrg tldavgvkdr qqgrskygvk + 121 kpk +// +