|
21 | 21 | */ |
22 | 22 | package org.biojava3.core.sequence.io; |
23 | 23 |
|
24 | | -import java.util.ArrayList; |
25 | | - |
26 | 24 | import org.biojava3.core.sequence.AccessionID; |
27 | | -import org.biojava3.core.sequence.DataSource; |
28 | | -import org.biojava3.core.sequence.ProteinSequence; |
29 | | -import org.biojava3.core.sequence.compound.AminoAcidCompound; |
30 | 25 | import org.biojava3.core.sequence.io.template.FastaHeaderParserInterface; |
31 | 26 | import org.biojava3.core.sequence.template.AbstractSequence; |
32 | 27 | import org.biojava3.core.sequence.template.Compound; |
33 | | -import org.biojava3.core.sequence.template.AbstractSequence.AnnotationType; |
34 | 28 |
|
35 | 29 | /** |
36 | | - * The plain fasta header takes everything in the header as a single entity. |
37 | | - * It is useful for non-standard header formats that don't follow a single rule.<br> |
38 | | - * If the user has a custom header with local data that is kept constant all over the data |
39 | | - * then they can create their own implementation of a FastaHeaderParserInterface |
| 30 | + * The plain fasta header takes everything in the header as a single entity. It |
| 31 | + * is useful for non-standard header formats that don't follow a single rule.<br> |
| 32 | + * If a custom header carries local data in a layout that stays constant |
| 33 | + * across the whole data set, users can create their own implementation of |
| 34 | + * FastaHeaderParserInterface. |
| 35 | + * |
40 | 36 | * @author Amr AL-Hossary |
41 | 37 | * @since 3.0.6 |
42 | 38 | */ |
43 | | -public class PlainFastaHeaderParser<S extends AbstractSequence<C>, C extends Compound> implements FastaHeaderParserInterface<S,C> { |
44 | | - |
45 | | - /** |
46 | | - * Parse out the all header as one entity |
47 | | - * @param header |
48 | | - * @return |
49 | | - */ |
50 | | - private String[] getHeaderValues(String header) { |
51 | | - return new String[]{header}; |
52 | | - } |
53 | | - |
54 | | - /** |
55 | | - * Parse the header and set the values in the sequence |
56 | | - * @param header |
57 | | - * @param sequence |
58 | | - */ |
59 | | - public void parseHeader(String header, S sequence) { |
60 | | - //uniptrot |
61 | | - // tr|Q0TET7|Q0TET7_ECOL5 Putative uncharacterized protein OS=Escherichia coli O6:K15:H31 (strain 536 / UPEC) GN=ECP_2553 PE=4 SV=1 |
62 | | - sequence.setOriginalHeader(header); |
63 | | - String[] data = getHeaderValues(header); |
64 | | - |
65 | | - if (data.length == 1) { |
66 | | - sequence.setAccession(new AccessionID(data[0])); |
67 | | - } else if (data[0].equalsIgnoreCase("sp") || data[0].equalsIgnoreCase("tr")) { |
68 | | - if (data[0].equalsIgnoreCase("sp")) { |
69 | | - sequence.setAnnotationType(AnnotationType.CURATED); |
70 | | - } else { |
71 | | - sequence.setAnnotationType(AnnotationType.PREDICTED); |
72 | | - } |
73 | | - |
74 | | - sequence.setAccession(new AccessionID(data[1], DataSource.UNIPROT)); |
75 | | - if (data.length > 1) { |
76 | | - sequence.setDescription(data[2]); |
77 | | - } |
78 | | - |
79 | | - } else if (data[0].equalsIgnoreCase("gi")) { |
80 | | - DataSource giSource = DataSource.UNKNOWN; |
81 | | - if (data.length >= 3) { |
82 | | - if (data[2].equalsIgnoreCase("gb")) { |
83 | | - giSource = DataSource.GENBANK; |
84 | | - } else if (data[2].equalsIgnoreCase("emb")) { |
85 | | - giSource = DataSource.ENA; |
86 | | - } else if (data[2].equalsIgnoreCase("dbj")) { |
87 | | - giSource = DataSource.DDBJ; |
88 | | - } |
89 | | - sequence.setAccession(new AccessionID(data[3], giSource)); |
90 | | - } else { |
91 | | - sequence.setAccession(new AccessionID(header, giSource)); |
92 | | - } |
93 | | - } else if (data[0].equalsIgnoreCase("pir")) { |
94 | | - sequence.setAccession(new AccessionID(data[2], DataSource.NBRF)); |
95 | | - } else if (data[0].equalsIgnoreCase("prf")) { |
96 | | - sequence.setAccession(new AccessionID(data[2], DataSource.PRF)); |
97 | | - } else if (data[0].equalsIgnoreCase("pdb")) { |
98 | | - sequence.setAccession(new AccessionID(data[1] + ":" + data[2], DataSource.PDB1)); |
99 | | - } else if (data[0].startsWith("PDB")) { |
100 | | - String[] pdbe = data[0].split(" "); |
101 | | - String[] pdbaccession = pdbe[0].split(":"); |
102 | | - sequence.setAccession(new AccessionID(pdbaccession[1], DataSource.PDBe)); |
103 | | - } else if (data[0].indexOf(":") != -1 && data.length > 1 && data[1].equals("PDBID")) { |
104 | | - sequence.setAccession(new AccessionID(data[0], DataSource.PDB2)); |
105 | | - } else if (data[0].equalsIgnoreCase("pat")) { |
106 | | - sequence.setAccession(new AccessionID(data[2], DataSource.PATENTS)); |
107 | | - } else if (data[0].equalsIgnoreCase("bbs")) { |
108 | | - sequence.setAccession(new AccessionID(data[1], DataSource.GENINFO)); |
109 | | - } else if (data[0].equalsIgnoreCase("gnl")) { |
110 | | - sequence.setAccession(new AccessionID(data[2], DataSource.GENERAL)); |
111 | | - } else if (data[0].equalsIgnoreCase("ref")) { |
112 | | - sequence.setAccession(new AccessionID(data[1], DataSource.NCBI)); |
113 | | - } else if (data[0].equalsIgnoreCase("lcl")) { |
114 | | - sequence.setAccession(new AccessionID(data[1], DataSource.LOCAL)); |
115 | | - } else { |
116 | | - sequence.setAccession(new AccessionID(data[0])); // avoid the common problem of picking up all the comments original header in getOriginalHeader |
117 | | - } |
118 | | - |
119 | | - |
120 | | - } |
121 | | - |
122 | | - /** |
123 | | - * |
124 | | - * @param args |
125 | | - */ |
126 | | - public static void main(String[] args) { |
127 | | - |
128 | | - System.out.println("parseHeader"); |
129 | | - String header = ""; |
130 | | - ProteinSequence sequence = new ProteinSequence(""); |
131 | | - PlainFastaHeaderParser<ProteinSequence,AminoAcidCompound> instance = |
132 | | - new PlainFastaHeaderParser<ProteinSequence,AminoAcidCompound>(); |
133 | | - |
134 | | - header = "gi|gi-number|gb|accession|locus"; |
135 | | - instance.parseHeader(header, sequence); |
136 | | - System.out.println("accession" + "=" + sequence.getAccession()); |
137 | | - System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.GENBANK); |
138 | | - |
139 | | - header = "gi|gi-number|emb|accession|locus"; |
140 | | - instance.parseHeader(header, sequence); |
141 | | - System.out.println("accession" + "=" + sequence.getAccession()); |
142 | | - System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.ENA); |
143 | | - |
144 | | - header = "gi|gi-number|dbj|accession|locus"; |
145 | | - instance.parseHeader(header, sequence); |
146 | | - System.out.println("accession" + "=" + sequence.getAccession()); |
147 | | - System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.DDBJ); |
148 | | - |
149 | | - header = "pir||entry"; |
150 | | - instance.parseHeader(header, sequence); |
151 | | - System.out.println("entry" + "=" + sequence.getAccession()); |
152 | | - System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.NBRF); |
153 | | - |
154 | | - header = "prf||name"; |
155 | | - instance.parseHeader(header, sequence); |
156 | | - System.out.println("name" + "=" + sequence.getAccession()); |
157 | | - System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.PRF); |
158 | | - |
159 | | - header = "sp|accession|name"; |
160 | | - instance.parseHeader(header, sequence); |
161 | | - System.out.println("accession" + "=" + sequence.getAccession()); |
162 | | - System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.UNIPROT); |
163 | | - |
164 | | - header = "pdb|entry|chain"; |
165 | | - instance.parseHeader(header, sequence); |
166 | | - System.out.println("entry:chain" + "=" + sequence.getAccession()); |
167 | | - System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.PDB1); |
168 | | - |
169 | | - header = "entry:chain|PDBID|CHAIN|SEQUENCE"; |
170 | | - instance.parseHeader(header, sequence); |
171 | | - System.out.println("entry:chain" + "=" + sequence.getAccession()); |
172 | | - System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.PDB2); |
173 | | - header = "PDB:1ECY_A mol:protein length:142 ECOTIN"; |
174 | | - instance.parseHeader(header, sequence); |
175 | | - System.out.println("1ECY_A" + "=" + sequence.getAccession()); |
176 | | - System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.PDBe); |
177 | | - |
178 | | - header = "pat|country|number"; |
179 | | - instance.parseHeader(header, sequence); |
180 | | - System.out.println("number" + "=" + sequence.getAccession()); |
181 | | - System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.PATENTS); |
182 | | - |
183 | | - header = "bbs|number"; |
184 | | - instance.parseHeader(header, sequence); |
185 | | - System.out.println("number" + "=" + sequence.getAccession()); |
186 | | - System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.GENINFO); |
187 | | - |
188 | | - header = "gnl|database|identifier"; |
189 | | - instance.parseHeader(header, sequence); |
190 | | - System.out.println("identifier" + "=" + sequence.getAccession()); |
191 | | - System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.GENERAL); |
192 | | - |
193 | | - header = "ref|accession|locus"; |
194 | | - |
195 | | - instance.parseHeader(header, sequence); |
196 | | - System.out.println("accession" + "=" + sequence.getAccession()); |
197 | | - System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.NCBI); |
198 | | - |
199 | | - header = "lcl|identifier"; |
200 | | - instance.parseHeader(header, sequence); |
201 | | - System.out.println("identifier" + "=" + sequence.getAccession()); |
202 | | - System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.LOCAL); |
203 | | - } |
| 39 | +public class PlainFastaHeaderParser<S extends AbstractSequence<C>, C extends Compound> |
| 40 | + implements FastaHeaderParserInterface<S, C> { |
| 41 | + |
| 42 | + /** |
| 43 | + * Wrap the entire header, unsplit, as a single value. |
| 44 | + * |
| 45 | + * @param header the header text |
| 46 | + * @return a one-element array containing the header unchanged |
| 47 | + */ |
| 48 | + private String[] getHeaderValues(String header) { |
| 49 | + return new String[] { header }; |
| 50 | + } |
| 51 | + |
| 52 | + /** |
| 53 | + * Store the header on the sequence and use the whole header as its accession. |
| 54 | + * Because getHeaderValues always yields one element, the throwing branch is not reached. |
| 55 | + * @param header the header text; recorded as the original header and used as the accession id |
| 56 | + * @param sequence the sequence that receives the original header and the accession |
| 57 | + */ |
| 58 | + @Override |
| 59 | + public void parseHeader(String header, S sequence) { |
| 60 | + // Example UniProt-style header; this parser keeps it whole, not split into fields: |
| 61 | + // tr|Q0TET7|Q0TET7_ECOL5 Putative uncharacterized protein |
| 62 | + // OS=Escherichia coli O6:K15:H31 (strain 536 / UPEC) GN=ECP_2553 PE=4 |
| 63 | + // SV=1 |
| 64 | + sequence.setOriginalHeader(header); |
| 65 | + String[] data = getHeaderValues(header); |
| 66 | + |
| 67 | + if (data.length == 1) { |
| 68 | + sequence.setAccession(new AccessionID(data[0])); |
| 69 | + } else { |
| 70 | + throw new RuntimeException( |
| 71 | + "No header or Some Error Occurred while reading header"); |
| 72 | + } |
| 73 | + } |
204 | 74 | } |
0 commit comments