Skip to content

Commit bb83b56

Browse files
committed
clean up
Signed-off-by: aalhossary <amr_alhossary@hotmail.com>
1 parent 9bf325e commit bb83b56

1 file changed

Lines changed: 41 additions & 171 deletions

File tree

biojava3-core/src/main/java/org/biojava3/core/sequence/io/PlainFastaHeaderParser.java

Lines changed: 41 additions & 171 deletions
Original file line numberDiff line numberDiff line change
@@ -21,184 +21,54 @@
2121
*/
2222
package org.biojava3.core.sequence.io;
2323

24-
import java.util.ArrayList;
25-
2624
import org.biojava3.core.sequence.AccessionID;
27-
import org.biojava3.core.sequence.DataSource;
28-
import org.biojava3.core.sequence.ProteinSequence;
29-
import org.biojava3.core.sequence.compound.AminoAcidCompound;
3025
import org.biojava3.core.sequence.io.template.FastaHeaderParserInterface;
3126
import org.biojava3.core.sequence.template.AbstractSequence;
3227
import org.biojava3.core.sequence.template.Compound;
33-
import org.biojava3.core.sequence.template.AbstractSequence.AnnotationType;
3428

3529
/**
36-
* The plain fasta header takes everything in the header as a single entity.
37-
* It is useful for non-standard header formats that don't follow a single rule.<br>
38-
* If the user has a custom header with local data that is kept constant all over the data
39-
* then they can create their own implementation of a FastaHeaderParserInterface
30+
* The plain fasta header takes everything in the header as a single entity. It
31+
* is useful for non-standard header formats that don't follow a single rule.<br>
32+
* If the user has a custom header format that is used consistently across all
33+
* of their data, then they can create their own implementation of a
34+
* FastaHeaderParserInterface
35+
*
4036
* @author Amr AL-Hossary
4137
* @since 3.0.6
4238
*/
43-
public class PlainFastaHeaderParser<S extends AbstractSequence<C>, C extends Compound> implements FastaHeaderParserInterface<S,C> {
44-
45-
/**
46-
* Parse out the all header as one entity
47-
* @param header
48-
* @return
49-
*/
50-
private String[] getHeaderValues(String header) {
51-
return new String[]{header};
52-
}
53-
54-
/**
55-
* Parse the header and set the values in the sequence
56-
* @param header
57-
* @param sequence
58-
*/
59-
public void parseHeader(String header, S sequence) {
60-
//uniptrot
61-
// tr|Q0TET7|Q0TET7_ECOL5 Putative uncharacterized protein OS=Escherichia coli O6:K15:H31 (strain 536 / UPEC) GN=ECP_2553 PE=4 SV=1
62-
sequence.setOriginalHeader(header);
63-
String[] data = getHeaderValues(header);
64-
65-
if (data.length == 1) {
66-
sequence.setAccession(new AccessionID(data[0]));
67-
} else if (data[0].equalsIgnoreCase("sp") || data[0].equalsIgnoreCase("tr")) {
68-
if (data[0].equalsIgnoreCase("sp")) {
69-
sequence.setAnnotationType(AnnotationType.CURATED);
70-
} else {
71-
sequence.setAnnotationType(AnnotationType.PREDICTED);
72-
}
73-
74-
sequence.setAccession(new AccessionID(data[1], DataSource.UNIPROT));
75-
if (data.length > 1) {
76-
sequence.setDescription(data[2]);
77-
}
78-
79-
} else if (data[0].equalsIgnoreCase("gi")) {
80-
DataSource giSource = DataSource.UNKNOWN;
81-
if (data.length >= 3) {
82-
if (data[2].equalsIgnoreCase("gb")) {
83-
giSource = DataSource.GENBANK;
84-
} else if (data[2].equalsIgnoreCase("emb")) {
85-
giSource = DataSource.ENA;
86-
} else if (data[2].equalsIgnoreCase("dbj")) {
87-
giSource = DataSource.DDBJ;
88-
}
89-
sequence.setAccession(new AccessionID(data[3], giSource));
90-
} else {
91-
sequence.setAccession(new AccessionID(header, giSource));
92-
}
93-
} else if (data[0].equalsIgnoreCase("pir")) {
94-
sequence.setAccession(new AccessionID(data[2], DataSource.NBRF));
95-
} else if (data[0].equalsIgnoreCase("prf")) {
96-
sequence.setAccession(new AccessionID(data[2], DataSource.PRF));
97-
} else if (data[0].equalsIgnoreCase("pdb")) {
98-
sequence.setAccession(new AccessionID(data[1] + ":" + data[2], DataSource.PDB1));
99-
} else if (data[0].startsWith("PDB")) {
100-
String[] pdbe = data[0].split(" ");
101-
String[] pdbaccession = pdbe[0].split(":");
102-
sequence.setAccession(new AccessionID(pdbaccession[1], DataSource.PDBe));
103-
} else if (data[0].indexOf(":") != -1 && data.length > 1 && data[1].equals("PDBID")) {
104-
sequence.setAccession(new AccessionID(data[0], DataSource.PDB2));
105-
} else if (data[0].equalsIgnoreCase("pat")) {
106-
sequence.setAccession(new AccessionID(data[2], DataSource.PATENTS));
107-
} else if (data[0].equalsIgnoreCase("bbs")) {
108-
sequence.setAccession(new AccessionID(data[1], DataSource.GENINFO));
109-
} else if (data[0].equalsIgnoreCase("gnl")) {
110-
sequence.setAccession(new AccessionID(data[2], DataSource.GENERAL));
111-
} else if (data[0].equalsIgnoreCase("ref")) {
112-
sequence.setAccession(new AccessionID(data[1], DataSource.NCBI));
113-
} else if (data[0].equalsIgnoreCase("lcl")) {
114-
sequence.setAccession(new AccessionID(data[1], DataSource.LOCAL));
115-
} else {
116-
sequence.setAccession(new AccessionID(data[0])); // avoid the common problem of picking up all the comments original header in getOriginalHeader
117-
}
118-
119-
120-
}
121-
122-
/**
123-
*
124-
* @param args
125-
*/
126-
public static void main(String[] args) {
127-
128-
System.out.println("parseHeader");
129-
String header = "";
130-
ProteinSequence sequence = new ProteinSequence("");
131-
PlainFastaHeaderParser<ProteinSequence,AminoAcidCompound> instance =
132-
new PlainFastaHeaderParser<ProteinSequence,AminoAcidCompound>();
133-
134-
header = "gi|gi-number|gb|accession|locus";
135-
instance.parseHeader(header, sequence);
136-
System.out.println("accession" + "=" + sequence.getAccession());
137-
System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.GENBANK);
138-
139-
header = "gi|gi-number|emb|accession|locus";
140-
instance.parseHeader(header, sequence);
141-
System.out.println("accession" + "=" + sequence.getAccession());
142-
System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.ENA);
143-
144-
header = "gi|gi-number|dbj|accession|locus";
145-
instance.parseHeader(header, sequence);
146-
System.out.println("accession" + "=" + sequence.getAccession());
147-
System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.DDBJ);
148-
149-
header = "pir||entry";
150-
instance.parseHeader(header, sequence);
151-
System.out.println("entry" + "=" + sequence.getAccession());
152-
System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.NBRF);
153-
154-
header = "prf||name";
155-
instance.parseHeader(header, sequence);
156-
System.out.println("name" + "=" + sequence.getAccession());
157-
System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.PRF);
158-
159-
header = "sp|accession|name";
160-
instance.parseHeader(header, sequence);
161-
System.out.println("accession" + "=" + sequence.getAccession());
162-
System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.UNIPROT);
163-
164-
header = "pdb|entry|chain";
165-
instance.parseHeader(header, sequence);
166-
System.out.println("entry:chain" + "=" + sequence.getAccession());
167-
System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.PDB1);
168-
169-
header = "entry:chain|PDBID|CHAIN|SEQUENCE";
170-
instance.parseHeader(header, sequence);
171-
System.out.println("entry:chain" + "=" + sequence.getAccession());
172-
System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.PDB2);
173-
header = "PDB:1ECY_A mol:protein length:142 ECOTIN";
174-
instance.parseHeader(header, sequence);
175-
System.out.println("1ECY_A" + "=" + sequence.getAccession());
176-
System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.PDBe);
177-
178-
header = "pat|country|number";
179-
instance.parseHeader(header, sequence);
180-
System.out.println("number" + "=" + sequence.getAccession());
181-
System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.PATENTS);
182-
183-
header = "bbs|number";
184-
instance.parseHeader(header, sequence);
185-
System.out.println("number" + "=" + sequence.getAccession());
186-
System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.GENINFO);
187-
188-
header = "gnl|database|identifier";
189-
instance.parseHeader(header, sequence);
190-
System.out.println("identifier" + "=" + sequence.getAccession());
191-
System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.GENERAL);
192-
193-
header = "ref|accession|locus";
194-
195-
instance.parseHeader(header, sequence);
196-
System.out.println("accession" + "=" + sequence.getAccession());
197-
System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.NCBI);
198-
199-
header = "lcl|identifier";
200-
instance.parseHeader(header, sequence);
201-
System.out.println("identifier" + "=" + sequence.getAccession());
202-
System.out.println(sequence.getAccession().getDataSource() + "=" + DataSource.LOCAL);
203-
}
39+
public class PlainFastaHeaderParser<S extends AbstractSequence<C>, C extends Compound>
40+
implements FastaHeaderParserInterface<S, C> {
41+
42+
/**
43+
* Parse out the whole header as one entity
44+
*
45+
* @param header
46+
* @return
47+
*/
48+
private String[] getHeaderValues(String header) {
49+
return new String[] { header };
50+
}
51+
52+
/**
53+
* Parse the header and set the values in the sequence
54+
*
55+
* @param header
56+
* @param sequence
57+
*/
58+
@Override
59+
public void parseHeader(String header, S sequence) {
60+
// uniprot
61+
// tr|Q0TET7|Q0TET7_ECOL5 Putative uncharacterized protein
62+
// OS=Escherichia coli O6:K15:H31 (strain 536 / UPEC) GN=ECP_2553 PE=4
63+
// SV=1
64+
sequence.setOriginalHeader(header);
65+
String[] data = getHeaderValues(header);
66+
67+
if (data.length == 1) {
68+
sequence.setAccession(new AccessionID(data[0]));
69+
} else {
70+
throw new RuntimeException(
71+
"No header or Some Error Occurred while reading header");
72+
}
73+
}
20474
}

0 commit comments

Comments
 (0)