Skip to content

Commit 24fab31

Browse files
committed
Add of BlastTabularParser
1 parent ef99f09 commit 24fab31

File tree

4 files changed

+118249
-0
lines changed

4 files changed

+118249
-0
lines changed
Lines changed: 218 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,218 @@
1+
/*
2+
* To change this license header, choose License Headers in Project Properties.
3+
* To change this template file, choose Tools | Templates
4+
* and open the template in the editor.
5+
*/
6+
7+
package org.biojava.nbio.core.search.io.blast;
8+
9+
import java.io.File;
10+
import java.io.FileInputStream;
11+
import java.io.FileReader;
12+
import java.io.LineNumberReader;
13+
import java.util.ArrayList;
14+
import java.util.HashMap;
15+
import java.util.LinkedHashMap;
16+
import java.util.List;
17+
import java.util.Scanner;
18+
import java.util.logging.Logger;
19+
import org.biojava.nbio.core.exceptions.ParserException;
20+
import org.biojava.nbio.core.search.io.Hit;
21+
import org.biojava.nbio.core.search.io.Hsp;
22+
import org.biojava.nbio.core.search.io.Result;
23+
import org.biojava.nbio.core.search.io.ResultFactory;
24+
25+
/**
26+
*
27+
* @author Paolo Pavan, Genomnia srl
28+
* https://it.linkedin.com/pub/paolo-pavan/6/15a/956
29+
*/
30+
public class BlastTabularParser implements ResultFactory {
31+
private final String blastReference =
32+
"Zheng Zhang, Scott Schwartz, Lukas Wagner, and Webb Miller (2000), A greedy algorithm for aligning DNA sequences", J Comput Biol 2000; 7(1-2):203-14.";
33+
/**
34+
* Tries to define a different level of consistency during parsing
35+
* LITERAL is intended a strict parsing much tight to the report.
36+
* IMPROVED consistency tries to import data much tight to the data model
37+
* (I hope you got the idea, if not have a look to the code.
38+
* I'm not very sure I will leave to the user the possibility to choose)
39+
*/
40+
private enum PARSING_CONSISTENCY {
41+
IMPROVED,
42+
LITERAL
43+
}
44+
private static final Logger log = Logger.getLogger(BlastTabularParser.class.getName());
45+
46+
private File targetFile;
47+
private int fileLinesCount;
48+
private PARSING_CONSISTENCY parsingConsistency = PARSING_CONSISTENCY.IMPROVED;
49+
50+
51+
// data imported private:
52+
int queryIdNumber = 0;
53+
HashMap<String,String> queryIdMapping = new HashMap();
54+
String programName=null, queryName = null, databaseFile = null;
55+
private String queryId ;
56+
private String subjectId ;
57+
private String percIdentity ;
58+
private String alnLength ;
59+
private String mismatchCount;
60+
private String gapOpenCount ;
61+
private String queryStart ;
62+
private String queryEnd ;
63+
private String subjectStart ;
64+
private String subjectEnd ;
65+
private String evalue ;
66+
private String bitScore ;
67+
68+
69+
@Override
70+
public List<String> getFileExtensions() {
71+
throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates.
72+
}
73+
74+
@Override
75+
public void setFile(File f) {
76+
targetFile = f;
77+
}
78+
79+
@Override
80+
public List<Result> createObjects(double maxEScore) throws Exception {
81+
List<Result> results = new ArrayList();
82+
83+
log.info("Query for hits");
84+
LineNumberReader lnr = new LineNumberReader(new FileReader(targetFile));
85+
lnr.skip(Long.MAX_VALUE);
86+
fileLinesCount = lnr.getLineNumber();
87+
log.info(fileLinesCount + " hits approximately in all results");
88+
lnr.close();
89+
90+
FileInputStream fileInputStream = new FileInputStream(targetFile);
91+
Scanner scanner = new Scanner(fileInputStream);
92+
93+
String line = fetchData(scanner);
94+
while (scanner.hasNext()){
95+
try {
96+
BlastResultBuilder resultBuilder = new BlastResultBuilder();
97+
resultBuilder
98+
.setQueryID(queryId)
99+
.setDbFile(databaseFile)
100+
.setProgram(programName)
101+
.setQueryDef(queryName)
102+
.setReference(blastReference);
103+
104+
List<Hit> hits = new ArrayList();
105+
106+
String currentQueryId = queryId;
107+
while (currentQueryId.equals(queryId) && scanner.hasNext()){
108+
BlastHitBuilder hitBuilder = new BlastHitBuilder();
109+
110+
List<Hsp> hsps = new ArrayList();
111+
112+
String currentSubjectId=subjectId;
113+
while (currentSubjectId.equals(subjectId) && scanner.hasNext()){
114+
if (new Double(evalue) > maxEScore) {
115+
line = fetchData(scanner);
116+
continue;
117+
}
118+
BlastHspBuilder hspBuilder = new BlastHspBuilder();
119+
hspBuilder
120+
.setHspAlignLen(new Integer(alnLength))
121+
.setHspGaps(new Integer(gapOpenCount))
122+
.setHspQueryFrom(new Integer(queryStart))
123+
.setHspQueryTo(new Integer(queryEnd))
124+
.setHspHitFrom(new Integer(subjectStart))
125+
.setHspHitTo(new Integer(subjectEnd))
126+
.setHspEvalue(new Double(evalue))
127+
.setHspBitScore(new Double(bitScore))
128+
.setPercentageIdentity(new Double(percIdentity)/100)
129+
.setMismatchCount(new Integer(mismatchCount));
130+
hsps.add(hspBuilder.createBlastHsp());
131+
line = fetchData(scanner);
132+
}
133+
hits.add(hitBuilder.setHsps(hsps).createBlastHit());
134+
}
135+
results.add(resultBuilder.setHits(hits).createBlastResult());
136+
} catch (NumberFormatException e) {
137+
throw new ParserException("Invalid numeric value met in:\n"+line);
138+
}
139+
}
140+
return results;
141+
}
142+
143+
private String fetchData(Scanner scanner){
144+
String line;
145+
String[] split;
146+
147+
line = scanner.nextLine();
148+
while (line.startsWith("#")){
149+
// blast tabular with header options contains some more informations
150+
if (line.matches("#\\s.?BLAST.+")) programName = line.replace("#\\s","");
151+
if (line.startsWith("# Query:")) queryName = line.replace("# Query: ","");
152+
if (line.startsWith("# Database:")) databaseFile = line.replace("# Database: ","");
153+
154+
// needed because blast report can end with a comment...
155+
if (!scanner.hasNext()) return null;
156+
line = scanner.nextLine();
157+
}
158+
159+
// Here, programName != null checks if there was a header in the file
160+
boolean headerFound = programName != null;
161+
162+
split = line.split("\\t");
163+
queryId =split[0];
164+
subjectId =split[1];
165+
percIdentity =split[2];
166+
alnLength =split[3];
167+
mismatchCount=split[4];
168+
gapOpenCount =split[5];
169+
queryStart =split[6];
170+
queryEnd =split[7];
171+
subjectStart =split[8];
172+
subjectEnd =split[9];
173+
evalue =split[10];
174+
bitScore =split[11];
175+
176+
// blast tabular reports only the first word of the query name.
177+
// If it was specified in the header it is better to use that definition
178+
if (parsingConsistency == PARSING_CONSISTENCY.IMPROVED && headerFound) {
179+
if (queryIdMapping.get(queryId)==null) {
180+
queryIdNumber ++;
181+
queryIdMapping.put(queryId,"Query_" + queryIdNumber);
182+
}
183+
// If a complete definition of the query name was readed, than we can use
184+
// a queryID schema that is consistent with blast xml report
185+
queryId = queryIdMapping.get(queryId);
186+
}
187+
if (!headerFound) queryName = queryId;
188+
189+
return line;
190+
}
191+
192+
@Override
193+
public void storeObjects(List<Result> results) throws Exception {
194+
throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates.
195+
}
196+
197+
/**
198+
* Intended for use with run module.
199+
* Although possible does not make a lot of sense to have it with limited
200+
* information in report
201+
* @param sequences
202+
*/
203+
@Override
204+
public void setQueryReferences(List sequences) {
205+
throw new UnsupportedOperationException("Not supported for this parser.");
206+
}
207+
/**
208+
* Intended for use with run module.
209+
* Although possible does not make a lot of sense to have it with limited
210+
* information in report
211+
* @param sequences
212+
*/
213+
@Override
214+
public void setDatabaseReferences(List sequences) {
215+
throw new UnsupportedOperationException("Not supported for this parser.");
216+
}
217+
218+
}

0 commit comments

Comments
 (0)