|
| 1 | +/* |
| 2 | + * To change this license header, choose License Headers in Project Properties. |
| 3 | + * To change this template file, choose Tools | Templates |
| 4 | + * and open the template in the editor. |
| 5 | + */ |
| 6 | + |
| 7 | +package org.biojava.nbio.core.search.io.blast; |
| 8 | + |
| 9 | +import java.io.File; |
| 10 | +import java.io.FileInputStream; |
| 11 | +import java.io.FileReader; |
| 12 | +import java.io.LineNumberReader; |
| 13 | +import java.util.ArrayList; |
| 14 | +import java.util.HashMap; |
| 15 | +import java.util.LinkedHashMap; |
| 16 | +import java.util.List; |
| 17 | +import java.util.Scanner; |
| 18 | +import java.util.logging.Logger; |
| 19 | +import org.biojava.nbio.core.exceptions.ParserException; |
| 20 | +import org.biojava.nbio.core.search.io.Hit; |
| 21 | +import org.biojava.nbio.core.search.io.Hsp; |
| 22 | +import org.biojava.nbio.core.search.io.Result; |
| 23 | +import org.biojava.nbio.core.search.io.ResultFactory; |
| 24 | + |
| 25 | +/** |
| 26 | + * |
| 27 | + * @author Paolo Pavan, Genomnia srl |
| 28 | + * https://it.linkedin.com/pub/paolo-pavan/6/15a/956 |
| 29 | + */ |
| 30 | +public class BlastTabularParser implements ResultFactory { |
| 31 | + private final String blastReference = |
| 32 | + "Zheng Zhang, Scott Schwartz, Lukas Wagner, and Webb Miller (2000), A greedy algorithm for aligning DNA sequences", J Comput Biol 2000; 7(1-2):203-14."; |
| 33 | + /** |
| 34 | + * Tries to define a different level of consistency during parsing |
| 35 | + * LITERAL is intended a strict parsing much tight to the report. |
| 36 | + * IMPROVED consistency tries to import data much tight to the data model |
| 37 | + * (I hope you got the idea, if not have a look to the code. |
| 38 | + * I'm not very sure I will leave to the user the possibility to choose) |
| 39 | + */ |
| 40 | + private enum PARSING_CONSISTENCY { |
| 41 | + IMPROVED, |
| 42 | + LITERAL |
| 43 | + } |
| 44 | + private static final Logger log = Logger.getLogger(BlastTabularParser.class.getName()); |
| 45 | + |
| 46 | + private File targetFile; |
| 47 | + private int fileLinesCount; |
| 48 | + private PARSING_CONSISTENCY parsingConsistency = PARSING_CONSISTENCY.IMPROVED; |
| 49 | + |
| 50 | + |
| 51 | + // data imported private: |
| 52 | + int queryIdNumber = 0; |
| 53 | + HashMap<String,String> queryIdMapping = new HashMap(); |
| 54 | + String programName=null, queryName = null, databaseFile = null; |
| 55 | + private String queryId ; |
| 56 | + private String subjectId ; |
| 57 | + private String percIdentity ; |
| 58 | + private String alnLength ; |
| 59 | + private String mismatchCount; |
| 60 | + private String gapOpenCount ; |
| 61 | + private String queryStart ; |
| 62 | + private String queryEnd ; |
| 63 | + private String subjectStart ; |
| 64 | + private String subjectEnd ; |
| 65 | + private String evalue ; |
| 66 | + private String bitScore ; |
| 67 | + |
| 68 | + |
| 69 | + @Override |
| 70 | + public List<String> getFileExtensions() { |
| 71 | + throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates. |
| 72 | + } |
| 73 | + |
| 74 | + @Override |
| 75 | + public void setFile(File f) { |
| 76 | + targetFile = f; |
| 77 | + } |
| 78 | + |
| 79 | + @Override |
| 80 | + public List<Result> createObjects(double maxEScore) throws Exception { |
| 81 | + List<Result> results = new ArrayList(); |
| 82 | + |
| 83 | + log.info("Query for hits"); |
| 84 | + LineNumberReader lnr = new LineNumberReader(new FileReader(targetFile)); |
| 85 | + lnr.skip(Long.MAX_VALUE); |
| 86 | + fileLinesCount = lnr.getLineNumber(); |
| 87 | + log.info(fileLinesCount + " hits approximately in all results"); |
| 88 | + lnr.close(); |
| 89 | + |
| 90 | + FileInputStream fileInputStream = new FileInputStream(targetFile); |
| 91 | + Scanner scanner = new Scanner(fileInputStream); |
| 92 | + |
| 93 | + String line = fetchData(scanner); |
| 94 | + while (scanner.hasNext()){ |
| 95 | + try { |
| 96 | + BlastResultBuilder resultBuilder = new BlastResultBuilder(); |
| 97 | + resultBuilder |
| 98 | + .setQueryID(queryId) |
| 99 | + .setDbFile(databaseFile) |
| 100 | + .setProgram(programName) |
| 101 | + .setQueryDef(queryName) |
| 102 | + .setReference(blastReference); |
| 103 | + |
| 104 | + List<Hit> hits = new ArrayList(); |
| 105 | + |
| 106 | + String currentQueryId = queryId; |
| 107 | + while (currentQueryId.equals(queryId) && scanner.hasNext()){ |
| 108 | + BlastHitBuilder hitBuilder = new BlastHitBuilder(); |
| 109 | + |
| 110 | + List<Hsp> hsps = new ArrayList(); |
| 111 | + |
| 112 | + String currentSubjectId=subjectId; |
| 113 | + while (currentSubjectId.equals(subjectId) && scanner.hasNext()){ |
| 114 | + if (new Double(evalue) > maxEScore) { |
| 115 | + line = fetchData(scanner); |
| 116 | + continue; |
| 117 | + } |
| 118 | + BlastHspBuilder hspBuilder = new BlastHspBuilder(); |
| 119 | + hspBuilder |
| 120 | + .setHspAlignLen(new Integer(alnLength)) |
| 121 | + .setHspGaps(new Integer(gapOpenCount)) |
| 122 | + .setHspQueryFrom(new Integer(queryStart)) |
| 123 | + .setHspQueryTo(new Integer(queryEnd)) |
| 124 | + .setHspHitFrom(new Integer(subjectStart)) |
| 125 | + .setHspHitTo(new Integer(subjectEnd)) |
| 126 | + .setHspEvalue(new Double(evalue)) |
| 127 | + .setHspBitScore(new Double(bitScore)) |
| 128 | + .setPercentageIdentity(new Double(percIdentity)/100) |
| 129 | + .setMismatchCount(new Integer(mismatchCount)); |
| 130 | + hsps.add(hspBuilder.createBlastHsp()); |
| 131 | + line = fetchData(scanner); |
| 132 | + } |
| 133 | + hits.add(hitBuilder.setHsps(hsps).createBlastHit()); |
| 134 | + } |
| 135 | + results.add(resultBuilder.setHits(hits).createBlastResult()); |
| 136 | + } catch (NumberFormatException e) { |
| 137 | + throw new ParserException("Invalid numeric value met in:\n"+line); |
| 138 | + } |
| 139 | + } |
| 140 | + return results; |
| 141 | + } |
| 142 | + |
| 143 | + private String fetchData(Scanner scanner){ |
| 144 | + String line; |
| 145 | + String[] split; |
| 146 | + |
| 147 | + line = scanner.nextLine(); |
| 148 | + while (line.startsWith("#")){ |
| 149 | + // blast tabular with header options contains some more informations |
| 150 | + if (line.matches("#\\s.?BLAST.+")) programName = line.replace("#\\s",""); |
| 151 | + if (line.startsWith("# Query:")) queryName = line.replace("# Query: ",""); |
| 152 | + if (line.startsWith("# Database:")) databaseFile = line.replace("# Database: ",""); |
| 153 | + |
| 154 | + // needed because blast report can end with a comment... |
| 155 | + if (!scanner.hasNext()) return null; |
| 156 | + line = scanner.nextLine(); |
| 157 | + } |
| 158 | + |
| 159 | + // Here, programName != null checks if there was a header in the file |
| 160 | + boolean headerFound = programName != null; |
| 161 | + |
| 162 | + split = line.split("\\t"); |
| 163 | + queryId =split[0]; |
| 164 | + subjectId =split[1]; |
| 165 | + percIdentity =split[2]; |
| 166 | + alnLength =split[3]; |
| 167 | + mismatchCount=split[4]; |
| 168 | + gapOpenCount =split[5]; |
| 169 | + queryStart =split[6]; |
| 170 | + queryEnd =split[7]; |
| 171 | + subjectStart =split[8]; |
| 172 | + subjectEnd =split[9]; |
| 173 | + evalue =split[10]; |
| 174 | + bitScore =split[11]; |
| 175 | + |
| 176 | + // blast tabular reports only the first word of the query name. |
| 177 | + // If it was specified in the header it is better to use that definition |
| 178 | + if (parsingConsistency == PARSING_CONSISTENCY.IMPROVED && headerFound) { |
| 179 | + if (queryIdMapping.get(queryId)==null) { |
| 180 | + queryIdNumber ++; |
| 181 | + queryIdMapping.put(queryId,"Query_" + queryIdNumber); |
| 182 | + } |
| 183 | + // If a complete definition of the query name was readed, than we can use |
| 184 | + // a queryID schema that is consistent with blast xml report |
| 185 | + queryId = queryIdMapping.get(queryId); |
| 186 | + } |
| 187 | + if (!headerFound) queryName = queryId; |
| 188 | + |
| 189 | + return line; |
| 190 | + } |
| 191 | + |
| 192 | + @Override |
| 193 | + public void storeObjects(List<Result> results) throws Exception { |
| 194 | + throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates. |
| 195 | + } |
| 196 | + |
| 197 | + /** |
| 198 | + * Intended for use with run module. |
| 199 | + * Although possible does not make a lot of sense to have it with limited |
| 200 | + * information in report |
| 201 | + * @param sequences |
| 202 | + */ |
| 203 | + @Override |
| 204 | + public void setQueryReferences(List sequences) { |
| 205 | + throw new UnsupportedOperationException("Not supported for this parser."); |
| 206 | + } |
| 207 | + /** |
| 208 | + * Intended for use with run module. |
| 209 | + * Although possible does not make a lot of sense to have it with limited |
| 210 | + * information in report |
| 211 | + * @param sequences |
| 212 | + */ |
| 213 | + @Override |
| 214 | + public void setDatabaseReferences(List sequences) { |
| 215 | + throw new UnsupportedOperationException("Not supported for this parser."); |
| 216 | + } |
| 217 | + |
| 218 | +} |
0 commit comments