diff --git a/biojava-aa-prop/pom.xml b/biojava-aa-prop/pom.xml index 89ef2154ef..1a449c7ef8 100644 --- a/biojava-aa-prop/pom.xml +++ b/biojava-aa-prop/pom.xml @@ -2,7 +2,7 @@ biojava org.biojava - 4.2.6 + 4.2.7 4.0.0 biojava-aa-prop @@ -70,12 +70,12 @@ org.biojava biojava-core - 4.2.6 + 4.2.7 org.biojava biojava-structure - 4.2.6 + 4.2.7 diff --git a/biojava-alignment/pom.xml b/biojava-alignment/pom.xml index 41389cdd4e..7dfe24c010 100644 --- a/biojava-alignment/pom.xml +++ b/biojava-alignment/pom.xml @@ -4,7 +4,7 @@ biojava org.biojava - 4.2.6 + 4.2.7 biojava-alignment biojava-alignment @@ -46,7 +46,7 @@ org.biojava biojava-core - 4.2.6 + 4.2.7 compile @@ -74,7 +74,7 @@ org.biojava biojava-phylo - 4.2.6 + 4.2.7 diff --git a/biojava-core/pom.xml b/biojava-core/pom.xml index e3b0e9a7a1..90e465d2dc 100644 --- a/biojava-core/pom.xml +++ b/biojava-core/pom.xml @@ -3,7 +3,7 @@ biojava org.biojava - 4.2.6 + 4.2.7 4.0.0 biojava-core diff --git a/biojava-genome/pom.xml b/biojava-genome/pom.xml index b382489846..c82fade42d 100644 --- a/biojava-genome/pom.xml +++ b/biojava-genome/pom.xml @@ -3,7 +3,7 @@ biojava org.biojava - 4.2.6 + 4.2.7 4.0.0 biojava-genome @@ -85,13 +85,13 @@ org.biojava biojava-core - 4.2.6 + 4.2.7 compile org.biojava biojava-alignment - 4.2.6 + 4.2.7 compile diff --git a/biojava-genome/src/main/java/org/biojava/nbio/genome/util/ChromosomeMappingTools.java b/biojava-genome/src/main/java/org/biojava/nbio/genome/util/ChromosomeMappingTools.java index 07ace5278b..ea9538f333 100644 --- a/biojava-genome/src/main/java/org/biojava/nbio/genome/util/ChromosomeMappingTools.java +++ b/biojava-genome/src/main/java/org/biojava/nbio/genome/util/ChromosomeMappingTools.java @@ -13,8 +13,11 @@ import java.util.List; /** - * A class that take care of the painful mapping + * A class that can map chromosomal positions to mRNA (coding sequence) positions. + * + * @author Andreas Prlic */ + public class ChromosomeMappingTools { private static final Logger logger = LoggerFactory.getLogger(ChromosomeMappingTools.class); @@ -915,7 +918,7 @@ private static List> getCDSExonRangesForward(GeneChromosomePositi // /** - * I have a genomic coordinate, where is it in the Gene? + * I have a genomic coordinate, where is it on the mRNA * * @param coordinate * @param chromosomePosition @@ -924,7 +927,7 @@ private static List> getCDSExonRangesForward(GeneChromosomePositi public static int getCDSPosForChromosomeCoordinate(int coordinate, GeneChromosomePosition chromosomePosition) { if ( chromosomePosition.getOrientation() == '+') - return getCDSPosForward(coordinate, + return getCDSPosForward(coordinate, chromosomePosition.getExonStarts(), chromosomePosition.getExonEnds(), chromosomePosition.getCdsStart(), @@ -935,301 +938,139 @@ public static int getCDSPosForChromosomeCoordinate(int coordinate, GeneChromosom chromosomePosition.getExonEnds(), chromosomePosition.getCdsStart(), chromosomePosition.getCdsEnd()); - } + /** Converts the genetic coordinate to the position of the nucleotide on the mRNA sequence for a gene + * living on the forward DNA strand. + * + * @param chromPos The genetic coordinate on a chromosome + * @param exonStarts The list holding the genetic coordinates pointing to the start positions of the exons (including UTR regions) + * @param exonEnds The list holding the genetic coordinates pointing to the end positions of the exons (including UTR regions) + * @param cdsStart The start position of a coding region + * @param cdsEnd The end position of a coding region + * + * @return the position of the nucleotide base on the mRNA sequence corresponding to the input genetic coordinate (base 1) + * + * @author Yana Valasatava + */ + public static int getCDSPosForward(int chromPos, List exonStarts, List exonEnds, + int cdsStart, int cdsEnd) { - public static int getCDSPosReverse(int chromPos, List exonStarts, List exonEnds, - int cdsStart, int cdsEnd) { - boolean inCoding = false; - int codingLength = 0; - - if (cdsEnd < cdsStart) { - int tmp = cdsEnd; - cdsEnd = cdsStart; - cdsStart = tmp; - } - - logger.debug("looking for CDS position for " +format(chromPos)); - - - if ( chromPos < cdsStart+1 ) { - // this is not in a coding region! - - logger.debug(chromPos + " < " + cdsStart+1 ); + // the genetic coordinate is not in a coding region + if ( (chromPos < (cdsStart+1) ) || ( chromPos > (cdsEnd+1) ) ) { + logger.debug("The "+format(chromPos)+" position is not in a coding region"); return -1; } - if ( chromPos > cdsEnd+1 ) { - // this is not in a coding region! + logger.debug("looking for CDS position for " +format(chromPos)); - logger.debug(chromPos + " > " + cdsEnd+1 ); - return -1; + // remove exons that are fully landed in UTRs + List tmpS = new ArrayList(exonStarts); + List tmpE = new ArrayList(exonEnds); + + int j=0; + for (int i = 0; i < tmpS.size(); i++) { + if ( ( tmpE.get(i) < cdsStart) || ( tmpS.get(i) > cdsEnd) ) { + exonStarts.remove(j); + exonEnds.remove(j); + } + else { + j++; + } } - int lengthExons = 0; - - // map reverse - for (int i = exonStarts.size() - 1; i >= 0; i--) { - - logger.debug("Reverse Exon #" + (i+1) + "/" + exonStarts.size()); - int end = exonStarts.get(i); - int start = exonEnds.get(i); - - if (end < start) { - int tmp = end; - end = start; - start = tmp; - } - lengthExons += end - start; - - - logger.debug(" is " + format(chromPos) + " part of Reverse exon? s:" + format(start+1) + " - e:" + format(end) + " | " + (end - start+1)); - logger.debug(" CDS start: " + format(cdsStart+1) + "-" + format(cdsEnd) + " coding length counter:" + codingLength); - - - - if (start+1 <= cdsEnd && end >= cdsEnd ) { - - // first exon with UTR - - inCoding = true; - - int tmpstart = start; - if (start < cdsStart) { - tmpstart = cdsStart; - } - - - logger.debug(" --- codingLength " + codingLength + - " s:" + - format(tmpstart+1) + - " e:" + - format(cdsEnd) + - " p:" + - format(chromPos) + " tmp: " + (chromPos - cdsStart)); - - logger.debug("check: " + (codingLength + cdsEnd - tmpstart+1) + " ==?? " + format(chromPos)); - - int tmp = cdsEnd - chromPos ; - // if (codingLength + cdsEnd - tmpstart >= chromPos) { - //if (end >= chromPos && start + (end-start) >= chromPos) { - // if (codingLength + cdsEnd - tmpstart >= chromPos) { - if ( chromPos >= start +1 && chromPos <= end){ - - - logger.debug(" -> found position in UTR exon: P: " + format(chromPos) + " s:" + format(tmpstart+1) + " l:" + format(tmp) + " cdsS:" + format(cdsStart+1) + " cdsE:" + format(cdsEnd) + " codingL:" + codingLength); - return codingLength + tmp; - } - - - logger.debug(" codinglength " + codingLength + " + " + (cdsEnd - tmpstart ) ); - - // do not add 1 here - codingLength += (cdsEnd - tmpstart ); - - boolean debug = logger.isDebugEnabled(); - - if (debug) { - StringBuffer b = new StringBuffer(); - b.append(" UTR :" + format(cdsEnd + 1) + " - " + format(end) + newline); - if (tmpstart == start) - b.append(" -> "); - else - b.append(" <-> "); - b.append("Reverse Exon :" + format(tmpstart+1) + " - " + (cdsEnd) + " | " + format(cdsEnd - tmpstart) + " - " + codingLength + " | " + (codingLength % 3) + newline); - - logger.debug(b.toString()); - - // single exon with UTR on both ends - if (tmpstart != start) - logger.debug(" UTR :" + format(cdsStart - 1) + " - " + format(start)); - } - } else if (start <= cdsStart && end >= cdsStart) { - - // terminal exon with UTR - inCoding = false; - - - logger.debug(format(start + codingLength + end - cdsStart) + " ?? " + format(chromPos)); - // (start + codingLength + end - cdsStart >= chromPos && - if (( start+1 <= chromPos) && ( end >= chromPos)) { - - //int tmp = end - cdsStart ; -// int tmp = chromPos - cdsStart ; -// int l = end - cdsStart; - int tmp = end-chromPos ; - if ( tmp > end -cdsStart) { - tmp = end-cdsStart ; - - logger.debug("Adjust tmp to " + tmp); - } - - - - logger.debug( codingLength + " | " + (end -chromPos) + " | " + (end - cdsStart) ); - logger.debug(" <- Exon : " + format(cdsStart) + " - " + format(end) + " | " + format(end - cdsStart +1) + " | "); - logger.debug(" UTR : " + format(start+1) + " - " + format(cdsStart )); - logger.debug(" <- YYY found position noncoding exon: #" + (i+1) + " " + format(chromPos) + " s:" + format(start) + " tmp: " + format(tmp) + " cdsStart" + format(cdsStart) + " cdl:" + codingLength + " " + format(cdsEnd)); - - return codingLength + tmp; - } - - - logger.debug(" codinglength " + codingLength + " + " + (end - cdsStart) ); - codingLength += (end - cdsStart+1); - - logger.debug(" <- Exon : " + format(cdsStart+1) + " - " + format(end) + " | " + format(end - cdsStart) + " | " + codingLength + " | " + (codingLength % 3)); - logger.debug(" UTR : " + format(start+1) + " - " + format(cdsStart )); - - } else if (inCoding) { - // standard coding exon - // if (codingLength + end - start >= chromPos) { - if ( chromPos >= start+1 && chromPos <= end) { - - int tmp = end -chromPos ; - if ( tmp > (end-start+1)) { - - tmp = (end - start+1); - - logger.debug("Adjusting tmp to " + tmp); - } - - - logger.debug(" -> found position in reverse coding exon: #" + (i+1) + " chromPos:" + format(chromPos) + " start:" + format(start+1) + " end:" + format(end) + " tmp:" + tmp + " cdsStart:" + cdsStart + " codingLength:" + codingLength); - - return codingLength+tmp; - } - - // full exon is coding - - logger.debug(" codinglength " + codingLength + " + " + (end - start) ); - // don't add 1 - codingLength += (end - start); - - logger.debug(" Exon : " + format(start+1) + " - " + format(end) + " | " + format(end - start) + " | " + codingLength + " | " + (codingLength % 3)); - } else { - // e.g. see UBQLN3 + // remove untranslated regions from exons + int nExons = exonStarts.size(); + exonStarts.remove(0); + exonStarts.add(0, cdsStart); + exonEnds.remove(nExons-1); + exonEnds.add(cdsEnd); - logger.debug(" no translation! cdl:" + codingLength); - - } + int codingLength = 0; + int lengthExon = 0; - //if ( inCoding ) - // logger.debug(" exon phase at end:" + ((codingLength) % 3)); + // map the genetic coordinate on a stretch of a reverse strand + for (int i = 0; i < nExons; i++) { - logger.debug(" coding length: " + codingLength + "(phase:" + (codingLength % 3) + ") CDS POS trying to map:" + chromPos); + int start = exonStarts.get(i); + int end = exonEnds.get(i); + lengthExon = end - start; + if (start+1 <= chromPos && end >= chromPos ) { + return codingLength + (chromPos-start); + } + else { + codingLength += lengthExon; + } } - - logger.debug("length exons: " + lengthExons); - // could not map, or map over the full length?? - - return -1; - } - /** - * Get the chromosome position mapped onto the mrna CDS transcript position (needs to be divided by 3 to get protein coordinate) + /** Converts the genetic coordinate to the position of the nucleotide on the mRNA sequence for a gene + * living on the reverse DNA strand. + * + * @param chromPos The genetic coordinate on a chromosome + * @param exonStarts The list holding the genetic coordinates pointing to the start positions of the exons (including UTR regions) + * @param exonEnds The list holding the genetic coordinates pointing to the end positions of the exons (including UTR regions) + * @param cdsStart The start position of a coding region + * @param cdsEnd The end position of a coding region * - * @param exonStarts - * @param exonEnds - * @param cdsStart - * @param cdsEnd - * @return - */ - public static int getCDSPosForward(int chromPos, List exonStarts, List exonEnds, - int cdsStart, int cdsEnd) { - boolean inCoding = false; - int codingLength = 0; - - - logger.debug("looking for CDS position for " +chromPos); - - int lengthExons = 0; - // map forward - for (int i = 0; i < exonStarts.size(); i++) { - - - // start can include UTR - int start = exonStarts.get(i); - int end = exonEnds.get(i); - - lengthExons += end - start; - - - logger.debug("forward exon: " + (start+1) + " - " + end + " | " + (end - start) + " ? overlaps with " + format(chromPos)); - - if (start +1 <= cdsStart +1 && end >= cdsStart+1) { - - if (end >= chromPos) { - // we are reaching our target position - // -1 is important here... - int tmp = chromPos - cdsStart -1; - - - logger.debug("cdl:" + codingLength + " | " + tmp); - logger.debug(" -> found position in UTR exon: " + chromPos + " " + format(start) + " " + format(tmp) + " " + cdsStart + " " + codingLength); - - return codingLength + tmp; - } - inCoding = true; - codingLength += (end - cdsStart); - - logger.debug(" UTR : " + format(start) + " - " + (cdsStart )); - logger.debug(" -> Exon : " + format(cdsStart+1) + " - " + format(end) + " | " + format(end - cdsStart) + " | " + codingLength + " | " + (codingLength % 3)); - - } else if (start <= cdsEnd && end >= cdsEnd) { - //logger.debug(" <-- CDS end at: " + cdsEnd ); - inCoding = false; - if (cdsEnd >= chromPos && (start +1 <= chromPos)) { - int tmp = chromPos - start -1 ; - - - logger.debug(" -> cdsForward found position in non coding exon#"+i+": " + chromPos + " " + format(start+1) + " " + format(tmp) + " " + cdsStart + " " + codingLength); - return codingLength + tmp ; - } - codingLength += (cdsEnd - start); - - logger.debug(" <- Exon : " + format(start+1) + " - " + format(cdsEnd) + " | " + format(cdsEnd - start+1) + " | " + codingLength + " | " + (codingLength % 3)); - logger.debug(" UTR : " + format(cdsEnd + 1) + " - " + format(end)); - - - } else if (inCoding) { - - if (end >= chromPos && (start +1 <=chromPos)) { - - int tmp = chromPos-start-1 ; - - - logger.debug(codingLength + " | " + tmp); - logger.debug(" -> found position in coding exon #" + (i + 1) + ": " + format(chromPos) + " " + format(start + 1) + " " + format(tmp) + " " + cdsStart + " " + codingLength); - - - return codingLength + tmp ; - } - // full exon is coding - codingLength += (end - start); + * @return the position of the nucleotide base on the mRNA sequence corresponding to the input genetic coordinate (base 1) + * + * @author Yana Valasatava + */ + public static int getCDSPosReverse(int chromPos, List exonStarts, List exonEnds, + int cdsStart, int cdsEnd) { - logger.debug(" Exon :" + format(start) + " - " + format(end) + " | " + format(end - start) + " | " + codingLength + " | " + (codingLength % 3)); - } - // if ( inCoding ) - // logger.debug("exon phase at end:" + (codingLength % 3)); - // - // logger.debug(" coding length: " + codingLength); + // the genetic coordinate is not in a coding region + if ( (chromPos < (cdsStart+1)) || ( chromPos > (cdsEnd+1) ) ) { + logger.debug("The "+format(chromPos)+" position is not in a coding region"); + return -1; + } + logger.debug("looking for CDS position for " +format(chromPos)); + // remove exons that are fully landed in UTRs + List tmpS = new ArrayList(exonStarts); + List tmpE = new ArrayList(exonEnds); + + int j=0; + for (int i = tmpS.size() - 1; i >= 0; i--) { + if ( ( tmpE.get(i) < cdsStart) || ( tmpS.get(i) > cdsEnd) ) { + exonStarts.remove(j); + exonEnds.remove(j); + } + else { + j++; + } } - //logger.debug("length exons: " + lengthExons); - //return codingLength - 3; - - // could not map! + // remove untranslated regions from exons + int nExons = exonStarts.size(); + exonStarts.remove(0); + exonStarts.add(0, cdsStart); + exonEnds.remove(nExons-1); + exonEnds.add(cdsEnd); + int codingLength = 0; + int lengthExon = 0; + + // map the genetic coordinate on a stretch of a reverse strand + for (int i = nExons - 1; i >= 0; i--) { + + int start = exonStarts.get(i); + int end = exonEnds.get(i); + + lengthExon = end - start; + // +1 offset to be a base 1 + if (start+1 <= chromPos && end >= chromPos ) { + return codingLength + (end-chromPos+1); + } + else { + codingLength += lengthExon; + } + } return -1; } - - } diff --git a/biojava-genome/src/test/java/org/biojava/nbio/genome/TestGenomeMapping.java b/biojava-genome/src/test/java/org/biojava/nbio/genome/TestGenomeMapping.java index b4615b9db2..7f9ded778a 100644 --- a/biojava-genome/src/test/java/org/biojava/nbio/genome/TestGenomeMapping.java +++ b/biojava-genome/src/test/java/org/biojava/nbio/genome/TestGenomeMapping.java @@ -10,142 +10,306 @@ import java.io.InputStream; import java.net.URL; +import java.util.Arrays; import java.util.List; import java.util.zip.GZIPInputStream; /** * Created by andreas on 7/19/16. */ -public class TestGenomeMapping extends TestCase{ +public class TestGenomeMapping extends TestCase { - private static final String geneChromosomeFile = "http://cdn.rcsb.org/gene/hg38/geneChromosome38.tsf.gz"; + private static final String geneChromosomeFile = "http://cdn.rcsb.org/gene/hg38/geneChromosome38.tsf.gz"; - private List gcps = null; + private List gcps = null; - @Override - protected void setUp() throws Exception { - super.setUp(); - InputStream input = new GZIPInputStream(new URL(geneChromosomeFile).openStream()); - gcps = GeneChromosomePositionParser.getChromosomeMappings(input); + @Override + protected void setUp() throws Exception { + super.setUp(); + InputStream input = new GZIPInputStream(new URL(geneChromosomeFile).openStream()); + gcps = GeneChromosomePositionParser.getChromosomeMappings(input); + } + @Test + public void testAK1() { + String geneName = "AK1"; - } + assertNotNull(gcps); + assertTrue("Problems with downloading refFlat file from UCSC browser ", gcps.size() > 100); + int uniProtLength = 194; - @Test - public void testAK1() { - String geneName = "AK1"; + try { - assertNotNull(gcps); - assertTrue("Problems with downloading refFlat file from UCSC browser ", gcps.size() > 100); + for (GeneChromosomePosition pos : gcps) { - int uniProtLength = 194; + //System.out.println(pos.getGeneName()); + if (!pos.getGeneName().equals(geneName)) + continue; - try { + /// there are three alternative transcripts for AK1. + // we are just testing one here: - for (GeneChromosomePosition pos : gcps) { + if ( ! pos.getGenebankId().equals("NM_000476")) + continue; - //System.out.println(pos.getGeneName()); - if (!pos.getGeneName().equals(geneName)) - continue; + assertTrue(pos.getGeneName().equals(geneName)); + assertTrue(pos.getOrientation().equals('-')); + assertTrue(pos.getChromosome().equals("chr9")); - /// there are three alternative transcripts for AK1. - // we are just testing one here: + List> cdsranges = ChromosomeMappingTools.getCDSExonRanges(pos); - if ( ! pos.getGenebankId().equals("NM_000476")) - continue; + validateExon(0,0,7, cdsranges ); + validateExon(1,7,43, cdsranges ); + validateExon(2,43,207, cdsranges ); + validateExon(3,207,324, cdsranges ); + validateExon(4,324,516, cdsranges ); + validateExon(5,516,585, cdsranges ); - assertTrue(pos.getGeneName().equals(geneName)); - assertTrue(pos.getOrientation().equals('-')); - assertTrue(pos.getChromosome().equals("chr9")); - List> cdsranges = ChromosomeMappingTools.getCDSExonRanges(pos); + int cdslength = ChromosomeMappingTools.getCDSLength(pos); - validateExon(0,0,7, cdsranges ); - validateExon(1,7,43, cdsranges ); - validateExon(2,43,207, cdsranges ); - validateExon(3,207,324, cdsranges ); - validateExon(4,324,516, cdsranges ); - validateExon(5,516,585, cdsranges ); + assertTrue("CDS length should be 582, but is " + cdslength, cdslength == (uniProtLength *3)); + List> chromranges = ChromosomeMappingTools.getChromosomalRangesForCDS(pos); - int cdslength = ChromosomeMappingTools.getCDSLength(pos); + // we are reverse strand. reverse the order + chromranges = Lists.reverse(chromranges); - assertTrue("CDS length should be 582, but is " + cdslength, cdslength == (uniProtLength *3)); + assertTrue(chromranges.size() == 6); - List> chromranges = ChromosomeMappingTools.getChromosomalRangesForCDS(pos); + // compare with https://www.ncbi.nlm.nih.gov/CCDS/CcdsBrowse.cgi?REQUEST=CCDS&DATA=CCDS6881 + validateExon(0,127868008,127868076, chromranges ); + validateExon(1,127868320,127868512, chromranges ); + validateExon(2,127871822,127871939, chromranges ); + validateExon(3,127872689,127872853, chromranges ); + validateExon(4,127873025,127873061, chromranges ); + validateExon(5,127874610,127874617, chromranges ); - // we are reverse strand. reverse the order - chromranges = Lists.reverse(chromranges); + } + } catch (Exception e) { + fail(e.getMessage()); + } + } - assertTrue(chromranges.size() == 6); + @Test + public void testHBA(){ - // compare with https://www.ncbi.nlm.nih.gov/CCDS/CcdsBrowse.cgi?REQUEST=CCDS&DATA=CCDS6881 - validateExon(0,127868008,127868076, chromranges ); - validateExon(1,127868320,127868512, chromranges ); - validateExon(2,127871822,127871939, chromranges ); - validateExon(3,127872689,127872853, chromranges ); - validateExon(4,127873025,127873061, chromranges ); - validateExon(5,127874610,127874617, chromranges ); - - } - } catch (Exception e) { - fail(e.getMessage()); - } - } - - @Test - public void testHBA(){ - - String geneName = "HBA1"; - assertNotNull(gcps); - - assertTrue("Problems with downloading refFlat file from UCSC browser ", gcps.size() > 100); - - try { - - for ( GeneChromosomePosition pos : gcps){ - - //System.out.println(pos.getGeneName()); - if ( ! pos.getGeneName().equals(geneName)) - continue; - - assertTrue(pos.getGeneName().equals("HBA1")); - assertTrue(pos.getGenebankId().equals("NM_000558")); - assertTrue(pos.getChromosome().equals("chr16")); - assertTrue(pos.getTranscriptionStart().equals(176650)); - assertTrue(pos.getTranscriptionEnd().equals(177522)); - assertTrue(pos.getOrientation().equals('+')); - - List> cdsranges = ChromosomeMappingTools.getCDSExonRanges(pos); - - assertTrue(cdsranges.size() == 3); - - validateExon(0,0,95,cdsranges); - validateExon(1,95,300,cdsranges); - validateExon(2,300,429,cdsranges); - - - List> chromranges = ChromosomeMappingTools.getChromosomalRangesForCDS(pos); - - validateExon(0,176716,176811, chromranges ); - validateExon(1,176928,177133, chromranges ); - validateExon(2,177282,177411, chromranges ); - - - } - } catch (Exception e){ - fail(e.getMessage()); - } - - - } - - private void validateExon(int exonNr, int start, int stop, List> cdsranges) { - - Range exon = cdsranges.get(exonNr); - assertTrue("Exon " + exonNr + " boundary "+ exon.lowerEndpoint() + " does not match " +start , exon.lowerEndpoint().equals(start)); - assertTrue("Exon " + exonNr + " boundary " + exon.upperEndpoint() + " does not match " + stop, exon.upperEndpoint().equals(stop)); - - } + String geneName = "HBA1"; + assertNotNull(gcps); + + assertTrue("Problems with downloading refFlat file from UCSC browser ", gcps.size() > 100); + + try { + + for ( GeneChromosomePosition pos : gcps){ + + //System.out.println(pos.getGeneName()); + if ( ! pos.getGeneName().equals(geneName)) + continue; + + assertTrue(pos.getGeneName().equals("HBA1")); + assertTrue(pos.getGenebankId().equals("NM_000558")); + assertTrue(pos.getChromosome().equals("chr16")); + assertTrue(pos.getTranscriptionStart().equals(176650)); + assertTrue(pos.getTranscriptionEnd().equals(177522)); + assertTrue(pos.getOrientation().equals('+')); + + List> cdsranges = ChromosomeMappingTools.getCDSExonRanges(pos); + + assertTrue(cdsranges.size() == 3); + + validateExon(0,0,95,cdsranges); + validateExon(1,95,300,cdsranges); + validateExon(2,300,429,cdsranges); + + + List> chromranges = ChromosomeMappingTools.getChromosomalRangesForCDS(pos); + + validateExon(0,176716,176811, chromranges ); + validateExon(1,176928,177133, chromranges ); + validateExon(2,177282,177411, chromranges ); + + + } + } catch (Exception e){ + fail(e.getMessage()); + } + + + } + + private void validateExon(int exonNr, int start, int stop, List> cdsranges) { + + Range exon = cdsranges.get(exonNr); + assertTrue("Exon " + exonNr + " boundary "+ exon.lowerEndpoint() + " does not match " +start , exon.lowerEndpoint().equals(start)); + assertTrue("Exon " + exonNr + " boundary " + exon.upperEndpoint() + " does not match " + stop, exon.upperEndpoint().equals(stop)); + + } + + /** Get the position of the nucleotide base corresponding to the position of that base on the mRNA sequence + * for a gene living on the reverse DNA strand. + * + * @author Yana Valasatava + */ + private int getPositionInmRNA(String geneName, String genebankId, int posChrom) { + for (GeneChromosomePosition gcp : gcps) { + if ( gcp.getGeneName().equals(geneName) ) { + if ( gcp.getGenebankId().equals(genebankId) ) { + return ChromosomeMappingTools.getCDSPosForChromosomeCoordinate(posChrom, gcp); + } + } + } + return -1; + } + + /** Make sure the mapping tool correctly retrieves the mRNA position for a gene + * living on the forward DNA strand for different chromosome positions. + * + * @author Yana Valasatava + */ + @Test + public void testForwardMappingPositions() { + + String geneName = "HORMAD2"; // gene on the forward DNA strand + String genebankId = "NM_152510"; // GeneBank ID for the transcript used for testing (ENST00000336726) + + List scenarios = Arrays.asList("first1exon", "last1exon", "last3exon"); + + int cds; + int posExonStart; + int posInmRNA; + for (String scenario : scenarios) { + + switch (scenario) { + + case "first1exon": + posExonStart = 30093953; // ending position of the last exon coding region (on forward strand) + posInmRNA = 1; // base 1 position in mRNA sequence + cds = getPositionInmRNA(geneName, genebankId, posExonStart); + assertEquals(cds, posInmRNA); + break; + + case "last1exon": + posExonStart = 30094003; // starting position of the last exon coding region (on forward strand) + posInmRNA = 51; // position in mRNA sequence equals to the length of the exon + cds = getPositionInmRNA(geneName, genebankId, posExonStart); + assertEquals(cds, posInmRNA); + break; + + case "last3exon": + posExonStart = 30103500; // starting position of the first base in a coding region (3rd exon) + posInmRNA = 257; // position in mRNA sequence equals to the sum length of the 3 last exons + cds = getPositionInmRNA(geneName, genebankId, posExonStart); + assertEquals(cds, posInmRNA); + break; + } + } + } + + /** Make sure the mapping tool correctly retrieves the mRNA position for a gene + * living on the reverse DNA strand for different chromosome positions. + * + * @author Yana Valasatava + */ + @Test + public void testReverseMappingPositions() { + + String geneName = "BCL11B"; // gene on the reverse DNA strand + String genebankId = "NM_138576"; // GeneBank ID for the transcript used for testing (ENST00000357195) + + List scenarios = Arrays.asList("first1exon", "last1exon", "last3exon"); + + int cds; + int posExonStart; + int posInmRNA; + for (String scenario : scenarios) { + + switch (scenario) { + + case "first1exon": + posExonStart = 99271218; // ending position of the last exon coding region (on forward strand) + posInmRNA = 1; // base 1 position in mRNA sequence + cds = getPositionInmRNA(geneName, genebankId, posExonStart); + assertEquals(cds, posInmRNA); + break; + + case "last1exon": + posExonStart = 99271161; // starting position of the last exon coding region (on forward strand) + posInmRNA = 58; // position in mRNA sequence equals to the length of the exon + cds = getPositionInmRNA(geneName, genebankId, posExonStart); + assertEquals(cds, posInmRNA); + break; + + case "last3exon": + posExonStart = 99231345; // starting position of the first base in a coding region (3rd exon) + posInmRNA = 640; // position in mRNA sequence equals to the sum length of the 3 last exons + cds = getPositionInmRNA(geneName, genebankId, posExonStart); + assertEquals(cds, posInmRNA); + break; + } + } + } + + /** Test to make sure the mapping tool correctly identify that position falls outside the coding region + * for a gene living on the forward DNA strand. + * + * @author Yana Valasatava + */ + @Test + public void testForwardMappingForExonBoundaries() { + + String geneName = "HBA1"; // gene on the reverse DNA strand + String genebankId = "NM_000558"; // GeneBank ID for the transcript used for testing (ENST00000320868) + + int posExonStart = 176717; // starting position of the first base in a coding region (1st exon) + int posExonEnd = 176811; // ending position of the first base in a coding region (1st exon) + + int cdsSE = getPositionInmRNA(geneName, genebankId, posExonStart-1); + assertEquals(cdsSE, -1); + + int cdsEE = getPositionInmRNA(geneName, genebankId, posExonEnd+1); + assertEquals(cdsEE, -1); + } + + /** Test to make sure the mapping tool correctly identify that position falls outside the coding region + * for a gene living on the reverse DNA strand. + * + * @author Yana Valasatava + */ + @Test + public void testReverseMappingForExonBoundaries() { + + String geneName = "BCL11B"; // gene on the reverse DNA strand + String genebankId = "NM_138576"; // GeneBank ID for the transcript used for testing (ENST00000357195) + + int posExonStart = 99174151; // starting position of the first base in a coding region (1st exon) + int posExonEnd = 99176195; // ending position of the first base in a coding region (1st exon) + + int cdsSE = getPositionInmRNA(geneName, genebankId, posExonStart-1); + assertEquals(cdsSE, -1); + + int cdsEE = getPositionInmRNA(geneName, genebankId, posExonEnd+1); + assertEquals(cdsEE, -1); + } + + /** Test to make sure the mapping tool correctly converts the genetic position to a position on mRNA + * when multiple UTR regions are consecutive. + * + * @author Yana Valasatava + */ + @Test + public void testMappingCromosomePosTomRNAMultiUTRs() { + + String geneName = "ILK"; // gene on the reverse DNA strand + String genebankId = "NM_001278442"; // GeneBank ID for the transcript used for testing (ENST00000532063) + + int chromPos = 6608760; + int mRNAPos = 16; + + int cds = getPositionInmRNA(geneName, genebankId, chromPos); + assertEquals(cds, mRNAPos); + + } } + diff --git a/biojava-integrationtest/pom.xml b/biojava-integrationtest/pom.xml index dca85357b3..c52783d2a1 100644 --- a/biojava-integrationtest/pom.xml +++ b/biojava-integrationtest/pom.xml @@ -4,7 +4,7 @@ biojava org.biojava - 4.2.6 + 4.2.7 biojava-integrationtest jar @@ -32,7 +32,7 @@ org.biojava biojava-structure - 4.2.6 + 4.2.7 diff --git a/biojava-modfinder/pom.xml b/biojava-modfinder/pom.xml index e969b11f09..0ca1247f39 100644 --- a/biojava-modfinder/pom.xml +++ b/biojava-modfinder/pom.xml @@ -4,7 +4,7 @@ biojava org.biojava - 4.2.6 + 4.2.7 biojava-modfinder biojava-modfinder @@ -31,7 +31,7 @@ org.biojava biojava-structure - 4.2.6 + 4.2.7 jar compile diff --git a/biojava-ontology/pom.xml b/biojava-ontology/pom.xml index 305039b548..03b1c00b76 100644 --- a/biojava-ontology/pom.xml +++ b/biojava-ontology/pom.xml @@ -4,7 +4,7 @@ org.biojava biojava - 4.2.6 + 4.2.7 biojava-ontology diff --git a/biojava-phylo/pom.xml b/biojava-phylo/pom.xml index bc55ac6307..d374a993fd 100644 --- a/biojava-phylo/pom.xml +++ b/biojava-phylo/pom.xml @@ -3,7 +3,7 @@ biojava org.biojava - 4.2.6 + 4.2.7 4.0.0 biojava-phylo @@ -44,7 +44,7 @@ org.biojava biojava-core - 4.2.6 + 4.2.7 compile diff --git a/biojava-protein-disorder/pom.xml b/biojava-protein-disorder/pom.xml index b95632bc2c..73c642d3e1 100644 --- a/biojava-protein-disorder/pom.xml +++ b/biojava-protein-disorder/pom.xml @@ -3,7 +3,7 @@ biojava org.biojava - 4.2.6 + 4.2.7 biojava-protein-disorder jar @@ -63,7 +63,7 @@ org.biojava biojava-core - 4.2.6 + 4.2.7 diff --git a/biojava-sequencing/pom.xml b/biojava-sequencing/pom.xml index 0990a8e9b2..0b4d66e069 100644 --- a/biojava-sequencing/pom.xml +++ b/biojava-sequencing/pom.xml @@ -3,7 +3,7 @@ biojava org.biojava - 4.2.6 + 4.2.7 4.0.0 biojava-sequencing @@ -47,7 +47,7 @@ org.biojava biojava-core - 4.2.6 + 4.2.7 compile diff --git a/biojava-structure-gui/pom.xml b/biojava-structure-gui/pom.xml index f96aad2a97..cfd684d103 100644 --- a/biojava-structure-gui/pom.xml +++ b/biojava-structure-gui/pom.xml @@ -3,7 +3,7 @@ biojava org.biojava - 4.2.6 + 4.2.7 4.0.0 biojava-structure-gui @@ -25,13 +25,13 @@ org.biojava biojava-structure - 4.2.6 + 4.2.7 compile org.biojava biojava-core - 4.2.6 + 4.2.7 compile diff --git a/biojava-structure/pom.xml b/biojava-structure/pom.xml index ac6de4757a..db1c41a4f1 100644 --- a/biojava-structure/pom.xml +++ b/biojava-structure/pom.xml @@ -4,7 +4,7 @@ biojava org.biojava - 4.2.6 + 4.2.7 biojava-structure biojava-structure @@ -22,13 +22,13 @@ org.biojava biojava-alignment - 4.2.6 + 4.2.7 compile org.biojava biojava-core - 4.2.6 + 4.2.7 compile diff --git a/biojava-survival/pom.xml b/biojava-survival/pom.xml index 1f03bad9be..e6c0294b34 100644 --- a/biojava-survival/pom.xml +++ b/biojava-survival/pom.xml @@ -4,7 +4,7 @@ org.biojava biojava - 4.2.6 + 4.2.7 biojava-survival diff --git a/biojava-ws/pom.xml b/biojava-ws/pom.xml index 646bd14a0b..0f2e83e414 100644 --- a/biojava-ws/pom.xml +++ b/biojava-ws/pom.xml @@ -3,7 +3,7 @@ biojava org.biojava - 4.2.6 + 4.2.7 biojava-ws biojava-ws @@ -19,7 +19,7 @@ org.biojava biojava-core - 4.2.6 + 4.2.7 compile diff --git a/biojava-ws/src/main/java/org/biojava/nbio/ws/hmmer/RemoteHmmerScan.java b/biojava-ws/src/main/java/org/biojava/nbio/ws/hmmer/RemoteHmmerScan.java index 4e83b61ec8..586aad785f 100644 --- a/biojava-ws/src/main/java/org/biojava/nbio/ws/hmmer/RemoteHmmerScan.java +++ b/biojava-ws/src/main/java/org/biojava/nbio/ws/hmmer/RemoteHmmerScan.java @@ -23,6 +23,8 @@ import net.sf.json.JSONArray; import net.sf.json.JSONObject; import org.biojava.nbio.core.sequence.ProteinSequence; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.*; import java.net.HttpURLConnection; @@ -31,14 +33,17 @@ import java.util.TreeSet; -/** Makes remote calls to the HMMER web service at the EBI web site and returns Pfam domain annotations for an input protein sequence. +/** + * Makes remote calls to the HMMER web service at the EBI web site and returns Pfam domain annotations for an input protein sequence. * * @author Andreas Prlic * @since 3.0.3 */ public class RemoteHmmerScan implements HmmerScan { - public static String HMMER_SERVICE = "http://www.ebi.ac.uk/Tools/hmmer/search/hmmscan"; + private static final Logger LOGGER = LoggerFactory.getLogger(RemoteHmmerScan.class); + + public static final String HMMER_SERVICE = "http://www.ebi.ac.uk/Tools/hmmer/search/hmmscan"; public RemoteHmmerScan(){ @@ -54,7 +59,8 @@ public SortedSet scan(ProteinSequence sequence) throws IOException } - /** Scans a protein sequence for Pfam profile matches. + /** + * Scans a protein sequence for Pfam profile matches. * * @param sequence * @param serviceLocation @@ -68,7 +74,7 @@ public SortedSet scan(ProteinSequence sequence, URL serviceLocation postContent.append("hmmdb=pfam"); - // by default hmmscan runs with the HMMER3 cut_ga parameter enabled, the "gathering freshold", which depends on + // by default hmmscan runs with the HMMER3 cut_ga parameter enabled, the "gathering threshold", which depends on // the cutoffs defined in the underlying HMM files. // to request a different cutoff by e-value this could be enabled: //postContent.append("&E=1"); @@ -86,7 +92,7 @@ public SortedSet scan(ProteinSequence sequence, URL serviceLocation connection.setRequestMethod("POST"); connection.setRequestProperty("Content-Type", "application/x-www-form-urlencoded"); - connection.setRequestProperty("Accept:","application/json"); + connection.setRequestProperty("Accept","application/json"); connection.setRequestProperty("Content-Length", "" + Integer.toString(postContent.toString().getBytes().length)); @@ -99,13 +105,12 @@ public SortedSet scan(ProteinSequence sequence, URL serviceLocation wr.close (); -// //Now get the redirect URL + //Now get the redirect URL URL respUrl = new URL( connection.getHeaderField( "Location" )); int responseCode = connection.getResponseCode(); if ( responseCode == 500){ - System.err.println("something went wrong!" + serviceLocation); - System.err.println(connection.getResponseMessage()); + LOGGER.warn("Got 500 response code for URL {}. Response message: {}.", serviceLocation, connection.getResponseMessage()); } HttpURLConnection connection2 = (HttpURLConnection) respUrl.openConnection(); @@ -122,7 +127,6 @@ public SortedSet scan(ProteinSequence sequence, URL serviceLocation StringBuffer result = new StringBuffer(); while ((inputLine = in.readLine()) != null) { - //System.out.println(inputLine); result.append(inputLine); } @@ -141,7 +145,6 @@ public SortedSet scan(ProteinSequence sequence, URL serviceLocation for(int i =0 ; i < hits.size() ; i++){ JSONObject hit = hits.getJSONObject(i); - //System.out.println("hit: "+ hit); HmmerResult hmmResult = new HmmerResult(); @@ -170,13 +173,8 @@ public SortedSet scan(ProteinSequence sequence, URL serviceLocation SortedSet domains = new TreeSet(); for ( int j= 0 ; j < hmmdomains.size() ; j++){ JSONObject d = hmmdomains.getJSONObject(j); - //System.out.println(d); Integer is_included = getInteger(d.get("is_included")); if ( is_included == 0) { -// System.out.println(" excluding: " + d.get("alihmmdesc") + " " + d.get("alihmmname") + " " + -// hit.get("evalue") + " " + -// d.get("alisqfrom") + " " + -// d.get("alisqto")); continue; } @@ -184,21 +182,12 @@ public SortedSet scan(ProteinSequence sequence, URL serviceLocation // this filters out multiple hits to the same clan Integer outcompeted = getInteger(d.get("outcompeted")); if ( outcompeted != null && outcompeted == 1) { -// System.out.println(" outcompeted: " + d.get("alihmmdesc") + " " + d.get("alihmmname")+ " " + -// hit.get("evalue") + " " + -// d.get("alisqfrom") + " " + -// d.get("alisqto") -// ); continue; } Integer significant = getInteger(d.get("significant")); if ( significant != 1) { -// System.out.println(" not significant: " + d.get("alihmmdesc") + " " + d.get("alihmmname")+ " " + -// hit.get("evalue") + " " + -// d.get("alisqfrom") + " " + -// d.get("alisqto")); continue; } @@ -224,8 +213,8 @@ public SortedSet scan(ProteinSequence sequence, URL serviceLocation results.add(hmmResult); } - } catch (Exception e){ - e.printStackTrace(); + } catch (NumberFormatException e){ + LOGGER.warn("Could not parse number in Hmmer web service json response: {}", e.getMessage()); } return results; diff --git a/pom.xml b/pom.xml index 3993929e35..6513c4cb61 100644 --- a/pom.xml +++ b/pom.xml @@ -12,7 +12,7 @@ org.biojava biojava pom - 4.2.6 + 4.2.7 biojava BioJava is an open-source project dedicated to providing a Java framework for processing biological data. It provides analytical and statistical routines, parsers for common file formats and allows the @@ -44,7 +44,7 @@ scm:git:git@github.com:biojava/biojava.git https://github.com/biojava/biojava - biojava-4.2.6 + biojava-4.2.7