Skip to content

Commit 4d00cd6

Browse files
author
Yana Valasatava
committed
Class to extract protein sequence from genetic positions
1 parent 4c47e34 commit 4d00cd6

1 file changed

Lines changed: 193 additions & 0 deletions

File tree

Lines changed: 193 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
package org.biojava.nbio.genome.util;
2+
3+
import java.io.File;
4+
import java.io.IOException;
5+
import java.util.ArrayList;
6+
import java.util.List;
7+
8+
import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
9+
import org.biojava.nbio.core.sequence.DNASequence;
10+
import org.biojava.nbio.core.sequence.RNASequence;
11+
import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
12+
import org.biojava.nbio.core.sequence.template.SequenceView;
13+
import org.biojava.nbio.genome.parsers.genename.GeneChromosomePosition;
14+
import org.biojava.nbio.genome.parsers.twobit.TwoBitParser;
15+
16+
import com.google.common.collect.Range;
17+
18+
public class ChromosomeToProteinMapper {
19+
20+
private TwoBitParser parser;
21+
private String GENOME_URI;
22+
23+
/** Sets a path to the genome.2bit file.
24+
*
25+
* @param path The path to the file containing the genome in .2bit format.
26+
*/
27+
public void setGenomeURI(String path) {
28+
GENOME_URI=path;
29+
}
30+
31+
/**
32+
* Reads a genome from a locally stored .2bit file.
33+
*/
34+
public void readGenome() throws Exception {
35+
File f = new File(GENOME_URI);
36+
this.parser = new TwoBitParser(f);
37+
}
38+
39+
/** Sets a chromosome for TwoBitParser.
40+
*
41+
* @param chr The chromosome name (e.g. chr21)
42+
*/
43+
public void setChromosome(String chr) throws Exception {
44+
parser.close();
45+
String[] names = parser.getSequenceNames();
46+
for(int i=0;i<names.length;i++) {
47+
if ( names[i].equals(chr) ) {
48+
parser.setCurrentSequence(names[i]);
49+
break;
50+
}
51+
}
52+
}
53+
54+
/** Extracts the exons boundaries in CDS coordinates for genes living on the reverse DNA strand.
55+
*
56+
* @param exonStarts The list holding the genetic coordinates pointing to the start positions of the exons (including UTR regions)
57+
* @param exonEnds The list holding the genetic coordinates pointing to the end positions of the exons (including UTR regions)
58+
* @param cdsStart The start position of a coding region
59+
* @param cdsEnd The end position of a coding region
60+
*
61+
* @return the list of genetic positions corresponding to the exons boundaries in CDS coordinates
62+
*/
63+
public static List<Range<Integer>> getCDSRegionsReverse(List<Integer> exonStarts, List<Integer> exonEnds,
64+
int cdsStart, int cdsEnd) {
65+
66+
// remove exons that are fully landed in UTRs
67+
List<Integer> tmpS = new ArrayList<Integer>(exonStarts);
68+
List<Integer> tmpE = new ArrayList<Integer>(exonEnds);
69+
70+
int j=0;
71+
for (int i = 0; i < tmpS.size(); i++) {
72+
if ( ( tmpE.get(i) < cdsStart) || ( tmpS.get(i) > cdsEnd) ) {
73+
exonStarts.remove(j);
74+
exonEnds.remove(j);
75+
}
76+
else {
77+
j++;
78+
}
79+
}
80+
81+
// remove untranslated regions from exons
82+
int nExons = exonStarts.size();
83+
exonStarts.remove(0);
84+
exonStarts.add(0, cdsStart);
85+
exonEnds.remove(nExons-1);
86+
exonEnds.add(cdsEnd);
87+
88+
List<Range<Integer>> cdsRegion = new ArrayList<Range<Integer>>();
89+
for ( int i=0; i<nExons; i++ ) {
90+
Range<Integer> r = Range.closed(exonStarts.get(i), exonEnds.get(i));
91+
cdsRegion.add(r);
92+
}
93+
return cdsRegion;
94+
}
95+
96+
/** Extracts the exons boundaries in CDS coordinates for genes living on the forward DNA strand.
97+
*
98+
* @param exonStarts The list holding the genetic coordinates pointing to the start positions of the exons (including UTR regions)
99+
* @param exonEnds The list holding the genetic coordinates pointing to the end positions of the exons (including UTR regions)
100+
* @param cdsStart The start position of a coding region
101+
* @param cdsEnd The end position of a coding region
102+
*
103+
* @return the list of genetic positions corresponding to the exons boundaries in CDS coordinates
104+
*/
105+
public static List<Range<Integer>> getCDSRegionsForward(List<Integer> exonStarts, List<Integer> exonEnds,
106+
int cdsStart, int cdsEnd) {
107+
108+
// remove exons that are fully landed in UTRs
109+
List<Integer> tmpS = new ArrayList<Integer>(exonStarts);
110+
List<Integer> tmpE = new ArrayList<Integer>(exonEnds);
111+
112+
int j=0;
113+
for (int i = 0; i < tmpS.size(); i++) {
114+
if ( ( tmpE.get(i) < cdsStart) || ( tmpS.get(i) > cdsEnd) ) {
115+
exonStarts.remove(j);
116+
exonEnds.remove(j);
117+
}
118+
else {
119+
j++;
120+
}
121+
}
122+
123+
// remove untranslated regions from exons
124+
int nExons = exonStarts.size();
125+
exonStarts.remove(0);
126+
exonStarts.add(0, cdsStart);
127+
exonEnds.remove(nExons-1);
128+
exonEnds.add(cdsEnd);
129+
130+
List<Range<Integer>> cdsRegion = new ArrayList<Range<Integer>>();
131+
for ( int i=0; i<nExons; i++ ) {
132+
Range<Integer> r = Range.closed(exonStarts.get(i), exonEnds.get(i));
133+
cdsRegion.add(r);
134+
}
135+
return cdsRegion;
136+
}
137+
138+
/** Extracts the DNA sequence transcribed from the input genetic coordinates.
139+
*
140+
* @param gcp The container with chromosomal positions
141+
*
142+
* @return the DNA sequence transcribed from the input genetic coordinates
143+
*/
144+
public String getTranscriptSequence(GeneChromosomePosition gcp) throws IOException, CompoundNotFoundException {
145+
return getTranscriptSequence(gcp.getExonStarts(), gcp.getExonEnds(), gcp.getCdsStart(), gcp.getCdsEnd(), gcp.getOrientation());
146+
}
147+
148+
/** Extracts the DNA sequence transcribed from the input genetic coordinates.
149+
*
150+
* @param exonStarts The list holding the genetic coordinates pointing to the start positions of the exons (including UTR regions)
151+
* @param exonEnds The list holding the genetic coordinates pointing to the end positions of the exons (including UTR regions)
152+
* @param cdsStart The start position of a coding region
153+
* @param cdsEnd The end position of a coding region
154+
* @param orientation The orientation of the strand where the gene is living
155+
*
156+
* @return the DNA sequence transcribed from the input genetic coordinates
157+
*/
158+
public String getTranscriptSequence(List<Integer> exonStarts, List<Integer> exonEnds, int codingStart, int codingEnd, Character orientation) throws IOException, CompoundNotFoundException {
159+
160+
List<Range<Integer>> cdsRegion;
161+
if (orientation.equals("-")) {
162+
cdsRegion = getCDSRegionsReverse(exonStarts, exonEnds, codingStart, codingEnd);
163+
}
164+
else {
165+
cdsRegion = getCDSRegionsForward(exonStarts, exonEnds, codingStart, codingEnd);
166+
}
167+
168+
String transcription = "";
169+
for (Range<Integer> range : cdsRegion) {
170+
int length = range.upperEndpoint() - range.lowerEndpoint();
171+
transcription += parser.loadFragment(range.lowerEndpoint(), length);
172+
}
173+
if (orientation.equals("-")) {
174+
transcription = new StringBuilder(transcription).reverse().toString();
175+
DNASequence dna = new DNASequence(transcription);
176+
SequenceView<NucleotideCompound> compliment = dna.getComplement();
177+
transcription = compliment.getSequenceAsString();
178+
}
179+
return transcription;
180+
}
181+
182+
/** Converts the DNA sequence to protein sequence.
183+
*
184+
* @param sequence the DNA sequence
185+
*
186+
* @return the protein sequence
187+
*/
188+
public String convertDNAtoProteinSequence(String sequence) throws CompoundNotFoundException {
189+
DNASequence dna = new DNASequence(sequence);
190+
RNASequence mRNA = dna.getRNASequence();
191+
return mRNA.getProteinSequence().toString();
192+
}
193+
}

0 commit comments

Comments
 (0)