1- package org .biojava .nbio .structure .secstruc ;
1+ package org .biojava .nbio .structure .basepairs ;
22
33import org .biojava .nbio .structure .*;
44import org .biojava .nbio .structure .geometry .SuperPosition ;
2121 * This module calculates the el Hassan-Calladine Base Pairing and Base-pair Step Parameters
2222 * Citation: https://www.ncbi.nlm.nih.gov/pubmed/11601858
2323 *
24- * Created by luke on 7/20/17.
24+ * The method that might be overridden is findPairs(), this implementation is used for a large-scale
25+ * analysis of the most proper helical regions in almost 4000 protein-DNA structures, almost
26+ * 2000 structures containing only DNA, or almost 1300 structures containing only RNA. (as of 7/2017).
27+ * Those who study tertiary structures for RNA folding would be better using their own method,
28+ * because this is only looking for base pairs between separate strands.
29+ *
30+ * Created by luke czapla on 7/20/17.
2531 */
2632public class BasePairParameters {
2733
2834 private static Logger log = LoggerFactory .getLogger (BasePairParameters .class );
29-
35+
36+ // See URL http://ndbserver.rutgers.edu/ndbmodule/archives/reports/tsukuba/Table1.html
37+ // and the paper cited at the top of this class (also as Table 1).
38+ // These are hard-coded to avoid problems with resource paths.
3039 private static String [] standardBases = new String [] {
3140 "SEQRES 1 A 1 A\n " +
3241 "ATOM 2 N9 A A 1 -1.291 4.498 0.000\n " +
@@ -68,17 +77,18 @@ public class BasePairParameters {
6877 "END"
6978 };
7079
71- private static String [] baseListDNA = {"A" , "G" , "T" , "C" };
72- private static String [] baseListRNA = {"A" , "G" , "U" , "C" };
80+ // this is also hard-coded data about standard WC base pairs for both DNA and RNA
81+ //private static String[] baseListDNA = {"A", "G", "T", "C"};
82+ //private static String[] baseListRNA = {"A", "G", "U", "C"};
7383 private static Map <String , Integer > map ;
7484 private static Map <Integer , List <String >> ringMap ;
7585 static {
7686 map = new HashMap <>();
7787 map .put ("DA" , 0 ); map .put ("ADE" , 0 ); map .put ("A" , 0 );
7888 map .put ("DG" , 1 ); map .put ("GUA" , 1 ); map .put ("G" , 1 );
79- map .put ("DT" , 2 ); map .put ("THY" , 2 ); map .put ("T" , 2 ); //RNA lines: map.put("U", 2); map.put("URA", 2);
89+ map .put ("DT" , 2 ); map .put ("THY" , 2 ); map .put ("T" , 2 ); map .put ("U" , 2 ); map .put ("URA" , 2 );
8090 map .put ("DC" , 3 ); map .put ("CYT" , 3 ); map .put ("C" , 3 );
81- // chemically modified bases, leaving out right now.
91+ // chemically modified bases, leaving out to ignore (to treat as gaps) right now.
8292 //map.put("DZM", 0);
8393 //map.put("UCL", 2);
8494 //map.put("2DT", 2);
@@ -91,70 +101,108 @@ public class BasePairParameters {
91101 }
92102
93103 private Structure structure ;
94- private String pairSequence = "" , pdbId = "" ;
95-
104+ private boolean useRNA = false ;
96105 private double [] pairParameters ;
106+
107+ // this is the main data that you want to get back out from the procedure.
108+ private String pairSequence = "" ;
97109 private double [][] pairingParameters ;
98110 private double [][] stepParameters ;
99111
100112
101- // Either pass the name of a PDB file (ending in .pdb) or pass the pdbId
102- public BasePairParameters (String name ) {
103- PDBFileReader pdbFileReader = new PDBFileReader ();
104- pdbFileReader .setPath ("." );
105- try {
106- if (name .contains (".pdb" )) structure = pdbFileReader .getStructure (name );
107- else structure = StructureIO .getStructure (name );
108- pdbId = name .replace (".pdb" , "" );
109- List <Chain > nucleics = this .getNucleicChains (false );
110- List <Group []> pairs = this .findPairs (nucleics );
111- pairingParameters = new double [pairs .size ()][6 ];
112- stepParameters = new double [pairs .size ()][6 ];
113- Matrix4d lastStep = null ;
114- Matrix4d currentStep = null ;
115- for (int i = 0 ; i < pairs .size (); i ++) {
116- lastStep = currentStep ;
117- currentStep = this .basePairReferenceFrame (pairs .get (i ));
118- for (int j = 0 ; j < 6 ; j ++) pairingParameters [i ][j ] = pairParameters [j ];
119- if (i != 0 ) {
120- lastStep .invert ();
121- lastStep .mul (currentStep );
122- double [] sparms = calculatetp (lastStep );
123- for (int j = 0 ; j < 6 ; j ++) stepParameters [i ][j ] = sparms [j ];
124- }
125- ; }
126- } catch (IOException |StructureException e ) {
127- log .info ("Error reading file from local drive or internet" );
128- structure = null ;
113+ /**
114+ * Constructor takes a Structure object, finds base pair and base-pair step parameters
115+ * for double-helical regions within the structure.
116+ * @param structure The already-loaded structure to analyze.
117+ * @param useRNA whether to look for canonical RNA pairs. By default (false) it analyzes DNA.
118+ * @param removeDups whether to only look for base-pair parameters for each unique sequence in
119+ * the structure (if set to <i>true</i>)
120+ */
121+ public BasePairParameters (Structure structure , boolean useRNA , boolean removeDups ) {
122+ this .structure = structure ;
123+ this .useRNA = useRNA ;
124+ if (structure == null ) {
125+ pairingParameters = null ;
126+ stepParameters = null ;
127+ return ;
129128 }
129+ List <Chain > nucleics = this .getNucleicChains (removeDups );
130+ List <Group []> pairs = this .findPairs (nucleics );
131+ pairingParameters = new double [pairs .size ()][6 ];
132+ stepParameters = new double [pairs .size ()][6 ];
133+ Matrix4d lastStep = null ;
134+ Matrix4d currentStep = null ;
135+ for (int i = 0 ; i < pairs .size (); i ++) {
136+ lastStep = currentStep ;
137+ currentStep = this .basePairReferenceFrame (pairs .get (i ));
138+ for (int j = 0 ; j < 6 ; j ++) pairingParameters [i ][j ] = pairParameters [j ];
139+ if (i != 0 ) {
140+ lastStep .invert ();
141+ lastStep .mul (currentStep );
142+ double [] sparms = calculatetp (lastStep );
143+ for (int j = 0 ; j < 6 ; j ++) stepParameters [i ][j ] = sparms [j ];
144+ }
145+ ; }
146+
130147 }
131148
149+
150+ /**
151+ * Constructor takes a Structure object, finds base pair and base-pair step parameters
152+ * for double-helical regions within the structure for only canonical DNA pairs.
153+ * @param structure The already-loaded structure to analyze.
154+ */
155+ public BasePairParameters (Structure structure ) {
156+ this (structure , false , false );
157+ }
158+
159+ /**
160+ * This reports all the pair parameters, in the order of:
161+ * buckle, propeller, opening (in degrees), shear, stagger, stretch (in Å).
162+ * @return A double[][] with length equal to number of base pairs for rows, and 6 columns
163+ */
132164 public double [][] getPairingParameters () {
133165 return pairingParameters ;
134166 }
135167
168+ /**
169+ * This reports all the base-pair step parameters, in the order of:
170+ * tilt, roll, twist (in degrees), shift, slide, rise (in Å).
171+ * @return A double[][] with length equal to number of base pairs (the first row 0 has no step
172+ * and therefore is six zeroes), and 6 columns.
173+ */
136174 public double [][] getStepParameters () {
137175 return stepParameters ;
138176 }
139177
178+ /**
179+ * This returns the primary strand's sequence where parameters were found.
180+ * There are spaces in the string anywhere there was a break in the helix or when
181+ * it goes from one helix to another helix in the structure. (the "step" is still returned!)
182+ * @return String of primary sequence with spaces between gaps and new helices.
183+ */
140184 public String getPairSequence () {
141185 return pairSequence ;
142186 }
143187
188+ /**
189+ * This reports all the nucleic acid chains and has an option to remove duplicates if you
190+ * are considering an analyze of only unique DNA or RNA helices in the Structure.
191+ * @param removeDups If true, it will ignore duplicate chains
192+ * @return A list of all the nucleic acid chains in order of the Structure
193+ */
144194 public List <Chain > getNucleicChains (boolean removeDups ) {
145195 if (structure == null ) return new ArrayList <>();
146196 List <Chain > chains = structure .getChains ();
147197 List <Chain > result = new ArrayList <>();
148198 for (Chain c : chains ) {
149- //EntityInfo ei = c.getEntityInfo();
150199 if (c .isNucleicAcid ()) {
151200 result .add (c );
152201 }
153- //result.add(c);
154202 }
155203 if (removeDups ) for (int i = 0 ; i < result .size (); i ++) {
156204 for (int j = i +2 ; j < result .size (); j ++) {
157- // remove double
205+ // remove duplicate sequences (structures with two or more identical units)
158206 if (result .get (i ).getSeqResSequence ().equals (result .get (j ).getSeqResSequence ())) {
159207 result .remove (j );
160208 }
@@ -164,12 +212,19 @@ public List<Chain> getNucleicChains(boolean removeDups) {
164212 }
165213
166214
215+ /**
216+ * This performs a search for base pairs in the structure. The criteria is alignment of
217+ * sequences and the canonical base pairs of DNA and RNA.
218+ * @param chains The list of chains already found to be nucleic acids
219+ * @return The list of corresponding Watson-Crick groups as pairs, element 0 is on the
220+ * forward strand and element 1 is on the reverse strand.
221+ */
167222 public List <Group []> findPairs (List <Chain > chains ) {
168223 List <Group []> result = new ArrayList <>();
169224 for (int i = 0 ; i < chains .size (); i ++) {
170225 Chain c = chains .get (i );
171226 for (int j = i +1 ; j < chains .size (); j ++) {
172- String complement = complement (chains .get (j ).getSeqResSequence (), false );
227+ String complement = complement (chains .get (j ).getSeqResSequence (), useRNA );
173228 String match = longestCommonSubstring (c .getSeqResSequence (), complement );
174229 //log.info(c.getSeqResSequence() + " " + chains.get(j).getSeqResSequence() + " " + match);
175230 int index1 = c .getSeqResSequence ().indexOf (match );
@@ -187,12 +242,12 @@ public List<Group[]> findPairs(List<Chain> chains) {
187242 Atom a2 = g2 .getAtom (ringMap .get (type2 ).get (0 ));
188243
189244 if (a1 == null ) {
190- log .info ("Error processing " + g1 .getPDBName () + " in " + pdbId );
245+ log .info ("Error processing " + g1 .getPDBName ());
191246 if (pairSequence .length () != 0 && pairSequence .charAt (pairSequence .length ()-1 ) != ' ' ) pairSequence += ' ' ;
192247 continue ;
193248 }
194249 if (a2 == null ) {
195- log .info ("Error processing " + g2 .getPDBName () + " in " + pdbId );
250+ log .info ("Error processing " + g2 .getPDBName ());
196251 if (pairSequence .length () != 0 && pairSequence .charAt (pairSequence .length ()-1 ) != ' ' ) pairSequence += ' ' ;
197252 continue ;
198253 }
@@ -203,7 +258,7 @@ public List<Group[]> findPairs(List<Chain> chains) {
203258 double distance = Math .sqrt (dx *dx +dy *dy +dz *dz );
204259 //log.info("C8-C6 Distance (Å): " + distance);
205260 // could be a base pair
206- if (Math .abs (distance -10.0 ) < 2.0 ) {
261+ if (Math .abs (distance -10.0 ) < 2.5 ) {
207262 boolean valid = true ;
208263 for (String atomname : ringMap .get (type1 )) {
209264 Atom a = g1 .getAtom (atomname );
@@ -232,7 +287,6 @@ public List<Group[]> findPairs(List<Chain> chains) {
232287 }
233288
234289
235-
236290 public Matrix4d basePairReferenceFrame (Group [] pair ) {
237291 Integer type1 = map .get (pair [0 ].getPDBName ());
238292 Integer type2 = map .get (pair [1 ].getPDBName ());
0 commit comments