|
| 1 | +/* |
| 2 | + * BioJava development code |
| 3 | + * |
| 4 | + * This code may be freely distributed and modified under the |
| 5 | + * terms of the GNU Lesser General Public Licence. This should |
| 6 | + * be distributed with the code. If you do not have a copy, |
| 7 | + * see: |
| 8 | + * |
| 9 | + * http://www.gnu.org/copyleft/lesser.html |
| 10 | + * |
| 11 | + * Copyright for this code is held jointly by the individual |
| 12 | + * authors. These should be listed in @author doc comments. |
| 13 | + * |
| 14 | + * For more information on the BioJava project and its aims, |
| 15 | + * or to join the biojava-l mailing list, visit the home page |
| 16 | + * at: |
| 17 | + * |
| 18 | + * http://www.biojava.org/ |
| 19 | + * |
| 20 | + */ |
| 21 | +/* |
| 22 | + * To change this template, choose Tools | Templates |
| 23 | + * and open the template in the editor. |
| 24 | + */ |
| 25 | +package org.biojava.nbio.phylo; |
| 26 | + |
| 27 | +import org.forester.evoinference.matrix.distance.DistanceMatrix; |
| 28 | +import org.forester.phylogeny.Phylogeny; |
| 29 | +import org.forester.phylogeny.PhylogenyNode; |
| 30 | +import org.slf4j.Logger; |
| 31 | +import org.slf4j.LoggerFactory; |
| 32 | + |
| 33 | +import java.util.HashMap; |
| 34 | +import java.util.HashSet; |
| 35 | +import java.util.List; |
| 36 | +import java.util.Set; |
| 37 | + |
| 38 | +/** |
| 39 | + * Check the accuracy of a Distance Tree by least squares error (LSE) of the |
| 40 | + * Tree branch lengths and the original Distance Matrix. |
| 41 | + * |
| 42 | + * @author Scooter Willis |
| 43 | + * @author Aleix Lafita |
| 44 | + * |
| 45 | + */ |
| 46 | +public class DistanceTreeEvaluator { |
| 47 | + |
| 48 | + private static final Logger logger = LoggerFactory |
| 49 | + .getLogger(DistanceTreeEvaluator.class); |
| 50 | + |
| 51 | + /** Prevent instantiation */ |
| 52 | + private DistanceTreeEvaluator() { |
| 53 | + } |
| 54 | + |
| 55 | + /** |
| 56 | + * Evaluate the goodness of fit of a given tree to the original distance |
| 57 | + * matrix. The returned value is the coefficient of variation, i.e. the |
| 58 | + * square root of the LS error normalized by the mean. |
| 59 | + * <p> |
| 60 | + * This measure can also give an estimate of the quality of the distance |
| 61 | + * matrix, because a bad fit may mean that the distance is non-additive. |
| 62 | + * |
| 63 | + * @param tree |
| 64 | + * Phylogenetic Distance Tree to evaluate |
| 65 | + * @param matrix |
| 66 | + * Distance Matrix with the original distances |
| 67 | + * @return the square root of the average tree LS error normalized by the |
| 68 | + * average tree distance (coefficient of variation, CV). |
| 69 | + */ |
| 70 | + public static double evaluate(Phylogeny tree, DistanceMatrix matrix) { |
| 71 | + int numSequences = matrix.getSize(); |
| 72 | + List<PhylogenyNode> externalNodes = tree.getExternalNodes(); |
| 73 | + HashMap<String, PhylogenyNode> externalNodesHashMap = new HashMap<String, PhylogenyNode>(); |
| 74 | + Set<PhylogenyNode> path = new HashSet<PhylogenyNode>(); |
| 75 | + |
| 76 | + for (PhylogenyNode node : externalNodes) { |
| 77 | + externalNodesHashMap.put(node.getName(), node); |
| 78 | + } |
| 79 | + int count = 0; |
| 80 | + double averageMatrixDistance = 0.0; |
| 81 | + double averageTreeDistance = 0.0; |
| 82 | + double averageTreeErrorDistance = 0.0; |
| 83 | + for (int row = 0; row < numSequences - 1; row++) { |
| 84 | + String nodeName1 = matrix.getIdentifier(row); |
| 85 | + PhylogenyNode node1 = externalNodesHashMap.get(nodeName1); |
| 86 | + markPathToRoot(node1, path); |
| 87 | + for (int col = row + 1; col < numSequences; col++) { |
| 88 | + count++; |
| 89 | + String nodeName2 = matrix.getIdentifier(col); |
| 90 | + PhylogenyNode node2 = externalNodesHashMap.get(nodeName2); |
| 91 | + double distance = matrix.getValue(col, row); |
| 92 | + averageMatrixDistance = averageMatrixDistance + distance; |
| 93 | + PhylogenyNode commonParent = findCommonParent(node2, path); |
| 94 | + if (commonParent != null) { |
| 95 | + double treeDistance = getNodeDistance(commonParent, node1) |
| 96 | + + getNodeDistance(commonParent, node2); |
| 97 | + |
| 98 | + averageTreeDistance += treeDistance; |
| 99 | + averageTreeErrorDistance += (distance - treeDistance) |
| 100 | + * (distance - treeDistance); |
| 101 | + logger.info("{} {} Distance: {}Tree: {} difference: {}", |
| 102 | + nodeName1, nodeName2, distance, treeDistance, |
| 103 | + Math.abs(distance - treeDistance)); |
| 104 | + } else { |
| 105 | + logger.warn("Unable to find common parent with {} {}", |
| 106 | + node1, node2); |
| 107 | + } |
| 108 | + } |
| 109 | + path.clear(); |
| 110 | + } |
| 111 | + averageMatrixDistance /= count; |
| 112 | + averageTreeDistance /= count; |
| 113 | + averageTreeErrorDistance /= count; |
| 114 | + |
| 115 | + logger.info("Average matrix distance: {}", averageMatrixDistance); |
| 116 | + logger.info("Average tree distance: {}", averageTreeDistance); |
| 117 | + logger.info("Average LS error: {}", averageTreeErrorDistance); |
| 118 | + |
| 119 | + return Math.sqrt((averageTreeErrorDistance)) / (averageMatrixDistance); |
| 120 | + } |
| 121 | + |
| 122 | + private static double getNodeDistance(PhylogenyNode parentNode, |
| 123 | + PhylogenyNode childNode) { |
| 124 | + double distance = 0.0; |
| 125 | + while (childNode != parentNode) { |
| 126 | + distance = distance + childNode.getDistanceToParent(); |
| 127 | + childNode = childNode.getParent(); |
| 128 | + } |
| 129 | + |
| 130 | + return distance; |
| 131 | + } |
| 132 | + |
| 133 | + private static PhylogenyNode findCommonParent(PhylogenyNode node, |
| 134 | + Set<PhylogenyNode> path) { |
| 135 | + while (!path.contains(node)) { |
| 136 | + node = node.getParent(); |
| 137 | + } |
| 138 | + return node; |
| 139 | + } |
| 140 | + |
| 141 | + private static void markPathToRoot(PhylogenyNode node, |
| 142 | + Set<PhylogenyNode> path) { |
| 143 | + path.add(node); |
| 144 | + while (!node.isRoot()) { |
| 145 | + node = node.getParent(); |
| 146 | + path.add(node); |
| 147 | + } |
| 148 | + } |
| 149 | +} |
0 commit comments