//---------------------------------------------------------------------- // File: kd_split.cpp // Programmer: Sunil Arya and David Mount // Description: Methods for splitting kd-trees // Last modified: 01/04/05 (Version 1.0) //---------------------------------------------------------------------- // Copyright (c) 1997-2005 University of Maryland and Sunil Arya and // David Mount. All Rights Reserved. // // This software and related documentation is part of the Approximate // Nearest Neighbor Library (ANN). This software is provided under // the provisions of the Lesser GNU Public License (LGPL). See the // file ../ReadMe.txt for further information. // // The University of Maryland (U.M.) and the authors make no // representations about the suitability or fitness of this software for // any purpose. It is provided "as is" without express or implied // warranty. //---------------------------------------------------------------------- // History: // Revision 0.1 03/04/98 // Initial release // Revision 1.0 04/01/05 //---------------------------------------------------------------------- #include "kd_tree.hpp" // kd-tree definitions #include "kd_util.hpp" // kd-tree utilities #include "kd_split.hpp" // splitting functions //---------------------------------------------------------------------- // Constants //---------------------------------------------------------------------- const double ERR = 0.001; // a small value const double FS_ASPECT_RATIO = 3.0; // maximum allowed aspect ratio // in fair split. Must be >= 2. //---------------------------------------------------------------------- // kd_split - Bentley's standard splitting routine for kd-trees // Find the dimension of the greatest spread, and split // just before the median point along this dimension. //---------------------------------------------------------------------- void kd_split( ANNpointArray pa, // point array (permuted on return) ANNidxArray pidx, // point indices const ANNorthRect &bnds, // bounding rectangle for cell int n, // number of points int dim, // dimension of space int &cut_dim, // cutting dimension (returned) ANNcoord &cut_val, // cutting value (returned) int &n_lo) // num of points on low side (returned) { // find dimension of maximum spread cut_dim = annMaxSpread(pa, pidx, n, dim); n_lo = n/2; // median rank // split about median annMedianSplit(pa, pidx, n, cut_dim, cut_val, n_lo); } //---------------------------------------------------------------------- // midpt_split - midpoint splitting rule for box-decomposition trees // // This is the simplest splitting rule that guarantees boxes // of bounded aspect ratio. It simply cuts the box with the // longest side through its midpoint. If there are ties, it // selects the dimension with the maximum point spread. // // WARNING: This routine (while simple) doesn't seem to work // well in practice in high dimensions, because it tends to // generate a large number of trivial and/or unbalanced splits. // Either kd_split(), sl_midpt_split(), or fair_split() are // recommended, instead. //---------------------------------------------------------------------- void midpt_split( ANNpointArray pa, // point array ANNidxArray pidx, // point indices (permuted on return) const ANNorthRect &bnds, // bounding rectangle for cell int n, // number of points int dim, // dimension of space int &cut_dim, // cutting dimension (returned) ANNcoord &cut_val, // cutting value (returned) int &n_lo) // num of points on low side (returned) { int d; ANNcoord max_length = bnds.hi[0] - bnds.lo[0]; for (d = 1; d < dim; d++) { // find length of longest box side ANNcoord length = bnds.hi[d] - bnds.lo[d]; if (length > max_length) { max_length = length; } } ANNcoord max_spread = -1; // find long side with most spread for (d = 0; d < dim; d++) { // is it among longest? if (double(bnds.hi[d] - bnds.lo[d]) >= (1-ERR)*max_length) { // compute its spread ANNcoord spr = annSpread(pa, pidx, n, d); if (spr > max_spread) { // is it max so far? max_spread = spr; cut_dim = d; } } } // split along cut_dim at midpoint cut_val = (bnds.lo[cut_dim] + bnds.hi[cut_dim]) / 2; // permute points accordingly int br1, br2; annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2); //------------------------------------------------------------------ // On return: pa[0..br1-1] < cut_val // pa[br1..br2-1] == cut_val // pa[br2..n-1] > cut_val // // We can set n_lo to any value in the range [br1..br2]. // We choose split so that points are most evenly divided. //------------------------------------------------------------------ if (br1 > n/2) n_lo = br1; else if (br2 < n/2) n_lo = br2; else n_lo = n/2; } //---------------------------------------------------------------------- // sl_midpt_split - sliding midpoint splitting rule // // This is a modification of midpt_split, which has the nonsensical // name "sliding midpoint". The idea is that we try to use the // midpoint rule, by bisecting the longest side. If there are // ties, the dimension with the maximum spread is selected. If, // however, the midpoint split produces a trivial split (no points // on one side of the splitting plane) then we slide the splitting // (maintaining its orientation) until it produces a nontrivial // split. For example, if the splitting plane is along the x-axis, // and all the data points have x-coordinate less than the x-bisector, // then the split is taken along the maximum x-coordinate of the // data points. // // Intuitively, this rule cannot generate trivial splits, and // hence avoids midpt_split's tendency to produce trees with // a very large number of nodes. // //---------------------------------------------------------------------- void sl_midpt_split( ANNpointArray pa, // point array ANNidxArray pidx, // point indices (permuted on return) const ANNorthRect &bnds, // bounding rectangle for cell int n, // number of points int dim, // dimension of space int &cut_dim, // cutting dimension (returned) ANNcoord &cut_val, // cutting value (returned) int &n_lo) // num of points on low side (returned) { int d; ANNcoord max_length = bnds.hi[0] - bnds.lo[0]; for (d = 1; d < dim; d++) { // find length of longest box side ANNcoord length = bnds.hi[d] - bnds.lo[d]; if (length > max_length) { max_length = length; } } ANNcoord max_spread = -1; // find long side with most spread for (d = 0; d < dim; d++) { // is it among longest? if ((bnds.hi[d] - bnds.lo[d]) >= (1-ERR)*max_length) { // compute its spread ANNcoord spr = annSpread(pa, pidx, n, d); if (spr > max_spread) { // is it max so far? max_spread = spr; cut_dim = d; } } } // ideal split at midpoint ANNcoord ideal_cut_val = (bnds.lo[cut_dim] + bnds.hi[cut_dim])/2; ANNcoord min, max; annMinMax(pa, pidx, n, cut_dim, min, max); // find min/max coordinates if (ideal_cut_val < min) // slide to min or max as needed cut_val = min; else if (ideal_cut_val > max) cut_val = max; else cut_val = ideal_cut_val; // permute points accordingly int br1, br2; annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2); //------------------------------------------------------------------ // On return: pa[0..br1-1] < cut_val // pa[br1..br2-1] == cut_val // pa[br2..n-1] > cut_val // // We can set n_lo to any value in the range [br1..br2] to satisfy // the exit conditions of the procedure. // // if ideal_cut_val < min (implying br2 >= 1), // then we select n_lo = 1 (so there is one point on left) and // if ideal_cut_val > max (implying br1 <= n-1), // then we select n_lo = n-1 (so there is one point on right). // Otherwise, we select n_lo as close to n/2 as possible within // [br1..br2]. //------------------------------------------------------------------ if (ideal_cut_val < min) n_lo = 1; else if (ideal_cut_val > max) n_lo = n-1; else if (br1 > n/2) n_lo = br1; else if (br2 < n/2) n_lo = br2; else n_lo = n/2; } //---------------------------------------------------------------------- // fair_split - fair-split splitting rule // // This is a compromise between the kd-tree splitting rule (which // always splits data points at their median) and the midpoint // splitting rule (which always splits a box through its center. // The goal of this procedure is to achieve both nicely balanced // splits, and boxes of bounded aspect ratio. // // A constant FS_ASPECT_RATIO is defined. Given a box, those sides // which can be split so that the ratio of the longest to shortest // side does not exceed ASPECT_RATIO are identified. Among these // sides, we select the one in which the points have the largest // spread. We then split the points in a manner which most evenly // distributes the points on either side of the splitting plane, // subject to maintaining the bound on the ratio of long to short // sides. To determine that the aspect ratio will be preserved, // we determine the longest side (other than this side), and // determine how narrowly we can cut this side, without causing the // aspect ratio bound to be exceeded (small_piece). // // This procedure is more robust than either kd_split or midpt_split, // but is more complicated as well. When point distribution is // extremely skewed, this degenerates to midpt_split (actually // 1/3 point split), and when the points are most evenly distributed, // this degenerates to kd-split. //---------------------------------------------------------------------- void fair_split( ANNpointArray pa, // point array ANNidxArray pidx, // point indices (permuted on return) const ANNorthRect &bnds, // bounding rectangle for cell int n, // number of points int dim, // dimension of space int &cut_dim, // cutting dimension (returned) ANNcoord &cut_val, // cutting value (returned) int &n_lo) // num of points on low side (returned) { int d; ANNcoord max_length = bnds.hi[0] - bnds.lo[0]; cut_dim = 0; for (d = 1; d < dim; d++) { // find length of longest box side ANNcoord length = bnds.hi[d] - bnds.lo[d]; if (length > max_length) { max_length = length; cut_dim = d; } } ANNcoord max_spread = 0; // find legal cut with max spread cut_dim = 0; for (d = 0; d < dim; d++) { ANNcoord length = bnds.hi[d] - bnds.lo[d]; // is this side midpoint splitable // without violating aspect ratio? if (((double) max_length)*2.0/((double) length) <= FS_ASPECT_RATIO) { // compute spread along this dim ANNcoord spr = annSpread(pa, pidx, n, d); if (spr > max_spread) { // best spread so far max_spread = spr; cut_dim = d; // this is dimension to cut } } } max_length = 0; // find longest side other than cut_dim for (d = 0; d < dim; d++) { ANNcoord length = bnds.hi[d] - bnds.lo[d]; if (d != cut_dim && length > max_length) max_length = length; } // consider most extreme splits ANNcoord small_piece = max_length / FS_ASPECT_RATIO; ANNcoord lo_cut = bnds.lo[cut_dim] + small_piece;// lowest legal cut ANNcoord hi_cut = bnds.hi[cut_dim] - small_piece;// highest legal cut int br1, br2; // is median below lo_cut ? if (annSplitBalance(pa, pidx, n, cut_dim, lo_cut) >= 0) { cut_val = lo_cut; // cut at lo_cut annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2); n_lo = br1; } // is median above hi_cut? else if (annSplitBalance(pa, pidx, n, cut_dim, hi_cut) <= 0) { cut_val = hi_cut; // cut at hi_cut annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2); n_lo = br2; } else { // median cut preserves asp ratio n_lo = n/2; // split about median annMedianSplit(pa, pidx, n, cut_dim, cut_val, n_lo); } } //---------------------------------------------------------------------- // sl_fair_split - sliding fair split splitting rule // // Sliding fair split is a splitting rule that combines the // strengths of both fair split with sliding midpoint split. // Fair split tends to produce balanced splits when the points // are roughly uniformly distributed, but it can produce many // trivial splits when points are highly clustered. Sliding // midpoint never produces trivial splits, and shrinks boxes // nicely if points are highly clustered, but it may produce // rather unbalanced splits when points are unclustered but not // quite uniform. // // Sliding fair split is based on the theory that there are two // types of splits that are "good": balanced splits that produce // fat boxes, and unbalanced splits provided the cell with fewer // points is fat. // // This splitting rule operates by first computing the longest // side of the current bounding box. Then it asks which sides // could be split (at the midpoint) and still satisfy the aspect // ratio bound with respect to this side. Among these, it selects // the side with the largest spread (as fair split would). It // then considers the most extreme cuts that would be allowed by // the aspect ratio bound. This is done by dividing the longest // side of the box by the aspect ratio bound. If the median cut // lies between these extreme cuts, then we use the median cut. // If not, then consider the extreme cut that is closer to the // median. If all the points lie to one side of this cut, then // we slide the cut until it hits the first point. This may // violate the aspect ratio bound, but will never generate empty // cells. However the sibling of every such skinny cell is fat, // and hence packing arguments still apply. // //---------------------------------------------------------------------- void sl_fair_split( ANNpointArray pa, // point array ANNidxArray pidx, // point indices (permuted on return) const ANNorthRect &bnds, // bounding rectangle for cell int n, // number of points int dim, // dimension of space int &cut_dim, // cutting dimension (returned) ANNcoord &cut_val, // cutting value (returned) int &n_lo) // num of points on low side (returned) { int d; ANNcoord min, max; // min/max coordinates int br1, br2; // split break points ANNcoord max_length = bnds.hi[0] - bnds.lo[0]; cut_dim = 0; for (d = 1; d < dim; d++) { // find length of longest box side ANNcoord length = bnds.hi[d] - bnds.lo[d]; if (length > max_length) { max_length = length; cut_dim = d; } } ANNcoord max_spread = 0; // find legal cut with max spread cut_dim = 0; for (d = 0; d < dim; d++) { ANNcoord length = bnds.hi[d] - bnds.lo[d]; // is this side midpoint splitable // without violating aspect ratio? if (((double) max_length)*2.0/((double) length) <= FS_ASPECT_RATIO) { // compute spread along this dim ANNcoord spr = annSpread(pa, pidx, n, d); if (spr > max_spread) { // best spread so far max_spread = spr; cut_dim = d; // this is dimension to cut } } } max_length = 0; // find longest side other than cut_dim for (d = 0; d < dim; d++) { ANNcoord length = bnds.hi[d] - bnds.lo[d]; if (d != cut_dim && length > max_length) max_length = length; } // consider most extreme splits ANNcoord small_piece = max_length / FS_ASPECT_RATIO; ANNcoord lo_cut = bnds.lo[cut_dim] + small_piece;// lowest legal cut ANNcoord hi_cut = bnds.hi[cut_dim] - small_piece;// highest legal cut // find min and max along cut_dim annMinMax(pa, pidx, n, cut_dim, min, max); // is median below lo_cut? if (annSplitBalance(pa, pidx, n, cut_dim, lo_cut) >= 0) { if (max > lo_cut) { // are any points above lo_cut? cut_val = lo_cut; // cut at lo_cut annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2); n_lo = br1; // balance if there are ties } else { // all points below lo_cut cut_val = max; // cut at max value annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2); n_lo = n-1; } } // is median above hi_cut? else if (annSplitBalance(pa, pidx, n, cut_dim, hi_cut) <= 0) { if (min < hi_cut) { // are any points below hi_cut? cut_val = hi_cut; // cut at hi_cut annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2); n_lo = br2; // balance if there are ties } else { // all points above hi_cut cut_val = min; // cut at min value annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2); n_lo = 1; } } else { // median cut is good enough n_lo = n/2; // split about median annMedianSplit(pa, pidx, n, cut_dim, cut_val, n_lo); } }