While working on a Hadoop project, I found myself needing a lightweight Java class that computes box plot statistics (e.g. quartiles and outliers), so I wrote the class appended below. The code does not display the plots; it only computes the necessary values, since I’m planning on displaying the plots with matplotlib:
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * Computes the numbers needed to draw a box plot for a sample of values:
 * the three quartiles, the two whisker ends and the outliers beyond them.
 *
 * <p>Whisker limits are placed 1.5 interquartile ranges below the first
 * quartile and above the third quartile. The whisker ends are the most extreme
 * sample values that still lie inside those limits; everything outside is
 * reported as an outlier. This class only computes the values; it does not
 * draw anything.
 */
public class BoxPlotStats {

    /** Multiplier applied to the interquartile range to obtain the whisker limits. */
    private static final double WHISKER_IQR_FACTOR = 1.5;

    /** 25th percentile of the sample. */
    double first_quartile;
    /** 50th percentile of the sample. */
    double median;
    /** 75th percentile of the sample. */
    double third_quartile;
    /** Largest sample value that does not exceed the upper whisker limit. */
    Double upper_max_within;
    /** Smallest sample value that is not below the lower whisker limit. */
    Double lower_max_within;
    /** Sample values above the upper whisker limit, in ascending order. */
    ArrayList<Double> upper_outliers;
    /** Sample values below the lower whisker limit, in ascending order. */
    ArrayList<Double> lower_outliers;

    /**
     * Returns the requested percentile of an already sorted list, linearly
     * interpolating between the two neighbouring elements when the rank
     * {@code (size - 1) * percent} is not a whole number.
     *
     * <p>Adapted from
     * http://code.activestate.com/recipes/511478-finding-the-percentile-of-the-values/
     *
     * @param values  the sample, sorted in ascending order; must not be empty
     * @param percent the percentile as a fraction in {@code [0, 1]}
     *                (e.g. {@code 0.25} for the first quartile)
     * @return the interpolated percentile value
     * @throws IllegalArgumentException if {@code values} is null or empty, or
     *                                  {@code percent} is outside {@code [0, 1]}
     */
    public static double percentile(List<Double> values, double percent) {
        if (values == null || values.isEmpty()) {
            throw new IllegalArgumentException("values must contain at least one element");
        }
        // Written as a negated conjunction so that NaN is rejected as well.
        if (!(percent >= 0.0 && percent <= 1.0)) {
            throw new IllegalArgumentException("percent must be within [0, 1] but was " + percent);
        }

        double k = (values.size() - 1) * percent;
        double f = Math.floor(k);
        double c = Math.ceil(k);
        if (f == c) {
            // The rank falls exactly on an element; no interpolation needed.
            return values.get((int) k);
        }
        // Weight each neighbour by its distance to the opposite neighbour.
        double d0 = values.get((int) f) * (c - k);
        double d1 = values.get((int) c) * (k - f);
        return d0 + d1;
    }

    /** Prints every statistic on its own labelled line, framed by blank lines. */
    public void display() {
        System.out.println();
        System.out.println("Lower outliers: " + lower_outliers);
        System.out.println("Lower whisker: " + lower_max_within);
        System.out.println("First quartile: " + first_quartile);
        System.out.println("Median: " + median);
        System.out.println("Third quartile: " + third_quartile);
        System.out.println("Upper whisker: " + upper_max_within);
        System.out.println("Upper outliers: " + upper_outliers);
        System.out.println();
    }

    /**
     * Prints all statistics as a single comma-separated line in the order
     * lower outliers, lower whisker, first quartile, median, third quartile,
     * upper whisker, upper outliers. The outlier lists are {@code |}-separated
     * and empty when there are no outliers.
     */
    public void displayOneLiner() {
        String lo = joinWithPipe(lower_outliers);
        String uo = joinWithPipe(upper_outliers);
        System.out.println(lo + "," + lower_max_within + "," + first_quartile + "," + median
                + "," + third_quartile + "," + upper_max_within + "," + uo);
    }

    /** Joins the values with {@code |}; an empty list yields an empty string. */
    private static String joinWithPipe(List<Double> values) {
        StringBuilder joined = new StringBuilder();
        for (Double value : values) {
            if (joined.length() > 0) {
                joined.append('|');
            }
            joined.append(value);
        }
        return joined.toString();
    }

    /**
     * Computes the box plot statistics for the given sample.
     *
     * <p>The sample is copied before it is sorted, so the caller's list is
     * left untouched.
     *
     * @param values the sample, in any order; must not be empty
     * @throws IllegalArgumentException if {@code values} is null or empty
     */
    BoxPlotStats(List<Double> values) {
        if (values == null || values.isEmpty()) {
            throw new IllegalArgumentException("values must contain at least one element");
        }

        // Work on a private sorted copy instead of reordering the caller's list.
        List<Double> sorted = new ArrayList<Double>(values);
        Collections.sort(sorted);

        first_quartile = percentile(sorted, 0.25);
        median = percentile(sorted, 0.5);
        third_quartile = percentile(sorted, 0.75);

        double range_limits = WHISKER_IQR_FACTOR * (third_quartile - first_quartile);
        double limit_upper_whisker = third_quartile + range_limits;
        double limit_lower_whisker = first_quartile - range_limits;

        upper_outliers = new ArrayList<Double>();
        lower_outliers = new ArrayList<Double>();

        // Seed each whisker with the opposite extreme so that any value inside
        // the limits can only move it outwards.
        upper_max_within = sorted.get(0);
        lower_max_within = sorted.get(sorted.size() - 1);

        for (Double value : sorted) {
            if (value > limit_upper_whisker) {
                upper_outliers.add(value);
            }
            if (value < limit_lower_whisker) {
                lower_outliers.add(value);
            }
            if (value > upper_max_within && value <= limit_upper_whisker) {
                upper_max_within = value;
            }
            if (value < lower_max_within && value >= limit_lower_whisker) {
                lower_max_within = value;
            }
        }
    }
}