histogram.go 2.0 KB
Newer Older
Y
Your Name 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
// Package histogram provides a Go implementation of BigML's histogram package
// for Clojure/Java. It is currently experimental.
package histogram

import (
	"container/heap"
	"math"
	"sort"
)

type Bin struct {
	Count int
	Sum   float64
}

func (b *Bin) Update(x *Bin) {
	b.Count += x.Count
	b.Sum += x.Sum
}

func (b *Bin) Mean() float64 {
	return b.Sum / float64(b.Count)
}

type Bins []*Bin

func (bs Bins) Len() int           { return len(bs) }
func (bs Bins) Less(i, j int) bool { return bs[i].Mean() < bs[j].Mean() }
func (bs Bins) Swap(i, j int)      { bs[i], bs[j] = bs[j], bs[i] }

func (bs *Bins) Push(x interface{}) {
	*bs = append(*bs, x.(*Bin))
}

func (bs *Bins) Pop() interface{} {
	return bs.remove(len(*bs) - 1)
}

func (bs *Bins) remove(n int) *Bin {
	if n < 0 || len(*bs) < n {
		return nil
	}
	x := (*bs)[n]
	*bs = append((*bs)[:n], (*bs)[n+1:]...)
	return x
}

type Histogram struct {
	res *reservoir
}

func New(maxBins int) *Histogram {
	return &Histogram{res: newReservoir(maxBins)}
}

func (h *Histogram) Insert(f float64) {
	h.res.insert(&Bin{1, f})
	h.res.compress()
}

func (h *Histogram) Bins() Bins {
	return h.res.bins
}

type reservoir struct {
	n       int
	maxBins int
	bins    Bins
}

func newReservoir(maxBins int) *reservoir {
	return &reservoir{maxBins: maxBins}
}

func (r *reservoir) insert(bin *Bin) {
	r.n += bin.Count
	i := sort.Search(len(r.bins), func(i int) bool {
		return r.bins[i].Mean() >= bin.Mean()
	})
	if i < 0 || i == r.bins.Len() {
		// TODO(blake): Maybe use an .insert(i, bin) instead of
		// performing the extra work of a heap.Push.
		heap.Push(&r.bins, bin)
		return
	}
	r.bins[i].Update(bin)
}

func (r *reservoir) compress() {
	for r.bins.Len() > r.maxBins {
		minGapIndex := -1
		minGap := math.MaxFloat64
		for i := 0; i < r.bins.Len()-1; i++ {
			gap := gapWeight(r.bins[i], r.bins[i+1])
			if minGap > gap {
				minGap = gap
				minGapIndex = i
			}
		}
		prev := r.bins[minGapIndex]
		next := r.bins.remove(minGapIndex + 1)
		prev.Update(next)
	}
}

func gapWeight(prev, next *Bin) float64 {
	return next.Mean() - prev.Mean()
}