提交 ddb2b347 编写于 作者: C Chiwan Park 提交者: Till Rohrmann

[FLINK-1933] [ml] Adds distance metrics and documentation to machine learning library

[ml] [docs] Add a overview documentation for Distance Metrics

[FLINK-1933] [ml] Moves metrics.distances package to org.apache.flink.ml

This closes #629.
上级 2f08d9d5
---
title: Distance Metrics
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
* This will be replaced by the TOC
{:toc}
## Description
Different metrics of distance are convenient for different types of analysis. Flink Machine
Learning Library provides built-in implementations for many standard distance metric. You can also
use other distance metric by implementing `DistanceMetric` trait.
## Built-in Implementations
* Euclidean Distance
* Squared Euclidean Distance
* Cosine Distance
* Tanimoto Distance
* Chebyshev Distance
* Manhattan Distance
* Minkowski Distance
## Custom Implementation
You can use your own distance metric by implementing `DistanceMetric` trait.
{% highlight scala %}
class MyDistance extends DistanceMetric {
override def distance(a: Vector, b: Vector) = ... // your implementation for distance metric
}
object MyDistance {
def apply() = new MyDistance()
}
val metric = MyDistance()
{% endhighlight %}
......@@ -36,4 +36,8 @@ under the License.
* [Communication efficient distributed dual coordinate ascent (CoCoA)](cocoa.html)
* [Multiple linear regression](multiple_linear_regression.html)
* [Polynomial Base Feature Mapper](polynomial_base_feature_mapper.html)
* [Standard Scaler](standard_scaler.html)
\ No newline at end of file
* [Standard Scaler](standard_scaler.html)
## Metrics
* [Distance Metrics](distance_metrics.html)
......@@ -80,6 +80,29 @@ case class DenseVector(val data: Array[Double]) extends Vector with Serializable
data(index) = value
}
/** Returns the dot product of the recipient and the argument
*
* @param other a Vector
* @return a scalar double of dot product
*/
override def dot(other: Vector): Double = {
require(size == other.size, "The size of vector must be equal.")
other match {
case SparseVector(_, otherIndices, otherData) =>
otherIndices.zipWithIndex.map {
case (idx, sparseIdx) => data(idx) * otherData(sparseIdx)
}.sum
case _ => (0 until size).map(i => data(i) * other(i)).sum
}
}
/** Magnitude of a vector
*
* @return
*/
override def magnitude: Double = math.sqrt(data.map(x => x * x).sum)
def toSparseVector: SparseVector = {
val nonZero = (0 until size).zip(data).filter(_._2 != 0)
......
......@@ -23,7 +23,7 @@ import scala.util.Sorting
/** Sparse vector implementation storing the data in two arrays. One index contains the sorted
* indices of the non-zero vector entries and the other the corresponding vector entries
*/
class SparseVector(
case class SparseVector(
val size: Int,
val indices: Array[Int],
val data: Array[Double])
......@@ -53,6 +53,42 @@ class SparseVector(
new SparseVector(size, indices.clone, data.clone)
}
/** Returns the dot product of the recipient and the argument
*
* @param other a Vector
* @return a scalar double of dot product
*/
override def dot(other: Vector): Double = {
require(size == other.size, "The size of vector must be equal.")
other match {
case DenseVector(otherData) =>
indices.zipWithIndex.map { case (sparseIdx, idx) => data(idx) * otherData(sparseIdx) }.sum
case SparseVector(_, otherIndices, otherData) =>
var left = 0
var right = 0
var result = 0.0
while (left < indices.length && right < otherIndices.length) {
if (indices(left) < otherIndices(right)) {
left += 1
} else if (otherIndices(right) < indices(left)) {
right += 1
} else {
result += data(left) * otherData(right)
left += 1
right += 1
}
}
result
}
}
/** Magnitude of a vector
*
* @return
*/
override def magnitude: Double = math.sqrt(data.map(x => x * x).sum)
/** Element wise access function
*
* * @param index index of the accessed element
......
......@@ -49,6 +49,19 @@ trait Vector {
*/
def copy: Vector
/** Returns the dot product of the recipient and the argument
*
* @param other a Vector
* @return a scalar double of dot product
*/
def dot(other: Vector): Double
/** Magnitude of a vector
*
* @return
*/
def magnitude: Double
def equalsVector(vector: Vector): Boolean = {
if(size == vector.size) {
(0 until size) forall { idx =>
......
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.ml.metrics.distances
import org.apache.flink.ml.math.Vector
/** This class implements a Chebyshev distance metric. The class calculates the distance between
* the given vectors by finding the maximum difference between each coordinate.
*
* @see http://en.wikipedia.org/wiki/Chebyshev_distance
*/
class ChebyshevDistanceMetric extends DistanceMetric {
override def distance(a: Vector, b: Vector): Double = {
checkValidArguments(a, b)
(0 until a.size).map(i => math.abs(a(i) - b(i))).max
}
}
object ChebyshevDistanceMetric {
def apply() = new ChebyshevDistanceMetric()
}
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.ml.metrics.distances
import org.apache.flink.ml.math.Vector
/** This class implements a cosine distance metric. The class calculates the distance between
* the given vectors by dividing the dot product of two vectors by the product of their lengths.
* We convert the result of division to a usable distance. So, 1 - cos(angle) is actually returned.
*
* @see http://en.wikipedia.org/wiki/Cosine_similarity
*/
class CosineDistanceMetric extends DistanceMetric {
override def distance(a: Vector, b: Vector): Double = {
checkValidArguments(a, b)
val dotProd: Double = a.dot(b)
val denominator: Double = a.magnitude * b.magnitude
if (dotProd == 0 && denominator == 0) {
0
} else {
1 - dotProd / denominator
}
}
}
object CosineDistanceMetric {
def apply() = new CosineDistanceMetric()
}
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.ml.metrics.distances
import org.apache.flink.ml.math.Vector
/** DistanceMeasure interface is used for object which determines distance between two points.
*/
trait DistanceMetric {
/** Returns the distance between the arguments.
*
* @param a a Vector defining a multi-dimensional point in some space
* @param b a Vector defining a multi-dimensional point in some space
* @return a scalar double of the distance
*/
def distance(a: Vector, b: Vector): Double
protected def checkValidArguments(a: Vector, b: Vector) = {
require(a.size == b.size, "The each size of vectors must be same to calculate distance.")
}
}
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.ml.metrics.distances
import org.apache.flink.ml.math.Vector
/** This class implements a Euclidean distance metric. The metric calculates the distance between
* the given two vectors by summing the square root of the squared differences between
* each coordinate.
*
* http://en.wikipedia.org/wiki/Euclidean_distance
*
* If you don't care about the true distance and only need for comparison,
* [[SquaredEuclideanDistanceMetric]] will be faster because it doesn't calculate the actual
* square root of the distances.
*
* @see http://en.wikipedia.org/wiki/Euclidean_distance
*/
class EuclideanDistanceMetric extends SquaredEuclideanDistanceMetric {
override def distance(a: Vector, b: Vector): Double = math.sqrt(super.distance(a, b))
}
object EuclideanDistanceMetric {
def apply() = new EuclideanDistanceMetric()
}
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.ml.metrics.distances
import org.apache.flink.ml.math.Vector
/** This class implements a Manhattan distance metric. The class calculates the distance between
* the given vectors by summing the differences between each coordinate.
*
* @see http://en.wikipedia.org/wiki/Taxicab_geometry
*/
class ManhattanDistanceMetric extends DistanceMetric{
override def distance(a: Vector, b: Vector): Double = {
checkValidArguments(a, b)
(0 until a.size).map(i => math.abs(a(i) - b(i))).sum
}
}
object ManhattanDistanceMetric {
def apply() = new ManhattanDistanceMetric()
}
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.ml.metrics.distances
import org.apache.flink.ml.math.Vector
/** This class implements a Minkowski distance metric. The metric is a generalization of
* L(p) distances: Euclidean distance and Manhattan distance. If you need for a special case of
* p = 1 or p = 2, use [[ManhattanDistanceMetric]], [[EuclideanDistanceMetric]]. This class is
* useful for high exponents.
*
* @param p the norm exponent of space
*
* @see http://en.wikipedia.org/wiki/Minkowski_distance
*/
class MinkowskiDistanceMetric(val p: Double) extends DistanceMetric {
override def distance(a: Vector, b: Vector): Double = {
checkValidArguments(a, b)
math.pow((0 until a.size).map(i => math.pow(math.abs(a(i) - b(i)), p)).sum, 1 / p)
}
}
object MinkowskiDistanceMetric {
def apply(p: Double) = new MinkowskiDistanceMetric(p)
}
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.ml.metrics.distances
import org.apache.flink.ml.math.Vector
/** This class is like [[EuclideanDistanceMetric]] but it does not take the square root.
*
* The value calculated by this class is not exact Euclidean distance, but it saves on computation
* when you need the value for only comparison.
*/
class SquaredEuclideanDistanceMetric extends DistanceMetric {
override def distance(a: Vector, b: Vector): Double = {
checkValidArguments(a, b)
(0 until a.size).map(i => math.pow(a(i) - b(i), 2)).sum
}
}
object SquaredEuclideanDistanceMetric {
def apply() = new SquaredEuclideanDistanceMetric()
}
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.ml.metrics.distances
import org.apache.flink.ml.math.Vector
/** This class implements a Tanimoto distance metric. The class calculates the distance between
* the given vectors. The vectors are assumed as bit-wise vectors. We convert the result of
* division to a usable distance. So, 1 - similarity is actually returned.
*
* @see http://en.wikipedia.org/wiki/Jaccard_index
*/
class TanimotoDistanceMetric extends DistanceMetric {
override def distance(a: Vector, b: Vector): Double = {
checkValidArguments(a, b)
val dotProd: Double = a.dot(b)
1 - dotProd / (a.magnitude * a.magnitude + b.magnitude * b.magnitude - dotProd)
}
}
object TanimotoDistanceMetric {
def apply() = new TanimotoDistanceMetric()
}
......@@ -47,4 +47,40 @@ class DenseVectorSuite extends FlatSpec with Matchers {
vector(size)
}
}
it should "calculate dot product with DenseVector" in {
val vec1 = DenseVector(Array(1, 0, 1))
val vec2 = DenseVector(Array(0, 1, 0))
vec1.dot(vec2) should be(0)
}
it should "calculate dot product with SparseVector" in {
val vec1 = DenseVector(Array(1, 0, 1))
val vec2 = SparseVector.fromCOO(3, (0, 1), (1, 1))
vec1.dot(vec2) should be(1)
}
it should "calculate dot product with SparseVector 2" in {
val vec1 = DenseVector(Array(1, 0, 1, 0, 0))
val vec2 = SparseVector.fromCOO(5, (2, 1), (4, 1))
vec1.dot(vec2) should be(1)
}
it should "fail in case of calculation dot product with different size vector" in {
val vec1 = DenseVector(Array(1, 0))
val vec2 = DenseVector(Array(0))
intercept[IllegalArgumentException] {
vec1.dot(vec2)
}
}
it should "calculate magnitude of vector" in {
val vec = DenseVector(Array(1, 4, 8))
vec.magnitude should be(9)
}
}
......@@ -97,4 +97,40 @@ class SparseVectorSuite extends FlatSpec with Matchers {
sparseVector should not equal(copy)
}
it should "calculate dot product with SparseVector" in {
val vec1 = SparseVector.fromCOO(4, (0, 1), (2, 1))
val vec2 = SparseVector.fromCOO(4, (1, 1), (3, 1))
vec1.dot(vec2) should be(0)
}
it should "calculate dot product with SparseVector 2" in {
val vec1 = SparseVector.fromCOO(5, (2, 3), (4, 1))
val vec2 = SparseVector.fromCOO(5, (4, 2), (2, 1))
vec1.dot(vec2) should be(5)
}
it should "calculate dot product with DenseVector" in {
val vec1 = SparseVector.fromCOO(4, (0, 1), (2, 1))
val vec2 = DenseVector(Array(0, 1, 0, 1))
vec1.dot(vec2) should be(0)
}
it should "fail in case of calculation dot product with different size vector" in {
val vec1 = SparseVector.fromCOO(4, (0, 1), (2, 1))
val vec2 = DenseVector(Array(0, 1, 0))
intercept[IllegalArgumentException] {
vec1.dot(vec2)
}
}
it should "calculate magnitude of vector" in {
val vec = SparseVector.fromCOO(3, (0, 1), (1, 4), (2, 8))
vec.magnitude should be(9)
}
}
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.ml.metrics.distances
import org.apache.flink.ml.math.DenseVector
import org.scalatest.{Matchers, FlatSpec}
class DistanceMetricSuite extends FlatSpec with Matchers {
val EPSILON = 1e-8
behavior of "Distance Measures"
it should "calculate Euclidean distance correctly" in {
val vec1 = DenseVector(1, 9)
val vec2 = DenseVector(5, 6)
EuclideanDistanceMetric().distance(vec1, vec2) should be(5)
}
it should "calculate square value of Euclidean distance correctly" in {
val vec1 = DenseVector(1, 9)
val vec2 = DenseVector(5, 6)
SquaredEuclideanDistanceMetric().distance(vec1, vec2) should be(25)
}
it should "calculate Chebyshev distance correctly" in {
val vec1 = DenseVector(0, 3, 6)
val vec2 = DenseVector(0, 0, 0)
ChebyshevDistanceMetric().distance(vec1, vec2) should be(6)
}
it should "calculate Cosine distance correctly" in {
val vec1 = DenseVector(1, 0)
val vec2 = DenseVector(2, 2)
CosineDistanceMetric().distance(vec1, vec2) should be((1 - math.sqrt(2) / 2) +- EPSILON)
}
it should "calculate Manhattan distance correctly" in {
val vec1 = DenseVector(0, 0, 0, 1, 1, 1)
val vec2 = DenseVector(1, 1, 1, 0, 0, 0)
ManhattanDistanceMetric().distance(vec1, vec2) should be(6)
}
it should "calculate Minkowski distance correctly" in {
val vec1 = DenseVector(0, 0, 1, 1, 0)
val vec2 = DenseVector(1, 1, 0, 1, 2)
MinkowskiDistanceMetric(3).distance(vec1, vec2) should be(math.pow(11, 1.0 / 3) +- EPSILON)
}
it should "calculate Tanimoto distance correctly" in {
val vec1 = DenseVector(0, 1, 1)
val vec2 = DenseVector(1, 1, 0)
TanimotoDistanceMetric().distance(vec1, vec2) should be(1 - (1.0 / (2 + 2 - 1)) +- EPSILON)
}
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册