diff --git a/docs/libs/ml/distance_metrics.md b/docs/libs/ml/distance_metrics.md new file mode 100644 index 0000000000000000000000000000000000000000..9d87fa181943b58d74b10ac3d34e5be37cc625bb --- /dev/null +++ b/docs/libs/ml/distance_metrics.md @@ -0,0 +1,56 @@ +--- +title: Distance Metrics +--- + + +* This will be replaced by the TOC +{:toc} + +## Description + +Different metrics of distance are convenient for different types of analysis. Flink Machine +Learning Library provides built-in implementations for many standard distance metric. You can also +use other distance metric by implementing `DistanceMetric` trait. + +## Built-in Implementations + +* Euclidean Distance +* Squared Euclidean Distance +* Cosine Distance +* Tanimoto Distance +* Chebyshev Distance +* Manhattan Distance +* Minkowski Distance + +## Custom Implementation + +You can use your own distance metric by implementing `DistanceMetric` trait. + +{% highlight scala %} +class MyDistance extends DistanceMetric { + override def distance(a: Vector, b: Vector) = ... // your implementation for distance metric +} + +object MyDistance { + def apply() = new MyDistance() +} + +val metric = MyDistance() +{% endhighlight %} diff --git a/docs/libs/ml/index.md b/docs/libs/ml/index.md index 9753e68a59f542228d32813398912569c2254f55..0754045a681600c33a133c803784473455c78f23 100644 --- a/docs/libs/ml/index.md +++ b/docs/libs/ml/index.md @@ -36,4 +36,8 @@ under the License. * [Communication efficient distributed dual coordinate ascent (CoCoA)](cocoa.html) * [Multiple linear regression](multiple_linear_regression.html) * [Polynomial Base Feature Mapper](polynomial_base_feature_mapper.html) -* [Standard Scaler](standard_scaler.html) \ No newline at end of file +* [Standard Scaler](standard_scaler.html) + +## Metrics + +* [Distance Metrics](distance_metrics.html) diff --git a/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/math/DenseVector.scala b/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/math/DenseVector.scala index e5c618714b2b7edd9c9878b1494624c287f28422..851a283143fd4a411b04609c3ff3577abb53c737 100644 --- a/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/math/DenseVector.scala +++ b/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/math/DenseVector.scala @@ -80,6 +80,29 @@ case class DenseVector(val data: Array[Double]) extends Vector with Serializable data(index) = value } + /** Returns the dot product of the recipient and the argument + * + * @param other a Vector + * @return a scalar double of dot product + */ + override def dot(other: Vector): Double = { + require(size == other.size, "The size of vector must be equal.") + + other match { + case SparseVector(_, otherIndices, otherData) => + otherIndices.zipWithIndex.map { + case (idx, sparseIdx) => data(idx) * otherData(sparseIdx) + }.sum + case _ => (0 until size).map(i => data(i) * other(i)).sum + } + } + + /** Magnitude of a vector + * + * @return + */ + override def magnitude: Double = math.sqrt(data.map(x => x * x).sum) + def toSparseVector: SparseVector = { val nonZero = (0 until size).zip(data).filter(_._2 != 0) diff --git a/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/math/SparseVector.scala b/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/math/SparseVector.scala index 6cf4c6318bd8f14079883f3414e65ac9f97e3285..0762efb01bc3c4e7545dd34d5f476e2581a0e33a 100644 --- a/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/math/SparseVector.scala +++ b/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/math/SparseVector.scala @@ -23,7 +23,7 @@ import scala.util.Sorting /** Sparse vector implementation storing the data in two arrays. One index contains the sorted * indices of the non-zero vector entries and the other the corresponding vector entries */ -class SparseVector( +case class SparseVector( val size: Int, val indices: Array[Int], val data: Array[Double]) @@ -53,6 +53,42 @@ class SparseVector( new SparseVector(size, indices.clone, data.clone) } + /** Returns the dot product of the recipient and the argument + * + * @param other a Vector + * @return a scalar double of dot product + */ + override def dot(other: Vector): Double = { + require(size == other.size, "The size of vector must be equal.") + other match { + case DenseVector(otherData) => + indices.zipWithIndex.map { case (sparseIdx, idx) => data(idx) * otherData(sparseIdx) }.sum + case SparseVector(_, otherIndices, otherData) => + var left = 0 + var right = 0 + var result = 0.0 + + while (left < indices.length && right < otherIndices.length) { + if (indices(left) < otherIndices(right)) { + left += 1 + } else if (otherIndices(right) < indices(left)) { + right += 1 + } else { + result += data(left) * otherData(right) + left += 1 + right += 1 + } + } + result + } + } + + /** Magnitude of a vector + * + * @return + */ + override def magnitude: Double = math.sqrt(data.map(x => x * x).sum) + /** Element wise access function * * * @param index index of the accessed element diff --git a/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/math/Vector.scala b/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/math/Vector.scala index ef6b7aa8f65bda3b35aff2d282af90fdbd607cea..83e0c65aa9d054fc7b1360244001de8d1a7aa428 100644 --- a/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/math/Vector.scala +++ b/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/math/Vector.scala @@ -49,6 +49,19 @@ trait Vector { */ def copy: Vector + /** Returns the dot product of the recipient and the argument + * + * @param other a Vector + * @return a scalar double of dot product + */ + def dot(other: Vector): Double + + /** Magnitude of a vector + * + * @return + */ + def magnitude: Double + def equalsVector(vector: Vector): Boolean = { if(size == vector.size) { (0 until size) forall { idx => diff --git a/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/metrics/distances/ChebyshevDistanceMetric.scala b/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/metrics/distances/ChebyshevDistanceMetric.scala new file mode 100644 index 0000000000000000000000000000000000000000..055ede3d0df12b65bf3229079c70c08a5b4fcfc3 --- /dev/null +++ b/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/metrics/distances/ChebyshevDistanceMetric.scala @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.metrics.distances + +import org.apache.flink.ml.math.Vector + +/** This class implements a Chebyshev distance metric. The class calculates the distance between + * the given vectors by finding the maximum difference between each coordinate. + * + * @see http://en.wikipedia.org/wiki/Chebyshev_distance + */ +class ChebyshevDistanceMetric extends DistanceMetric { + override def distance(a: Vector, b: Vector): Double = { + checkValidArguments(a, b) + (0 until a.size).map(i => math.abs(a(i) - b(i))).max + } +} + +object ChebyshevDistanceMetric { + def apply() = new ChebyshevDistanceMetric() +} diff --git a/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/metrics/distances/CosineDistanceMetric.scala b/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/metrics/distances/CosineDistanceMetric.scala new file mode 100644 index 0000000000000000000000000000000000000000..f32ea2634be0b45e2cf794269bce3ccf6ac5ee5b --- /dev/null +++ b/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/metrics/distances/CosineDistanceMetric.scala @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.metrics.distances + +import org.apache.flink.ml.math.Vector + +/** This class implements a cosine distance metric. The class calculates the distance between + * the given vectors by dividing the dot product of two vectors by the product of their lengths. + * We convert the result of division to a usable distance. So, 1 - cos(angle) is actually returned. + * + * @see http://en.wikipedia.org/wiki/Cosine_similarity + */ +class CosineDistanceMetric extends DistanceMetric { + override def distance(a: Vector, b: Vector): Double = { + checkValidArguments(a, b) + + val dotProd: Double = a.dot(b) + val denominator: Double = a.magnitude * b.magnitude + if (dotProd == 0 && denominator == 0) { + 0 + } else { + 1 - dotProd / denominator + } + } +} + +object CosineDistanceMetric { + def apply() = new CosineDistanceMetric() +} diff --git a/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/metrics/distances/DistanceMetric.scala b/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/metrics/distances/DistanceMetric.scala new file mode 100644 index 0000000000000000000000000000000000000000..1297ffb520a57cbbf14d85ad12d8e2df33adb694 --- /dev/null +++ b/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/metrics/distances/DistanceMetric.scala @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.metrics.distances + +import org.apache.flink.ml.math.Vector + +/** DistanceMeasure interface is used for object which determines distance between two points. + */ +trait DistanceMetric { + /** Returns the distance between the arguments. + * + * @param a a Vector defining a multi-dimensional point in some space + * @param b a Vector defining a multi-dimensional point in some space + * @return a scalar double of the distance + */ + def distance(a: Vector, b: Vector): Double + + protected def checkValidArguments(a: Vector, b: Vector) = { + require(a.size == b.size, "The each size of vectors must be same to calculate distance.") + } +} diff --git a/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/metrics/distances/EuclideanDistanceMetric.scala b/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/metrics/distances/EuclideanDistanceMetric.scala new file mode 100644 index 0000000000000000000000000000000000000000..153fb93a8746f4be405c157b5cbe5dd85b37abd8 --- /dev/null +++ b/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/metrics/distances/EuclideanDistanceMetric.scala @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.metrics.distances + +import org.apache.flink.ml.math.Vector + +/** This class implements a Euclidean distance metric. The metric calculates the distance between + * the given two vectors by summing the square root of the squared differences between + * each coordinate. + * + * http://en.wikipedia.org/wiki/Euclidean_distance + * + * If you don't care about the true distance and only need for comparison, + * [[SquaredEuclideanDistanceMetric]] will be faster because it doesn't calculate the actual + * square root of the distances. + * + * @see http://en.wikipedia.org/wiki/Euclidean_distance + */ +class EuclideanDistanceMetric extends SquaredEuclideanDistanceMetric { + override def distance(a: Vector, b: Vector): Double = math.sqrt(super.distance(a, b)) +} + +object EuclideanDistanceMetric { + def apply() = new EuclideanDistanceMetric() +} diff --git a/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/metrics/distances/ManhattanDistanceMetric.scala b/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/metrics/distances/ManhattanDistanceMetric.scala new file mode 100644 index 0000000000000000000000000000000000000000..5983f79e3774778abf90fae149ae1bc4c8a472de --- /dev/null +++ b/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/metrics/distances/ManhattanDistanceMetric.scala @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.metrics.distances + +import org.apache.flink.ml.math.Vector + +/** This class implements a Manhattan distance metric. The class calculates the distance between + * the given vectors by summing the differences between each coordinate. + * + * @see http://en.wikipedia.org/wiki/Taxicab_geometry + */ +class ManhattanDistanceMetric extends DistanceMetric{ + override def distance(a: Vector, b: Vector): Double = { + checkValidArguments(a, b) + (0 until a.size).map(i => math.abs(a(i) - b(i))).sum + } +} + +object ManhattanDistanceMetric { + def apply() = new ManhattanDistanceMetric() +} diff --git a/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/metrics/distances/MinkowskiDistanceMetric.scala b/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/metrics/distances/MinkowskiDistanceMetric.scala new file mode 100644 index 0000000000000000000000000000000000000000..50161d46a3c505d9b97f2ac1240f54e081f58d54 --- /dev/null +++ b/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/metrics/distances/MinkowskiDistanceMetric.scala @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.metrics.distances + +import org.apache.flink.ml.math.Vector + +/** This class implements a Minkowski distance metric. The metric is a generalization of + * L(p) distances: Euclidean distance and Manhattan distance. If you need for a special case of + * p = 1 or p = 2, use [[ManhattanDistanceMetric]], [[EuclideanDistanceMetric]]. This class is + * useful for high exponents. + * + * @param p the norm exponent of space + * + * @see http://en.wikipedia.org/wiki/Minkowski_distance + */ +class MinkowskiDistanceMetric(val p: Double) extends DistanceMetric { + override def distance(a: Vector, b: Vector): Double = { + checkValidArguments(a, b) + math.pow((0 until a.size).map(i => math.pow(math.abs(a(i) - b(i)), p)).sum, 1 / p) + } +} + +object MinkowskiDistanceMetric { + def apply(p: Double) = new MinkowskiDistanceMetric(p) +} diff --git a/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/metrics/distances/SquaredEuclideanDistanceMetric.scala b/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/metrics/distances/SquaredEuclideanDistanceMetric.scala new file mode 100644 index 0000000000000000000000000000000000000000..fe546e98eac1bdd31793606fb9467b697b015d23 --- /dev/null +++ b/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/metrics/distances/SquaredEuclideanDistanceMetric.scala @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.metrics.distances + +import org.apache.flink.ml.math.Vector + +/** This class is like [[EuclideanDistanceMetric]] but it does not take the square root. + * + * The value calculated by this class is not exact Euclidean distance, but it saves on computation + * when you need the value for only comparison. + */ +class SquaredEuclideanDistanceMetric extends DistanceMetric { + override def distance(a: Vector, b: Vector): Double = { + checkValidArguments(a, b) + (0 until a.size).map(i => math.pow(a(i) - b(i), 2)).sum + } +} + +object SquaredEuclideanDistanceMetric { + def apply() = new SquaredEuclideanDistanceMetric() +} diff --git a/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/metrics/distances/TanimotoDistanceMetric.scala b/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/metrics/distances/TanimotoDistanceMetric.scala new file mode 100644 index 0000000000000000000000000000000000000000..5141c98a8f9a859312e1d078d8f022ad03a8119b --- /dev/null +++ b/flink-staging/flink-ml/src/main/scala/org/apache/flink/ml/metrics/distances/TanimotoDistanceMetric.scala @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.metrics.distances + +import org.apache.flink.ml.math.Vector + +/** This class implements a Tanimoto distance metric. The class calculates the distance between + * the given vectors. The vectors are assumed as bit-wise vectors. We convert the result of + * division to a usable distance. So, 1 - similarity is actually returned. + * + * @see http://en.wikipedia.org/wiki/Jaccard_index + */ +class TanimotoDistanceMetric extends DistanceMetric { + override def distance(a: Vector, b: Vector): Double = { + checkValidArguments(a, b) + + val dotProd: Double = a.dot(b) + 1 - dotProd / (a.magnitude * a.magnitude + b.magnitude * b.magnitude - dotProd) + } +} + +object TanimotoDistanceMetric { + def apply() = new TanimotoDistanceMetric() +} diff --git a/flink-staging/flink-ml/src/test/scala/org/apache/flink/ml/math/DenseVectorSuite.scala b/flink-staging/flink-ml/src/test/scala/org/apache/flink/ml/math/DenseVectorSuite.scala index 553f672d0d6fd5fc4df7cb3ba244e090517e394a..4146b8d9c12bdb9127f8cc33b992e0c09cbd531a 100644 --- a/flink-staging/flink-ml/src/test/scala/org/apache/flink/ml/math/DenseVectorSuite.scala +++ b/flink-staging/flink-ml/src/test/scala/org/apache/flink/ml/math/DenseVectorSuite.scala @@ -47,4 +47,40 @@ class DenseVectorSuite extends FlatSpec with Matchers { vector(size) } } + + it should "calculate dot product with DenseVector" in { + val vec1 = DenseVector(Array(1, 0, 1)) + val vec2 = DenseVector(Array(0, 1, 0)) + + vec1.dot(vec2) should be(0) + } + + it should "calculate dot product with SparseVector" in { + val vec1 = DenseVector(Array(1, 0, 1)) + val vec2 = SparseVector.fromCOO(3, (0, 1), (1, 1)) + + vec1.dot(vec2) should be(1) + } + + it should "calculate dot product with SparseVector 2" in { + val vec1 = DenseVector(Array(1, 0, 1, 0, 0)) + val vec2 = SparseVector.fromCOO(5, (2, 1), (4, 1)) + + vec1.dot(vec2) should be(1) + } + + it should "fail in case of calculation dot product with different size vector" in { + val vec1 = DenseVector(Array(1, 0)) + val vec2 = DenseVector(Array(0)) + + intercept[IllegalArgumentException] { + vec1.dot(vec2) + } + } + + it should "calculate magnitude of vector" in { + val vec = DenseVector(Array(1, 4, 8)) + + vec.magnitude should be(9) + } } diff --git a/flink-staging/flink-ml/src/test/scala/org/apache/flink/ml/math/SparseVectorSuite.scala b/flink-staging/flink-ml/src/test/scala/org/apache/flink/ml/math/SparseVectorSuite.scala index cde141c0862ecbafe0bc8388da4014ffea06a7fb..474fb233c43d7f34c8965735300a6d32dd9ba968 100644 --- a/flink-staging/flink-ml/src/test/scala/org/apache/flink/ml/math/SparseVectorSuite.scala +++ b/flink-staging/flink-ml/src/test/scala/org/apache/flink/ml/math/SparseVectorSuite.scala @@ -97,4 +97,40 @@ class SparseVectorSuite extends FlatSpec with Matchers { sparseVector should not equal(copy) } + + it should "calculate dot product with SparseVector" in { + val vec1 = SparseVector.fromCOO(4, (0, 1), (2, 1)) + val vec2 = SparseVector.fromCOO(4, (1, 1), (3, 1)) + + vec1.dot(vec2) should be(0) + } + + it should "calculate dot product with SparseVector 2" in { + val vec1 = SparseVector.fromCOO(5, (2, 3), (4, 1)) + val vec2 = SparseVector.fromCOO(5, (4, 2), (2, 1)) + + vec1.dot(vec2) should be(5) + } + + it should "calculate dot product with DenseVector" in { + val vec1 = SparseVector.fromCOO(4, (0, 1), (2, 1)) + val vec2 = DenseVector(Array(0, 1, 0, 1)) + + vec1.dot(vec2) should be(0) + } + + it should "fail in case of calculation dot product with different size vector" in { + val vec1 = SparseVector.fromCOO(4, (0, 1), (2, 1)) + val vec2 = DenseVector(Array(0, 1, 0)) + + intercept[IllegalArgumentException] { + vec1.dot(vec2) + } + } + + it should "calculate magnitude of vector" in { + val vec = SparseVector.fromCOO(3, (0, 1), (1, 4), (2, 8)) + + vec.magnitude should be(9) + } } diff --git a/flink-staging/flink-ml/src/test/scala/org/apache/flink/ml/metrics/distances/DistanceMetricSuite.scala b/flink-staging/flink-ml/src/test/scala/org/apache/flink/ml/metrics/distances/DistanceMetricSuite.scala new file mode 100644 index 0000000000000000000000000000000000000000..569a1a8b085199a805f2925a8b1638465a2d54a1 --- /dev/null +++ b/flink-staging/flink-ml/src/test/scala/org/apache/flink/ml/metrics/distances/DistanceMetricSuite.scala @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.metrics.distances + +import org.apache.flink.ml.math.DenseVector +import org.scalatest.{Matchers, FlatSpec} + +class DistanceMetricSuite extends FlatSpec with Matchers { + val EPSILON = 1e-8 + + behavior of "Distance Measures" + + it should "calculate Euclidean distance correctly" in { + val vec1 = DenseVector(1, 9) + val vec2 = DenseVector(5, 6) + + EuclideanDistanceMetric().distance(vec1, vec2) should be(5) + } + + it should "calculate square value of Euclidean distance correctly" in { + val vec1 = DenseVector(1, 9) + val vec2 = DenseVector(5, 6) + + SquaredEuclideanDistanceMetric().distance(vec1, vec2) should be(25) + } + + it should "calculate Chebyshev distance correctly" in { + val vec1 = DenseVector(0, 3, 6) + val vec2 = DenseVector(0, 0, 0) + + ChebyshevDistanceMetric().distance(vec1, vec2) should be(6) + } + + it should "calculate Cosine distance correctly" in { + val vec1 = DenseVector(1, 0) + val vec2 = DenseVector(2, 2) + + CosineDistanceMetric().distance(vec1, vec2) should be((1 - math.sqrt(2) / 2) +- EPSILON) + } + + it should "calculate Manhattan distance correctly" in { + val vec1 = DenseVector(0, 0, 0, 1, 1, 1) + val vec2 = DenseVector(1, 1, 1, 0, 0, 0) + + ManhattanDistanceMetric().distance(vec1, vec2) should be(6) + } + + it should "calculate Minkowski distance correctly" in { + val vec1 = DenseVector(0, 0, 1, 1, 0) + val vec2 = DenseVector(1, 1, 0, 1, 2) + + MinkowskiDistanceMetric(3).distance(vec1, vec2) should be(math.pow(11, 1.0 / 3) +- EPSILON) + } + + it should "calculate Tanimoto distance correctly" in { + val vec1 = DenseVector(0, 1, 1) + val vec2 = DenseVector(1, 1, 0) + + TanimotoDistanceMetric().distance(vec1, vec2) should be(1 - (1.0 / (2 + 2 - 1)) +- EPSILON) + } +}