From 852d0ab92b41b4a8b85a2d134ddffc8dfd8b608a Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 25 Feb 2022 09:48:23 +0000 Subject: [PATCH] dtw metric for tts, test=doc --- paddleaudio/CHANGELOG.md | 1 + paddleaudio/paddleaudio/metric/__init__.py | 2 + paddleaudio/paddleaudio/metric/dtw.py | 42 +++++++++++++++++++ paddleaudio/paddleaudio/metric/mcd.py | 47 ++++++++++++++++++++++ paddleaudio/setup.py | 2 + 5 files changed, 94 insertions(+) create mode 100644 paddleaudio/paddleaudio/metric/dtw.py create mode 100644 paddleaudio/paddleaudio/metric/mcd.py diff --git a/paddleaudio/CHANGELOG.md b/paddleaudio/CHANGELOG.md index 52d44dd3..91b0fef0 100644 --- a/paddleaudio/CHANGELOG.md +++ b/paddleaudio/CHANGELOG.md @@ -2,3 +2,4 @@ Date: 2022-2-25, Author: Hui Zhang. - Refactor architecture. + - dtw distance and mcd style dtw diff --git a/paddleaudio/paddleaudio/metric/__init__.py b/paddleaudio/paddleaudio/metric/__init__.py index 97043fd7..a96530ff 100644 --- a/paddleaudio/paddleaudio/metric/__init__.py +++ b/paddleaudio/paddleaudio/metric/__init__.py @@ -11,3 +11,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from .dtw import dtw_distance +from .mcd import mcd_distance diff --git a/paddleaudio/paddleaudio/metric/dtw.py b/paddleaudio/paddleaudio/metric/dtw.py new file mode 100644 index 00000000..d27f56e2 --- /dev/null +++ b/paddleaudio/paddleaudio/metric/dtw.py @@ -0,0 +1,42 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
import numpy as np
from dtaidistance import dtw_ndim

__all__ = ['dtw_distance']


def dtw_distance(xs: np.ndarray, ys: np.ndarray) -> float:
    """Compute the Dynamic Time Warping (DTW) distance between two sequences.

    Thin wrapper that forwards both sequences to
    ``dtaidistance.dtw_ndim.distance``. Only ``xs`` and ``ys`` are passed,
    so every other option of that routine stays at the library default.

    Recurrence, as documented for the underlying routine (it keeps a compact
    matrix rather than the full warping-paths matrix):

        wps[i, j] = (s1[i]-s2[j])**2 + min(
                        wps[i-1, j  ] + penalty,  // vertical / insertion / expansion
                        wps[i  , j-1] + penalty,  // horizontal / deletion / compression
                        wps[i-1, j-1])            // diagonal / match
        dtw = sqrt(wps[-1, -1])

    Args:
        xs (np.ndarray): reference sequence, [T, D].
        ys (np.ndarray): hypothesis sequence, [T, D].

    Returns:
        float: DTW distance between ``xs`` and ``ys``.
    """
    distance = dtw_ndim.distance(xs, ys)
    return distance
import mcd.metrics_fast as mt
import numpy as np
from mcd import dtw

__all__ = [
    'mcd_distance',
]


def mcd_distance(xs: np.ndarray, ys: np.ndarray,
                 cost_fn=mt.logSpecDbDist) -> float:
    """Mel cepstral distortion (MCD) style DTW distance.

    Aligns ``xs`` and ``ys`` with ``mcd.dtw.dtw`` using ``cost_fn`` as the
    frame-to-frame cost, and returns the minimum cost it reports. The
    alignment path that ``dtw.dtw`` also returns is discarded.

    Dynamic programming recurrence:

        wps[i, j] = cost_fn(xs[i], ys[j]) + min(
                        wps[i-1, j  ],  // vertical / insertion / expansion
                        wps[i  , j-1],  // horizontal / deletion / compression
                        wps[i-1, j-1])  // diagonal / match
        min_cost = wps[-1, -1]

    The value is returned exactly as produced by ``dtw.dtw``: this function
    applies no square root and no normalisation by sequence length.

    Default cost function (``mcd.metrics_fast.logSpecDbDist``):

        logSpecDbConst = 10.0 / math.log(10.0) * math.sqrt(2.0)
        def logSpecDbDist(x, y):
            diff = x - y
            return logSpecDbConst * math.sqrt(np.inner(diff, diff))

    Args:
        xs (np.ndarray): reference sequence, [T, D].
        ys (np.ndarray): hypothesis sequence, [T, D].
        cost_fn (Callable): cost between one frame of ``xs`` and one frame
            of ``ys``, called as ``cost_fn(x, y)``. Defaults to
            ``mt.logSpecDbDist``.

    Returns:
        float: minimum accumulated alignment cost.
    """
    # numpy must be imported at module level: the ``np.ndarray`` annotations
    # in the signature are evaluated when this ``def`` statement executes.
    # Only the cost is part of the contract; the alignment path is unused.
    min_cost, _ = dtw.dtw(xs, ys, cost_fn)
    return min_cost