From 2642aab16be395721957aa9f6c3c2672ca1a1e19 Mon Sep 17 00:00:00 2001
From: Jeff Wang
Date: Wed, 25 Apr 2018 17:24:57 -0700
Subject: [PATCH] Embedding doc (#424)

* Use a more lightweight method to compute PCA and T-SNE. Update the embedding documentation. Update the loading animation.
* Reveal the embedding tab.
* Add auto reload and fix typos.
* Update comments.
---
 frontend/src/common/component/AppMenu.vue     |  12 +-
 .../src/high-dimensional/HighDimensional.vue  |  30 ++-
 frontend/src/high-dimensional/ui/Chart.vue    |  16 +-
 frontend/src/high-dimensional/ui/Config.vue   |   7 +-
 visualdl/logic/pybind.cc                      |  15 +-
 visualdl/python/storage.py                    |  13 ++
 visualdl/server/lib.py                        |  40 +++-
 visualdl/server/tsne.py                       | 184 ++++++++++++++++++
 8 files changed, 289 insertions(+), 28 deletions(-)
 create mode 100644 visualdl/server/tsne.py

diff --git a/frontend/src/common/component/AppMenu.vue b/frontend/src/common/component/AppMenu.vue
index 98c60aed..c7b6b489 100644
--- a/frontend/src/common/component/AppMenu.vue
+++ b/frontend/src/common/component/AppMenu.vue
@@ -63,13 +63,11 @@ export default {
           title: 'TEXTS',
           name: 'texts',
         },
-        /* // Hide the top menu
-        {
-          url: '/HighDimensional',
-          title: 'HighDimensional',
-          name: 'HighDimensional'
-        }
-        */
+        {
+          url: '/HighDimensional',
+          title: 'HighDimensional',
+          name: 'HighDimensional',
+        },
       ],
     };
   },
diff --git a/frontend/src/high-dimensional/HighDimensional.vue b/frontend/src/high-dimensional/HighDimensional.vue
index 50872acd..431ddfeb 100644
--- a/frontend/src/high-dimensional/HighDimensional.vue
+++ b/frontend/src/high-dimensional/HighDimensional.vue
@@ -7,6 +7,7 @@
       :search-text="config.searchText"
       :dimension="config.dimension"
       :embedding-data="embeddingData"
+      :show-loading="showLoading"
     />
@@ -26,6 +27,9 @@ import autoAdjustHeight from '../common/util/autoAdjustHeight';
 import Config from './ui/Config';
 import Chart from './ui/Chart';
 
+// The interval, in seconds, at which to refresh the chart data.
+const intervalTime = 30;
+
 export default {
   components: {
     'ui-config': Config,
@@ -39,11 +43,12 @@ export default {
         searchText: '',
         displayWordLabel: true,
         dimension: '2',
-        reduction: 'tsne',
+        reduction: 'pca',
         selectedRun: '',
         running: true,
       },
       embeddingData: [],
+      showLoading: false,
     };
   },
   created() {
@@ -55,6 +60,13 @@ export default {
         this.config.selectedRun = data[0];
       }
     });
+
+    if (this.config.running) {
+      this.startInterval();
+    }
+  },
+  beforeDestroy() {
+    this.stopInterval();
   },
   watch: {
     'config.dimension': function(val) {
@@ -66,6 +78,9 @@ export default {
     'config.selectedRun': function(val) {
       this.fetchDatasets();
     },
+    'config.running': function(val) {
+      val ? this.startInterval() : this.stopInterval();
+    },
   },
   mounted() {
     autoAdjustHeight();
@@ -82,7 +97,18 @@ export default {
     },
   },
   methods: {
+    stopInterval() {
+      clearInterval(this.getOriginDataInterval);
+    },
+    // Fetch the origin data every `intervalTime` seconds.
+    startInterval() {
+      this.getOriginDataInterval = setInterval(() => {
+        this.fetchDatasets();
+      }, intervalTime * 1000);
+    },
     fetchDatasets() {
+      this.showLoading = true;
+
       // Fetch the data from the server. Passing dimension and reduction method
       let params = {
         dimension: this.config.dimension,
@@ -90,6 +116,8 @@ export default {
         run: this.config.selectedRun,
       };
       getHighDimensionalDatasets(params).then(({errno, data}) => {
+        this.showLoading = false;
+
         let vectorData = data.embedding;
         let labels = data.labels;
 
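Editor's note: the component above consumes the payload produced by get_embeddings in visualdl/server/lib.py (further down in this patch): a dict with an embedding matrix and a parallel labels list. An illustrative sketch of that contract with made-up values (getHighDimensionalDatasets is a frontend wrapper around the HTTP call; only the payload shape is shown here):

    # Shape of the payload the HighDimensional component polls for
    # (illustrative values; see get_embeddings in visualdl/server/lib.py below).
    payload = {
        "embedding": [[0.12, -0.34],      # one low-dimensional point per word
                      [0.56, 0.78]],
        "labels": ["king", "queen"],      # parallel to the embedding rows
    }
    assert len(payload["embedding"]) == len(payload["labels"])
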
diff --git a/frontend/src/high-dimensional/ui/Chart.vue b/frontend/src/high-dimensional/ui/Chart.vue
index 06b78283..9ef7b170 100644
--- a/frontend/src/high-dimensional/ui/Chart.vue
+++ b/frontend/src/high-dimensional/ui/Chart.vue
@@ -35,6 +35,10 @@ export default {
       type: String,
       required: true,
     },
+    showLoading: {
+      type: Boolean,
+      required: true,
+    },
   },
   data() {
     return {
@@ -53,15 +57,11 @@ export default {
   created() {},
   mounted() {
     this.createChart();
-    this.myChart.showLoading();
-
     this.set2DChartOptions();
     this.setDisplayWordLabel();
   },
   watch: {
     embeddingData: function(val) {
-      this.myChart.hideLoading();
-
       // Got new data, pass to the filter function to render the 'matched' set and 'not matched' set
       this.filterSeriesDataAndSetOption(this.searchText);
     },
@@ -70,7 +70,6 @@ export default {
     },
     dimension: function(val) {
       this.myChart.clear();
-      this.myChart.showLoading();
       if (val === '2') {
         this.set2DChartOptions();
         this.setDisplayWordLabel();
@@ -82,6 +81,13 @@ export default {
     searchText: function(val) {
       this.filterSeriesDataAndSetOption(val);
     },
+    showLoading: function(val) {
+      if (val) {
+        this.myChart.showLoading();
+      } else {
+        this.myChart.hideLoading();
+      }
+    },
   },
   methods: {
     createChart() {
diff --git a/frontend/src/high-dimensional/ui/Config.vue b/frontend/src/high-dimensional/ui/Config.vue
index e67f2cf0..b2dac074 100644
--- a/frontend/src/high-dimensional/ui/Config.vue
+++ b/frontend/src/high-dimensional/ui/Config.vue
@@ -31,12 +31,13 @@
         <el-select
           label="Reduction Method"
          v-model="config.reduction"
          dark>
-          <el-option
-            label="T-SNE"
-            value="tsne"/>
+          <el-option
+            label="PCA"
+            value="pca"/>
+          <el-option label="T-SNE" value="tsne"/>
         </el-select>
diff --git a/visualdl/logic/pybind.cc b/visualdl/logic/pybind.cc
--- a/visualdl/logic/pybind.cc
+++ b/visualdl/logic/pybind.cc
@@ ... @@
-  py::class_<cp::Embedding>(m, "EmbeddingWriter")
+  py::class_<cp::Embedding>(m, "EmbeddingWriter", R"pbdoc(
+      PyBind class. Must be instantiated through the LogWriter.
+      )pbdoc")
       .def("set_caption", &cp::Embedding::SetCaption)
-      .def("add_embeddings_with_word_list",
-           &cp::Embedding::AddEmbeddingsWithWordList);
+      .def("add_embeddings_with_word_list",
+           &cp::Embedding::AddEmbeddingsWithWordList,
+           R"pbdoc(
+           Add an embedding record. Each run can only store one embedding record.
+
+           :param embedding: list of embedding vectors, one per word in the word list
+           :type embedding: list
+           )pbdoc");
 
   py::class_<cp::EmbeddingReader>(m, "EmbeddingReader")
       .def("get_all_labels", &cp::EmbeddingReader::get_all_labels)
diff --git a/visualdl/python/storage.py b/visualdl/python/storage.py
index b22cb04a..9eced0d0 100644
--- a/visualdl/python/storage.py
+++ b/visualdl/python/storage.py
@@ -143,6 +143,9 @@ class LogReader(object):
         return self.reader.get_text(tag)
 
     def embedding(self):
+        """
+        Get the embedding reader.
+        """
         return self.reader.get_embedding(EMBEDDING_TAG)
 
     def audio(self, tag):
@@ -292,9 +295,19 @@ class LogWriter(object):
         return self.writer.new_text(tag)
 
     def embedding(self):
+        """
+        Create an embedding writer that is used to write
+        embedding data.
+
+        :return: An embedding writer to record embedding data
+        :rtype: EmbeddingWriter
+        """
         return self.writer.new_embedding(EMBEDDING_TAG)
 
     def save(self):
+        """
+        Force VisualDL to sync with the file system.
+        """
         self.writer.save()
 
     def __enter__(self):
diff --git a/visualdl/server/lib.py b/visualdl/server/lib.py
index edc4c498..ddb25b07 100644
--- a/visualdl/server/lib.py
+++ b/visualdl/server/lib.py
@@ -307,19 +307,18 @@ def get_embeddings(storage, mode, reduction, dimension=2, num_records=5000):
     with storage.mode(mode) as reader:
         embedding = reader.embedding()
         labels = embedding.get_all_labels()
-        high_dimensional_vectors = embedding.get_all_embeddings()
+        high_dimensional_vectors = np.array(embedding.get_all_embeddings())
 
-        # TODO: Move away from sklearn
         if reduction == 'tsne':
-            from sklearn.manifold import TSNE
-            tsne = TSNE(
-                perplexity=30, n_components=dimension, init='pca', n_iter=5000)
-            low_dim_embs = tsne.fit_transform(high_dimensional_vectors)
+            import tsne
+            low_dim_embs = tsne.tsne(
+                high_dimensional_vectors,
+                dimension,
+                initial_dims=50,
+                perplexity=30.0)
 
         elif reduction == 'pca':
-            from sklearn.decomposition import PCA
-            pca = PCA(n_components=3)
-            low_dim_embs = pca.fit_transform(high_dimensional_vectors)
+            low_dim_embs = simple_pca(high_dimensional_vectors, dimension)
 
         return {"embedding": low_dim_embs.tolist(), "labels": labels}
 
@@ -393,3 +392,26 @@ def cache_get(cache):
         return data
 
     return _handler
+
+
+def simple_pca(x, dimension):
+    """
+    A simple PCA implementation to perform dimensionality reduction.
+    """
+
+    # Center the data.
+    x -= np.mean(x, axis=0)
+
+    # Compute the covariance matrix.
+    cov = np.cov(x, rowvar=False)
+
+    # Get the eigenvalues and eigenvectors of the covariance matrix;
+    # the matrix is symmetric, so eigh keeps the results real.
+    eigvals, eigvecs = np.linalg.eigh(cov)
+
+    # Sort the eigenvalues from high to low.
+    order = np.argsort(eigvals)[::-1]
+
+    # Drop the eigenvectors with the lowest eigenvalues.
+    eigvecs = eigvecs[:, order[:dimension]]
+
+    return np.dot(x, eigvecs)
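Editor's note: the simple_pca helper above replaces the previous scikit-learn dependency. It centers the data, eigendecomposes the covariance matrix, and projects onto the eigenvectors with the largest eigenvalues. Below is a minimal standalone sanity check of the same approach; it is an illustrative sketch rather than part of the patch, it inlines the function so it runs with NumPy alone, and the toy data is made up.

    import numpy as np

    def simple_pca(x, dimension):
        # Mirror of visualdl/server/lib.py above: center, eigendecompose
        # the (symmetric) covariance matrix, keep the top eigenvectors.
        x = x - np.mean(x, axis=0)
        cov = np.cov(x, rowvar=False)
        eigvals, eigvecs = np.linalg.eigh(cov)
        order = np.argsort(eigvals)[::-1]
        return np.dot(x, eigvecs[:, order[:dimension]])

    rng = np.random.RandomState(0)
    # 200 points in 5-D that mostly vary along a single direction.
    x = rng.randn(200, 1) * [3.0, 1.0, 0.5, 0.1, 0.05] + 0.01 * rng.randn(200, 5)
    y = simple_pca(x, 2)
    print(y.shape)                               # (200, 2)
    print(np.var(y[:, 0]) > np.var(y[:, 1]))     # True: first axis carries most variance
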
diff --git a/visualdl/server/tsne.py b/visualdl/server/tsne.py
new file mode 100644
index 00000000..9986c9ed
--- /dev/null
+++ b/visualdl/server/tsne.py
@@ -0,0 +1,184 @@
+#
+# tsne.py
+#
+# Implementation of t-SNE in Python. The implementation was tested on Python
+# 2.7.10, and it requires a working installation of NumPy. The implementation
+# comes with an example on the MNIST dataset. In order to plot the
+# results of this example, a working installation of matplotlib is required.
+#
+# The example can be run by executing: `ipython tsne.py`
+#
+#
+# Created by Laurens van der Maaten on 20-12-08.
+# Copyright (c) 2008 Tilburg University. All rights reserved.
+
+# Editor's note: thanks to https://lvdmaaten.github.io/tsne/ for allowing
+# this code to be freely used, modified, or redistributed for non-commercial
+# purposes. This file has been modified to remove some print statements.
+
+import numpy as np
+
+
+def Hbeta(D=np.array([]), beta=1.0):
+    """
+    Compute the perplexity and the P-row for a specific value of the
+    precision of a Gaussian distribution.
+    """
+
+    # Compute P-row and corresponding perplexity
+    P = np.exp(-D.copy() * beta)
+    sumP = sum(P)
+    H = np.log(sumP) + beta * np.sum(D * P) / sumP
+    P = P / sumP
+    return H, P
+
+
+def x2p(X=np.array([]), tol=1e-5, perplexity=30.0):
+    """
+    Performs a binary search to get P-values in such a way that each
+    conditional Gaussian has the same perplexity.
+    """
+
+    # Initialize some variables
+    print("Computing pairwise distances...")
+    (n, d) = X.shape
+    sum_X = np.sum(np.square(X), 1)
+    D = np.add(np.add(-2 * np.dot(X, X.T), sum_X).T, sum_X)
+    P = np.zeros((n, n))
+    beta = np.ones((n, 1))
+    logU = np.log(perplexity)
+
+    # Loop over all datapoints
+    for i in range(n):
+
+        # Print progress
+        # if i % 500 == 0:
+        #     print("Computing P-values for point %d of %d..." % (i, n))
+
+        # Compute the Gaussian kernel and entropy for the current precision
+        betamin = -np.inf
+        betamax = np.inf
+        Di = D[i, np.concatenate((np.r_[0:i], np.r_[i + 1:n]))]
+        (H, thisP) = Hbeta(Di, beta[i])
+
+        # Evaluate whether the perplexity is within tolerance
+        Hdiff = H - logU
+        tries = 0
+        while np.abs(Hdiff) > tol and tries < 50:
+
+            # If not, increase or decrease precision
+            if Hdiff > 0:
+                betamin = beta[i].copy()
+                if betamax == np.inf or betamax == -np.inf:
+                    beta[i] = beta[i] * 2.
+                else:
+                    beta[i] = (beta[i] + betamax) / 2.
+            else:
+                betamax = beta[i].copy()
+                if betamin == np.inf or betamin == -np.inf:
+                    beta[i] = beta[i] / 2.
+                else:
+                    beta[i] = (beta[i] + betamin) / 2.
+
+            # Recompute the values
+            (H, thisP) = Hbeta(Di, beta[i])
+            Hdiff = H - logU
+            tries += 1
+
+        # Set the final row of P
+        P[i, np.concatenate((np.r_[0:i], np.r_[i + 1:n]))] = thisP
+
+    # Return final P-matrix
+    print("Mean value of sigma: %f" % np.mean(np.sqrt(1 / beta)))
+    return P
+
+
+def pca(X=np.array([]), no_dims=50):
+    """
+    Runs PCA on the NxD array X in order to reduce its dimensionality to
+    no_dims dimensions.
+    """
+
+    (n, d) = X.shape
+    X = X - np.tile(np.mean(X, 0), (n, 1))
+    (l, M) = np.linalg.eig(np.dot(X.T, X))
+    Y = np.dot(X, M[:, 0:no_dims])
+    return Y
+
+
+def tsne(X=np.array([]), no_dims=2, initial_dims=50, perplexity=30.0):
+    """
+    Runs t-SNE on the dataset in the NxD array X to reduce its
+    dimensionality to no_dims dimensions. The syntax of the function is
+    `Y = tsne.tsne(X, no_dims, perplexity)`, where X is an NxD NumPy array.
+    """
+
+    # Check inputs
+    if isinstance(no_dims, float):
+        print("Error: number of dimensions should be an integer, not a float.")
+        return -1
+    if round(no_dims) != no_dims:
+        print("Error: number of dimensions should be an integer.")
+        return -1
+
+    # Initialize variables
+    X = pca(X, initial_dims).real
+    (n, d) = X.shape
+    max_iter = 1000
+    initial_momentum = 0.5
+    final_momentum = 0.8
+    eta = 500
+    min_gain = 0.01
+    Y = np.random.randn(n, no_dims)
+    dY = np.zeros((n, no_dims))
+    iY = np.zeros((n, no_dims))
+    gains = np.ones((n, no_dims))
+
+    # Compute P-values
+    P = x2p(X, 1e-5, perplexity)
+    P = P + np.transpose(P)
+    P = P / np.sum(P)
+    P = P * 4.  # early exaggeration
+    P = np.maximum(P, 1e-12)
+
+    # Run iterations
+    for iter in range(max_iter):
+
+        # Compute pairwise affinities
+        sum_Y = np.sum(np.square(Y), 1)
+        num = -2. * np.dot(Y, Y.T)
+        num = 1. / (1. + np.add(np.add(num, sum_Y).T, sum_Y))
+        num[range(n), range(n)] = 0.
+        Q = num / np.sum(num)
+        Q = np.maximum(Q, 1e-12)
+
+        # Compute gradient
+        PQ = P - Q
+        for i in range(n):
+            dY[i, :] = np.sum(
+                np.tile(PQ[:, i] * num[:, i], (no_dims, 1)).T * (Y[i, :] - Y),
+                0)
+
+        # Perform the update
+        if iter < 20:
+            momentum = initial_momentum
+        else:
+            momentum = final_momentum
+        gains = (gains + 0.2) * ((dY > 0.) != (iY > 0.)) + \
+            (gains * 0.8) * ((dY > 0.) == (iY > 0.))
+        gains[gains < min_gain] = min_gain
+        iY = momentum * iY - eta * (gains * dY)
+        Y = Y + iY
+        Y = Y - np.tile(np.mean(Y, 0), (n, 1))
+
+        # Compute current value of cost function
+        # if (iter + 1) % 10 == 0:
+        #     C = np.sum(P * np.log(P / Q))
+        #     print("Iteration %d: error is %f" % (iter + 1, C))
+
+        # Stop lying about P-values
+        if iter == 100:
+            P = P / 4.
+
+    # Return solution
+    return Y
--
GitLab
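Editor's note: for readers of this patch, here is how the documented write path fits together end to end. This is an illustrative sketch, not part of the patch: the log directory, sync_cycle value, words, and vectors are made up, and the argument order of add_embeddings_with_word_list follows the docstring added in pybind.cc above, so it should be checked against the actual binding.

    from visualdl import LogWriter

    log_writer = LogWriter("./vdl_log", sync_cycle=10)

    # Each run can only store one embedding record (see the pybind.cc
    # docstring above), so write the whole table at once.
    with log_writer.mode("train") as writer:
        embedding_writer = writer.embedding()
        word_list = ["king", "queen", "man", "woman"]
        embeddings = [[0.10, 0.20, 0.30],   # one vector per word in word_list
                      [0.10, 0.25, 0.29],
                      [0.40, 0.10, 0.00],
                      [0.38, 0.12, 0.01]]
        embedding_writer.add_embeddings_with_word_list(embeddings, word_list)

On the read side, the server's get_embeddings (visualdl/server/lib.py above) loads this record, reduces it with simple_pca or the bundled tsne module, and returns {"embedding": [...], "labels": [...]} to the frontend, which re-fetches it every intervalTime seconds.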