From 2642aab16be395721957aa9f6c3c2672ca1a1e19 Mon Sep 17 00:00:00 2001
From: Jeff Wang
Date: Wed, 25 Apr 2018 17:24:57 -0700
Subject: [PATCH] Embedding doc (#424)

* Use a more lightweight method to compute PCA and T-SNE. Update the embedding documentation. Update the loading animation.
* Reveal the embedding tab.
* Add auto reload and fix typos.
* Update comments.
---
 frontend/src/common/component/AppMenu.vue     |  12 +-
 .../src/high-dimensional/HighDimensional.vue  |  30 ++-
 frontend/src/high-dimensional/ui/Chart.vue    |  16 +-
 frontend/src/high-dimensional/ui/Config.vue   |   7 +-
 visualdl/logic/pybind.cc                      |  15 +-
 visualdl/python/storage.py                    |  13 ++
 visualdl/server/lib.py                        |  40 +++-
 visualdl/server/tsne.py                       | 184 ++++++++++++++++++
 8 files changed, 289 insertions(+), 28 deletions(-)
 create mode 100644 visualdl/server/tsne.py

diff --git a/frontend/src/common/component/AppMenu.vue b/frontend/src/common/component/AppMenu.vue
index 98c60aed..c7b6b489 100644
--- a/frontend/src/common/component/AppMenu.vue
+++ b/frontend/src/common/component/AppMenu.vue
@@ -63,13 +63,11 @@ export default {
           title: 'TEXTS',
           name: 'texts',
         },
-        /* // Hide the top menu
-        {
-          url: '/HighDimensional',
-          title: 'HighDimensional',
-          name: 'HighDimensional'
-        }
-        */
+        {
+          url: '/HighDimensional',
+          title: 'HighDimensional',
+          name: 'HighDimensional',
+        },
       ],
     };
   },
diff --git a/frontend/src/high-dimensional/HighDimensional.vue b/frontend/src/high-dimensional/HighDimensional.vue
index 50872acd..431ddfeb 100644
--- a/frontend/src/high-dimensional/HighDimensional.vue
+++ b/frontend/src/high-dimensional/HighDimensional.vue
@@ -7,6 +7,7 @@
       :search-text="config.searchText"
       :dimension="config.dimension"
       :embedding-data="embeddingData"
+      :show-loading="showLoading"
     />
@@ -26,6 +27,9 @@ import autoAdjustHeight from '../common/util/autoAdjustHeight';
 import Config from './ui/Config';
 import Chart from './ui/Chart';
 
+// The interval, in seconds, at which to refresh the chart data.
+const intervalTime = 30;
+
 export default {
   components: {
     'ui-config': Config,
@@ -39,11 +43,12 @@ export default {
         searchText: '',
         displayWordLabel: true,
         dimension: '2',
-        reduction: 'tsne',
+        reduction: 'pca',
         selectedRun: '',
         running: true,
       },
       embeddingData: [],
+      showLoading: false,
     };
   },
   created() {
@@ -55,6 +60,13 @@ export default {
         this.config.selectedRun = data[0];
       }
     });
+
+    if (this.config.running) {
+      this.startInterval();
+    }
+  },
+  beforeDestroy() {
+    this.stopInterval();
   },
   watch: {
     'config.dimension': function(val) {
@@ -66,6 +78,9 @@ export default {
     'config.selectedRun': function(val) {
       this.fetchDatasets();
     },
+    'config.running': function(val) {
+      val ? this.startInterval() : this.stopInterval();
+    },
   },
   mounted() {
     autoAdjustHeight();
@@ -82,7 +97,18 @@ export default {
     },
   },
   methods: {
+    stopInterval() {
+      clearInterval(this.getOriginDataInterval);
+    },
+    // Fetch the origin data every `intervalTime` seconds.
+    startInterval() {
+      this.getOriginDataInterval = setInterval(() => {
+        this.fetchDatasets();
+      }, intervalTime * 1000);
+    },
     fetchDatasets() {
+      this.showLoading = true;
+
       // Fetch the data from the server. Passing dimension and reduction method
       let params = {
         dimension: this.config.dimension,
@@ -90,6 +116,8 @@ export default {
         run: this.config.selectedRun,
       };
       getHighDimensionalDatasets(params).then(({errno, data}) => {
+        this.showLoading = false;
+
         let vectorData = data.embedding;
         let labels = data.labels;
 
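Editor's note: the component above consumes the payload produced by get_embeddings in visualdl/server/lib.py (further down in this patch): a dict with an embedding matrix and a parallel labels list. An illustrative sketch of that contract with made-up values (getHighDimensionalDatasets is a frontend wrapper around the HTTP call; only the payload shape is shown here):

    # Shape of the payload the HighDimensional component polls for
    # (illustrative values; see get_embeddings in visualdl/server/lib.py below).
    payload = {
        "embedding": [[0.12, -0.34],      # one low-dimensional point per word
                      [0.56, 0.78]],
        "labels": ["king", "queen"],      # parallel to the embedding rows
    }
    assert len(payload["embedding"]) == len(payload["labels"])
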
diff --git a/frontend/src/high-dimensional/ui/Chart.vue b/frontend/src/high-dimensional/ui/Chart.vue
index 06b78283..9ef7b170 100644
--- a/frontend/src/high-dimensional/ui/Chart.vue
+++ b/frontend/src/high-dimensional/ui/Chart.vue
@@ -35,6 +35,10 @@ export default {
       type: String,
       required: true,
     },
+    showLoading: {
+      type: Boolean,
+      required: true,
+    },
   },
   data() {
     return {
@@ -53,15 +57,11 @@ export default {
   created() {},
   mounted() {
     this.createChart();
-    this.myChart.showLoading();
-
     this.set2DChartOptions();
     this.setDisplayWordLabel();
   },
   watch: {
     embeddingData: function(val) {
-      this.myChart.hideLoading();
-
       // Got new data, pass to the filter function to render the 'matched' set and 'not matched' set
       this.filterSeriesDataAndSetOption(this.searchText);
     },
@@ -70,7 +70,6 @@ export default {
     },
     dimension: function(val) {
       this.myChart.clear();
-      this.myChart.showLoading();
       if (val === '2') {
         this.set2DChartOptions();
         this.setDisplayWordLabel();
@@ -82,6 +81,13 @@ export default {
     searchText: function(val) {
       this.filterSeriesDataAndSetOption(val);
     },
+    showLoading: function(val) {
+      if (val) {
+        this.myChart.showLoading();
+      } else {
+        this.myChart.hideLoading();
+      }
+    },
   },
   methods: {
     createChart() {
diff --git a/frontend/src/high-dimensional/ui/Config.vue b/frontend/src/high-dimensional/ui/Config.vue
index e67f2cf0..b2dac074 100644
--- a/frontend/src/high-dimensional/ui/Config.vue
+++ b/frontend/src/high-dimensional/ui/Config.vue
@@ -31,12 +31,13 @@
         <el-select
           label="Reduction Method"
          v-model="config.reduction"
          dark>
-          <el-option
-            label="T-SNE"
-            value="tsne"/>
+          <el-option
+            label="PCA"
+            value="pca"/>
+          <el-option label="T-SNE" value="tsne"/>
         </el-select>
diff --git a/visualdl/logic/pybind.cc b/visualdl/logic/pybind.cc
--- a/visualdl/logic/pybind.cc
+++ b/visualdl/logic/pybind.cc
@@ ... @@
-  py::class_<cp::Embedding>(m, "EmbeddingWriter")
+  py::class_<cp::Embedding>(m, "EmbeddingWriter", R"pbdoc(
+      PyBind class. Must be instantiated through the LogWriter.
+      )pbdoc")
       .def("set_caption", &cp::Embedding::SetCaption)
-      .def("add_embeddings_with_word_list",
-           &cp::Embedding::AddEmbeddingsWithWordList);
+      .def("add_embeddings_with_word_list",
+           &cp::Embedding::AddEmbeddingsWithWordList,
+           R"pbdoc(
+           Add an embedding record. Each run can only store one embedding record.
+
+           :param embedding: list of embedding vectors, one per word in the word list
+           :type embedding: list
+           )pbdoc");
 
   py::class_<cp::EmbeddingReader>(m, "EmbeddingReader")
       .def("get_all_labels", &cp::EmbeddingReader::get_all_labels)
diff --git a/visualdl/python/storage.py b/visualdl/python/storage.py
index b22cb04a..9eced0d0 100644
--- a/visualdl/python/storage.py
+++ b/visualdl/python/storage.py
@@ -143,6 +143,9 @@ class LogReader(object):
         return self.reader.get_text(tag)
 
     def embedding(self):
+        """
+        Get the embedding reader.
+        """
         return self.reader.get_embedding(EMBEDDING_TAG)
 
     def audio(self, tag):
@@ -292,9 +295,19 @@ class LogWriter(object):
         return self.writer.new_text(tag)
 
     def embedding(self):
+        """
+        Create an embedding writer that is used to write
+        embedding data.
+
+        :return: An embedding writer to record embedding data
+        :rtype: EmbeddingWriter
+        """
         return self.writer.new_embedding(EMBEDDING_TAG)
 
     def save(self):
+        """
+        Force VisualDL to sync with the file system.
+        """
         self.writer.save()
 
     def __enter__(self):
diff --git a/visualdl/server/lib.py b/visualdl/server/lib.py
index edc4c498..ddb25b07 100644
--- a/visualdl/server/lib.py
+++ b/visualdl/server/lib.py
@@ -307,19 +307,18 @@ def get_embeddings(storage, mode, reduction, dimension=2, num_records=5000):
     with storage.mode(mode) as reader:
         embedding = reader.embedding()
         labels = embedding.get_all_labels()
-        high_dimensional_vectors = embedding.get_all_embeddings()
+        high_dimensional_vectors = np.array(embedding.get_all_embeddings())
 
-        # TODO: Move away from sklearn
         if reduction == 'tsne':
-            from sklearn.manifold import TSNE
-            tsne = TSNE(
-                perplexity=30, n_components=dimension, init='pca', n_iter=5000)
-            low_dim_embs = tsne.fit_transform(high_dimensional_vectors)
+            import tsne
+            low_dim_embs = tsne.tsne(
+                high_dimensional_vectors,
+                dimension,
+                initial_dims=50,
+                perplexity=30.0)
 
         elif reduction == 'pca':
-            from sklearn.decomposition import PCA
-            pca = PCA(n_components=3)
-            low_dim_embs = pca.fit_transform(high_dimensional_vectors)
+            low_dim_embs = simple_pca(high_dimensional_vectors, dimension)
 
         return {"embedding": low_dim_embs.tolist(), "labels": labels}
 
@@ -393,3 +392,26 @@ def cache_get(cache):
         return data
 
     return _handler
+
+
+def simple_pca(x, dimension):
+    """
+    A simple PCA implementation to perform dimensionality reduction.
+    """
+
+    # Center the data.
+    x -= np.mean(x, axis=0)
+
+    # Compute the covariance matrix.
+    cov = np.cov(x, rowvar=False)
+
+    # Get the eigenvalues and eigenvectors of the covariance matrix;
+    # the matrix is symmetric, so eigh keeps the results real.
+    eigvals, eigvecs = np.linalg.eigh(cov)
+
+    # Sort the eigenvalues from high to low.
+    order = np.argsort(eigvals)[::-1]
+
+    # Drop the eigenvectors with the lowest eigenvalues.
+    eigvecs = eigvecs[:, order[:dimension]]
+
+    return np.dot(x, eigvecs)
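Editor's note: the simple_pca helper above replaces the previous scikit-learn dependency. It centers the data, eigendecomposes the covariance matrix, and projects onto the eigenvectors with the largest eigenvalues. Below is a minimal standalone sanity check of the same approach; it is an illustrative sketch rather than part of the patch, it inlines the function so it runs with NumPy alone, and the toy data is made up.

    import numpy as np

    def simple_pca(x, dimension):
        # Mirror of visualdl/server/lib.py above: center, eigendecompose
        # the (symmetric) covariance matrix, keep the top eigenvectors.
        x = x - np.mean(x, axis=0)
        cov = np.cov(x, rowvar=False)
        eigvals, eigvecs = np.linalg.eigh(cov)
        order = np.argsort(eigvals)[::-1]
        return np.dot(x, eigvecs[:, order[:dimension]])

    rng = np.random.RandomState(0)
    # 200 points in 5-D that mostly vary along a single direction.
    x = rng.randn(200, 1) * [3.0, 1.0, 0.5, 0.1, 0.05] + 0.01 * rng.randn(200, 5)
    y = simple_pca(x, 2)
    print(y.shape)                               # (200, 2)
    print(np.var(y[:, 0]) > np.var(y[:, 1]))     # True: first axis carries most variance
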
diff --git a/visualdl/server/tsne.py b/visualdl/server/tsne.py
new file mode 100644
index 00000000..9986c9ed
--- /dev/null
+++ b/visualdl/server/tsne.py
@@ -0,0 +1,184 @@
+#
+# tsne.py
+#
+# Implementation of t-SNE in Python. The implementation was tested on Python
+# 2.7.10, and it requires a working installation of NumPy. The implementation
+# comes with an example on the MNIST dataset. In order to plot the
+# results of this example, a working installation of matplotlib is required.
+#
+# The example can be run by executing: `ipython tsne.py`
+#
+#
+# Created by Laurens van der Maaten on 20-12-08.
+# Copyright (c) 2008 Tilburg University. All rights reserved.
+
+# Editor's note: thanks to https://lvdmaaten.github.io/tsne/ for allowing
+# this code to be freely used, modified, or redistributed for non-commercial
+# purposes. This file has been modified to remove some print statements.
+
+import numpy as np
+
+
+def Hbeta(D=np.array([]), beta=1.0):
+    """
+    Compute the perplexity and the P-row for a specific value of the
+    precision of a Gaussian distribution.
+    """
+
+    # Compute P-row and corresponding perplexity
+    P = np.exp(-D.copy() * beta)
+    sumP = sum(P)
+    H = np.log(sumP) + beta * np.sum(D * P) / sumP
+    P = P / sumP
+    return H, P
+
+
+def x2p(X=np.array([]), tol=1e-5, perplexity=30.0):
+    """
+    Performs a binary search to get P-values in such a way that each
+    conditional Gaussian has the same perplexity.
+    """
+
+    # Initialize some variables
+    print("Computing pairwise distances...")
+    (n, d) = X.shape
+    sum_X = np.sum(np.square(X), 1)
+    D = np.add(np.add(-2 * np.dot(X, X.T), sum_X).T, sum_X)
+    P = np.zeros((n, n))
+    beta = np.ones((n, 1))
+    logU = np.log(perplexity)
+
+    # Loop over all datapoints
+    for i in range(n):
+
+        # Print progress
+        # if i % 500 == 0:
+        #     print("Computing P-values for point %d of %d..." % (i, n))
+
+        # Compute the Gaussian kernel and entropy for the current precision
+        betamin = -np.inf
+        betamax = np.inf
+        Di = D[i, np.concatenate((np.r_[0:i], np.r_[i + 1:n]))]
+        (H, thisP) = Hbeta(Di, beta[i])
+
+        # Evaluate whether the perplexity is within tolerance
+        Hdiff = H - logU
+        tries = 0
+        while np.abs(Hdiff) > tol and tries < 50:
+
+            # If not, increase or decrease precision
+            if Hdiff > 0:
+                betamin = beta[i].copy()
+                if betamax == np.inf or betamax == -np.inf:
+                    beta[i] = beta[i] * 2.
+                else:
+                    beta[i] = (beta[i] + betamax) / 2.
+            else:
+                betamax = beta[i].copy()
+                if betamin == np.inf or betamin == -np.inf:
+                    beta[i] = beta[i] / 2.
+                else:
+                    beta[i] = (beta[i] + betamin) / 2.
+
+            # Recompute the values
+            (H, thisP) = Hbeta(Di, beta[i])
+            Hdiff = H - logU
+            tries += 1
+
+        # Set the final row of P
+        P[i, np.concatenate((np.r_[0:i], np.r_[i + 1:n]))] = thisP
+
+    # Return final P-matrix
+    print("Mean value of sigma: %f" % np.mean(np.sqrt(1 / beta)))
+    return P
+
+
+def pca(X=np.array([]), no_dims=50):
+    """
+    Runs PCA on the NxD array X in order to reduce its dimensionality to
+    no_dims dimensions.
+    """
+
+    (n, d) = X.shape
+    X = X - np.tile(np.mean(X, 0), (n, 1))
+    (l, M) = np.linalg.eig(np.dot(X.T, X))
+    Y = np.dot(X, M[:, 0:no_dims])
+    return Y
+
+
+def tsne(X=np.array([]), no_dims=2, initial_dims=50, perplexity=30.0):
+    """
+    Runs t-SNE on the dataset in the NxD array X to reduce its
+    dimensionality to no_dims dimensions. The syntax of the function is
+    `Y = tsne.tsne(X, no_dims, perplexity)`, where X is an NxD NumPy array.
+    """
+
+    # Check inputs
+    if isinstance(no_dims, float):
+        print("Error: number of dimensions should be an integer, not a float.")
+        return -1
+    if round(no_dims) != no_dims:
+        print("Error: number of dimensions should be an integer.")
+        return -1
+
+    # Initialize variables
+    X = pca(X, initial_dims).real
+    (n, d) = X.shape
+    max_iter = 1000
+    initial_momentum = 0.5
+    final_momentum = 0.8
+    eta = 500
+    min_gain = 0.01
+    Y = np.random.randn(n, no_dims)
+    dY = np.zeros((n, no_dims))
+    iY = np.zeros((n, no_dims))
+    gains = np.ones((n, no_dims))
+
+    # Compute P-values
+    P = x2p(X, 1e-5, perplexity)
+    P = P + np.transpose(P)
+    P = P / np.sum(P)
+    P = P * 4.  # early exaggeration
+    P = np.maximum(P, 1e-12)
+
+    # Run iterations
+    for iter in range(max_iter):
+
+        # Compute pairwise affinities
+        sum_Y = np.sum(np.square(Y), 1)
+        num = -2. * np.dot(Y, Y.T)
+        num = 1. / (1. + np.add(np.add(num, sum_Y).T, sum_Y))
+        num[range(n), range(n)] = 0.
+        Q = num / np.sum(num)
+        Q = np.maximum(Q, 1e-12)
+
+        # Compute gradient
+        PQ = P - Q
+        for i in range(n):
+            dY[i, :] = np.sum(
+                np.tile(PQ[:, i] * num[:, i], (no_dims, 1)).T * (Y[i, :] - Y),
+                0)
+
+        # Perform the update
+        if iter < 20:
+            momentum = initial_momentum
+        else:
+            momentum = final_momentum
+        gains = (gains + 0.2) * ((dY > 0.) != (iY > 0.)) + \
+            (gains * 0.8) * ((dY > 0.) == (iY > 0.))
+        gains[gains < min_gain] = min_gain
+        iY = momentum * iY - eta * (gains * dY)
+        Y = Y + iY
+        Y = Y - np.tile(np.mean(Y, 0), (n, 1))
+
+        # Compute current value of cost function
+        # if (iter + 1) % 10 == 0:
+        #     C = np.sum(P * np.log(P / Q))
+        #     print("Iteration %d: error is %f" % (iter + 1, C))
+
+        # Stop lying about P-values
+        if iter == 100:
+            P = P / 4.
+
+    # Return solution
+    return Y
--
GitLab
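Editor's note: for readers of this patch, here is how the documented write path fits together end to end. This is an illustrative sketch, not part of the patch: the log directory, sync_cycle value, words, and vectors are made up, and the argument order of add_embeddings_with_word_list follows the docstring added in pybind.cc above, so it should be checked against the actual binding.

    from visualdl import LogWriter

    log_writer = LogWriter("./vdl_log", sync_cycle=10)

    # Each run can only store one embedding record (see the pybind.cc
    # docstring above), so write the whole table at once.
    with log_writer.mode("train") as writer:
        embedding_writer = writer.embedding()
        word_list = ["king", "queen", "man", "woman"]
        embeddings = [[0.10, 0.20, 0.30],   # one vector per word in word_list
                      [0.10, 0.25, 0.29],
                      [0.40, 0.10, 0.00],
                      [0.38, 0.12, 0.01]]
        embedding_writer.add_embeddings_with_word_list(embeddings, word_list)

On the read side, the server's get_embeddings (visualdl/server/lib.py above) loads this record, reduces it with simple_pca or the bundled tsne module, and returns {"embedding": [...], "labels": [...]} to the frontend, which re-fetches it every intervalTime seconds.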