Unverified commit 2642aab1, authored by Jeff Wang, committed by GitHub

Embedding doc (#424)

* Use a more lightweight method to compute PCA and t-SNE. Update the embedding documentation. Update the loading animation

* Reveal the embedding tab

* Add auto reload and fix typos

* Update comments
Parent c1c2232c
......@@ -63,13 +63,11 @@ export default {
title: 'TEXTS',
name: 'texts',
},
- /* // Hide the top menu
- {
-   url: '/HighDimensional',
-   title: 'HighDimensional',
-   name: 'HighDimensional'
- }
- */
+ {
+   url: '/HighDimensional',
+   title: 'HighDimensional',
+   name: 'HighDimensional',
+ },
],
};
},
......
......@@ -7,6 +7,7 @@
:search-text="config.searchText"
:dimension="config.dimension"
:embedding-data="embeddingData"
:show-loading="showLoading"
/>
</div>
<div class="visual-dl-page-right">
......@@ -26,6 +27,9 @@ import autoAdjustHeight from '../common/util/autoAdjustHeight';
import Config from './ui/Config';
import Chart from './ui/Chart';
// the interval, in seconds, between chart data refreshes
const intervalTime = 30;
export default {
components: {
'ui-config': Config,
......@@ -39,11 +43,12 @@ export default {
searchText: '',
displayWordLabel: true,
dimension: '2',
- reduction: 'tsne',
+ reduction: 'pca',
selectedRun: '',
running: true,
},
embeddingData: [],
showLoading: false,
};
},
created() {
......@@ -55,6 +60,13 @@ export default {
this.config.selectedRun = data[0];
}
});
if (this.config.running) {
this.startInterval();
}
},
beforeDestroy() {
this.stopInterval();
},
watch: {
'config.dimension': function(val) {
......@@ -66,6 +78,9 @@ export default {
'config.selectedRun': function(val) {
this.fetchDatasets();
},
'config.running': function(val) {
val ? this.startInterval() : this.stopInterval();
},
},
mounted() {
autoAdjustHeight();
......@@ -82,7 +97,18 @@ export default {
},
},
methods: {
stopInterval() {
clearInterval(this.getOriginDataInterval);
},
// fetch fresh data every {{intervalTime}} seconds
startInterval() {
this.getOriginDataInterval = setInterval(() => {
this.fetchDatasets();
}, intervalTime * 1000);
},
fetchDatasets() {
this.showLoading = true;
// Fetch the data from the server, passing the dimension and reduction method.
let params = {
dimension: this.config.dimension,
......@@ -90,6 +116,8 @@ export default {
run: this.config.selectedRun,
};
getHighDimensionalDatasets(params).then(({errno, data}) => {
this.showLoading = false;
let vectorData = data.embedding;
let labels = data.labels;
......
......@@ -35,6 +35,10 @@ export default {
type: String,
required: true,
},
showLoading: {
type: Boolean,
required: true,
},
},
data() {
return {
......@@ -53,15 +57,11 @@ export default {
created() {},
mounted() {
this.createChart();
- this.myChart.showLoading();
this.set2DChartOptions();
this.setDisplayWordLabel();
},
watch: {
embeddingData: function(val) {
- this.myChart.hideLoading();
// Got new data; pass it to the filter function to render the 'matched' and 'not matched' sets.
this.filterSeriesDataAndSetOption(this.searchText);
},
......@@ -70,7 +70,6 @@ export default {
},
dimension: function(val) {
this.myChart.clear();
- this.myChart.showLoading();
if (val === '2') {
this.set2DChartOptions();
this.setDisplayWordLabel();
......@@ -82,6 +81,13 @@ export default {
searchText: function(val) {
this.filterSeriesDataAndSetOption(val);
},
showLoading: function(val) {
if (val) {
this.myChart.showLoading();
} else {
this.myChart.hideLoading();
}
},
},
methods: {
createChart() {
......
......@@ -31,12 +31,13 @@
label="Reduction Method"
v-model="config.reduction"
dark>
- <v-radio
-   label="T-SNE"
-   value="tsne"/>
<v-radio
  label="PCA"
  value="pca"/>
+ <v-radio
+   label="T-SNE"
+   value="tsne"/>
</v-radio-group>
<v-radio-group
......
......@@ -253,9 +253,18 @@ PYBIND11_MODULE(core, m) {
.def("total_records", &cp::TextReader::total_records)
.def("size", &cp::TextReader::size);
- py::class_<cp::Embedding>(m, "EmbeddingWriter")
+ py::class_<cp::Embedding>(m, "EmbeddingWriter", R"pbdoc(
+ PyBind class. Must be instantiated through the LogWriter.
+ )pbdoc")
.def("set_caption", &cp::Embedding::SetCaption)
- .def("add_embeddings_with_word_list",
-      &cp::Embedding::AddEmbeddingsWithWordList);
+ .def("add_embeddings_with_word_list",
+      &cp::Embedding::AddEmbeddingsWithWordList,
+      R"pbdoc(
+      Add an embedding record. Each run can only store one embedding dataset.
+      :param embedding: list of embedding vectors, one per word
+      :type embedding: list
+      )pbdoc");
py::class_<cp::EmbeddingReader>(m, "EmbeddingReader")
......
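(Editor's note: pybind11 exposes the extra string argument as the Python docstring, so the pbdoc text above is what users see from the interpreter. A minimal, hypothetical inspection session, assuming the compiled `core` module is importable from the `visualdl` package:)

```python
# Hypothetical: assumes the compiled pybind module is importable as visualdl.core.
from visualdl import core

print(core.EmbeddingWriter.__doc__)  # prints the pbdoc text defined above
help(core.EmbeddingWriter.add_embeddings_with_word_list)
```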
......@@ -143,6 +143,9 @@ class LogReader(object):
return self.reader.get_text(tag)
def embedding(self):
"""
Get the embedding reader.
"""
return self.reader.get_embedding(EMBEDDING_TAG)
def audio(self, tag):
......@@ -292,9 +295,19 @@ class LogWriter(object):
return self.writer.new_text(tag)
def embedding(self):
"""
Create an embedding writer that is used to write
embedding data.
:return: An embedding writer to record embedding data
:rtype: embeddingWriter
"""
return self.writer.new_embedding(EMBEDDING_TAG)
def save(self):
"""
Force VisualDL to sync with the file system.
"""
self.writer.save()
def __enter__(self):
......
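(Editor's note: a minimal end-to-end sketch of the writer/reader pair documented above. The log directory, mode name, and sample vectors are illustrative, and the argument order of `add_embeddings_with_word_list` is assumed from the pbdoc above:)

```python
import numpy as np
from visualdl import LogReader, LogWriter

# Write one embedding record under the "train" run.
log_writer = LogWriter("./log", sync_cycle=10)
with log_writer.mode("train") as writer:
    embedding_writer = writer.embedding()
    word_list = ["apple", "banana", "cherry"]      # illustrative labels
    hot_vectors = np.random.randn(3, 50).tolist()  # illustrative embeddings
    embedding_writer.add_embeddings_with_word_list(hot_vectors, word_list)

# Read the record back for the same run.
log_reader = LogReader("./log")
with log_reader.mode("train") as reader:
    embedding = reader.embedding()
    print(embedding.get_all_labels())
```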
......@@ -307,19 +307,18 @@ def get_embeddings(storage, mode, reduction, dimension=2, num_records=5000):
with storage.mode(mode) as reader:
embedding = reader.embedding()
labels = embedding.get_all_labels()
- high_dimensional_vectors = embedding.get_all_embeddings()
+ high_dimensional_vectors = np.array(embedding.get_all_embeddings())
- # TODO: Move away from sklearn
if reduction == 'tsne':
- from sklearn.manifold import TSNE
- tsne = TSNE(
-     perplexity=30, n_components=dimension, init='pca', n_iter=5000)
- low_dim_embs = tsne.fit_transform(high_dimensional_vectors)
+ import tsne
+ low_dim_embs = tsne.tsne(
+     high_dimensional_vectors,
+     dimension,
+     initial_dims=50,
+     perplexity=30.0)
elif reduction == 'pca':
- from sklearn.decomposition import PCA
- pca = PCA(n_components=3)
- low_dim_embs = pca.fit_transform(high_dimensional_vectors)
+ low_dim_embs = simple_pca(high_dimensional_vectors, dimension)
return {"embedding": low_dim_embs.tolist(), "labels": labels}
......@@ -393,3 +392,26 @@ def cache_get(cache):
return data
return _handler
def simple_pca(x, dimension):
"""
A simple PCA implementation to do the dimension reduction.
"""
# Center the data (without mutating the caller's array).
x = x - np.mean(x, axis=0)
# Compute the covariance matrix.
cov = np.cov(x, rowvar=False)
# Get eigenvalues and eigenvectors; eigh suits the symmetric covariance
# matrix and, unlike eig, guarantees real-valued output.
eigvals, eigvecs = np.linalg.eigh(cov)
# Sort the eigvals from high to low
order = np.argsort(eigvals)[::-1]
# Drop the eigenvectors with low eigenvalues
eigvecs = eigvecs[:, order[:dimension]]
return np.dot(x, eigvecs)
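(Editor's note: a quick sanity check of simple_pca on random data; the sizes are illustrative:)

```python
import numpy as np

x = np.random.randn(200, 50)
low = simple_pca(x, 2)
print(low.shape)  # (200, 2)
# PCA components are uncorrelated, so off-diagonal entries should be ~0:
print(np.round(np.corrcoef(low, rowvar=False), 3))
```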
#
# tsne.py
#
# Implementation of t-SNE in Python. The implementation was tested on Python
# 2.7.10, and it requires a working installation of NumPy. The implementation
# comes with an example on the MNIST dataset. In order to plot the
# results of this example, a working installation of matplotlib is required.
#
# The example can be run by executing: `ipython tsne.py`
#
#
# Created by Laurens van der Maaten on 20-12-08.
# Copyright (c) 2008 Tilburg University. All rights reserved.
# Editor's note: Thanks to Laurens van der Maaten (https://lvdmaaten.github.io/tsne/)
# for allowing this code to be freely used, modified, and redistributed for
# non-commercial purposes.
# This file has been modified to remove some print output.
import numpy as np
def Hbeta(D=np.array([]), beta=1.0):
"""
Compute the perplexity and the P-row for a specific value of the
precision of a Gaussian distribution.
"""
# Compute P-row and corresponding perplexity
P = np.exp(-D.copy() * beta)
sumP = sum(P)
H = np.log(sumP) + beta * np.sum(D * P) / sumP
P = P / sumP
return H, P
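(Editor's note: in math terms, for precision beta, Hbeta evaluates the conditional distribution over neighbors and its entropy in nats, which is why x2p later compares H against logU = ln(perplexity):)

```latex
p_j = \frac{e^{-\beta D_j}}{\sum_k e^{-\beta D_k}},
\qquad
H = -\sum_j p_j \ln p_j
  = \ln\!\Big(\sum_k e^{-\beta D_k}\Big)
  + \beta\,\frac{\sum_j D_j e^{-\beta D_j}}{\sum_k e^{-\beta D_k}},
\qquad
\mathrm{Perp} = e^{H}.
```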
def x2p(X=np.array([]), tol=1e-5, perplexity=30.0):
"""
Performs a binary search to get P-values in such a way that each
conditional Gaussian has the same perplexity.
"""
# Initialize some variables
print("Computing pairwise distances...")
(n, d) = X.shape
sum_X = np.sum(np.square(X), 1)
D = np.add(np.add(-2 * np.dot(X, X.T), sum_X).T, sum_X)
P = np.zeros((n, n))
beta = np.ones((n, 1))
logU = np.log(perplexity)
# Loop over all datapoints
for i in range(n):
# Print progress
# if i % 500 == 0:
# print("Computing P-values for point %d of %d..." % (i, n))
# Compute the Gaussian kernel and entropy for the current precision
betamin = -np.inf
betamax = np.inf
Di = D[i, np.concatenate((np.r_[0:i], np.r_[i + 1:n]))]
(H, thisP) = Hbeta(Di, beta[i])
# Evaluate whether the perplexity is within tolerance
Hdiff = H - logU
tries = 0
while np.abs(Hdiff) > tol and tries < 50:
# If not, increase or decrease precision
if Hdiff > 0:
betamin = beta[i].copy()
if betamax == np.inf or betamax == -np.inf:
beta[i] = beta[i] * 2.
else:
beta[i] = (beta[i] + betamax) / 2.
else:
betamax = beta[i].copy()
if betamin == np.inf or betamin == -np.inf:
beta[i] = beta[i] / 2.
else:
beta[i] = (beta[i] + betamin) / 2.
# Recompute the values
(H, thisP) = Hbeta(Di, beta[i])
Hdiff = H - logU
tries += 1
# Set the final row of P
P[i, np.concatenate((np.r_[0:i], np.r_[i + 1:n]))] = thisP
# Return final P-matrix
print("Mean value of sigma: %f" % np.mean(np.sqrt(1 / beta)))
return P
def pca(X=np.array([]), no_dims=50):
"""
Runs PCA on the NxD array X in order to reduce its dimensionality to
no_dims dimensions.
"""
(n, d) = X.shape
X = X - np.tile(np.mean(X, 0), (n, 1))
(l, M) = np.linalg.eig(np.dot(X.T, X))
Y = np.dot(X, M[:, 0:no_dims])
return Y
def tsne(X=np.array([]), no_dims=2, initial_dims=50, perplexity=30.0):
"""
Runs t-SNE on the dataset in the NxD array X to reduce its
dimensionality to no_dims dimensions. The syntax of the function is
`Y = tsne.tsne(X, no_dims, perplexity)`, where X is an NxD NumPy array.
"""
# Check inputs
if isinstance(no_dims, float):
print("Error: array X should have type float.")
return -1
if round(no_dims) != no_dims:
print("Error: number of dimensions should be an integer.")
return -1
# Initialize variables
X = pca(X, initial_dims).real
(n, d) = X.shape
max_iter = 1000
initial_momentum = 0.5
final_momentum = 0.8
eta = 500
min_gain = 0.01
Y = np.random.randn(n, no_dims)
dY = np.zeros((n, no_dims))
iY = np.zeros((n, no_dims))
gains = np.ones((n, no_dims))
# Compute P-values
P = x2p(X, 1e-5, perplexity)
P = P + np.transpose(P)
P = P / np.sum(P)
P = P * 4. # early exaggeration
P = np.maximum(P, 1e-12)
# Run iterations
for iter in range(max_iter):
# Compute pairwise affinities
sum_Y = np.sum(np.square(Y), 1)
num = -2. * np.dot(Y, Y.T)
num = 1. / (1. + np.add(np.add(num, sum_Y).T, sum_Y))
num[range(n), range(n)] = 0.
Q = num / np.sum(num)
Q = np.maximum(Q, 1e-12)
# Compute gradient
PQ = P - Q
for i in range(n):
dY[i, :] = np.sum(
np.tile(PQ[:, i] * num[:, i], (no_dims, 1)).T * (Y[i, :] - Y),
0)
# Perform the update
if iter < 20:
momentum = initial_momentum
else:
momentum = final_momentum
gains = (gains + 0.2) * ((dY > 0.) != (iY > 0.)) + \
(gains * 0.8) * ((dY > 0.) == (iY > 0.))
gains[gains < min_gain] = min_gain
iY = momentum * iY - eta * (gains * dY)
Y = Y + iY
Y = Y - np.tile(np.mean(Y, 0), (n, 1))
# Compute current value of cost function
# if (iter + 1) % 10 == 0:
# C = np.sum(P * np.log(P / Q))
# print("Iteration %d: error is %f" % (iter + 1, C))
# Stop lying about P-values
if iter == 100:
P = P / 4.
# Return solution
return Y
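(Editor's note: a hypothetical invocation of the function above, mirroring how get_embeddings calls it earlier in this commit:)

```python
import numpy as np

X = np.random.randn(100, 50)  # illustrative data: 100 points in 50 dimensions
Y = tsne(X, no_dims=2, initial_dims=50, perplexity=30.0)
print(Y.shape)  # (100, 2)
```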