Unverified commit 2642aab1, authored by Jeff Wang, committed by GitHub

Embedding doc (#424)

* Use a more lightweight method to compute PCA and t-SNE. Update the embedding documentation. Update the loading animation

* Reveal the embedding tab

* Add auto reload and fix typos

* Update comments
Parent c1c2232c
......@@ -63,13 +63,11 @@ export default {
title: 'TEXTS',
name: 'texts',
},
- /* // Hide the top menu
- {
-   url: '/HighDimensional',
-   title: 'HighDimensional',
-   name: 'HighDimensional'
- }
- */
+ {
+   url: '/HighDimensional',
+   title: 'HighDimensional',
+   name: 'HighDimensional',
+ },
],
};
},
......
......@@ -7,6 +7,7 @@
:search-text="config.searchText"
:dimension="config.dimension"
:embedding-data="embeddingData"
:show-loading="showLoading"
/>
</div>
<div class="visual-dl-page-right">
......@@ -26,6 +27,9 @@ import autoAdjustHeight from '../common/util/autoAdjustHeight';
import Config from './ui/Config';
import Chart from './ui/Chart';
// the interval, in seconds, between chart data refreshes
const intervalTime = 30;
export default {
components: {
'ui-config': Config,
......@@ -39,11 +43,12 @@ export default {
searchText: '',
displayWordLabel: true,
dimension: '2',
- reduction: 'tsne',
+ reduction: 'pca',
selectedRun: '',
running: true,
},
embeddingData: [],
showLoading: false,
};
},
created() {
......@@ -55,6 +60,13 @@ export default {
this.config.selectedRun = data[0];
}
});
if (this.config.running) {
this.startInterval();
}
},
beforeDestroy() {
this.stopInterval();
},
watch: {
'config.dimension': function(val) {
......@@ -66,6 +78,9 @@ export default {
'config.selectedRun': function(val) {
this.fetchDatasets();
},
'config.running': function(val) {
val ? this.startInterval() : this.stopInterval();
},
},
mounted() {
autoAdjustHeight();
......@@ -82,7 +97,18 @@ export default {
},
},
methods: {
stopInterval() {
clearInterval(this.getOriginDataInterval);
},
// fetch fresh data every {{intervalTime}} seconds
startInterval() {
this.getOriginDataInterval = setInterval(() => {
this.fetchDatasets();
}, intervalTime * 1000);
},
fetchDatasets() {
this.showLoading = true;
// Fetch the data from the server, passing the dimension and reduction method.
let params = {
dimension: this.config.dimension,
......@@ -90,6 +116,8 @@ export default {
run: this.config.selectedRun,
};
getHighDimensionalDatasets(params).then(({errno, data}) => {
this.showLoading = false;
let vectorData = data.embedding;
let labels = data.labels;
......
......@@ -35,6 +35,10 @@ export default {
type: String,
required: true,
},
showLoading: {
type: Boolean,
required: true,
},
},
data() {
return {
......@@ -53,15 +57,11 @@ export default {
created() {},
mounted() {
this.createChart();
- this.myChart.showLoading();
this.set2DChartOptions();
this.setDisplayWordLabel();
},
watch: {
embeddingData: function(val) {
- this.myChart.hideLoading();
// Got new data; pass it to the filter function to render the 'matched' and 'not matched' sets.
this.filterSeriesDataAndSetOption(this.searchText);
},
......@@ -70,7 +70,6 @@ export default {
},
dimension: function(val) {
this.myChart.clear();
- this.myChart.showLoading();
if (val === '2') {
this.set2DChartOptions();
this.setDisplayWordLabel();
......@@ -82,6 +81,13 @@ export default {
searchText: function(val) {
this.filterSeriesDataAndSetOption(val);
},
showLoading: function(val) {
if (val) {
this.myChart.showLoading();
} else {
this.myChart.hideLoading();
}
},
},
methods: {
createChart() {
......
......@@ -31,12 +31,13 @@
label="Reduction Method"
v-model="config.reduction"
dark>
- <v-radio
-   label="T-SNE"
-   value="tsne"/>
<v-radio
  label="PCA"
  value="pca"/>
+ <v-radio
+   label="T-SNE"
+   value="tsne"/>
</v-radio-group>
<v-radio-group
......
......@@ -253,9 +253,18 @@ PYBIND11_MODULE(core, m) {
.def("total_records", &cp::TextReader::total_records)
.def("size", &cp::TextReader::size);
- py::class_<cp::Embedding>(m, "EmbeddingWriter")
+ py::class_<cp::Embedding>(m, "EmbeddingWriter", R"pbdoc(
+ PyBind class. Must be instantiated through the LogWriter.
+ )pbdoc")
.def("set_caption", &cp::Embedding::SetCaption)
- .def("add_embeddings_with_word_list",
-      &cp::Embedding::AddEmbeddingsWithWordList);
+ .def("add_embeddings_with_word_list",
+      &cp::Embedding::AddEmbeddingsWithWordList,
+      R"pbdoc(
+      Add an embedding record. Each run can only store one embedding dataset.
+      :param embedding: list of embedding vectors, one per word
+      :type embedding: list
+      )pbdoc");
py::class_<cp::EmbeddingReader>(m, "EmbeddingReader")
......
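(Editor's note: pybind11 exposes the extra string argument as the Python docstring, so the pbdoc text above is what users see from the interpreter. A minimal, hypothetical inspection session, assuming the compiled `core` module is importable from the `visualdl` package:)

```python
# Hypothetical: assumes the compiled pybind module is importable as visualdl.core.
from visualdl import core

print(core.EmbeddingWriter.__doc__)  # prints the pbdoc text defined above
help(core.EmbeddingWriter.add_embeddings_with_word_list)
```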
......@@ -143,6 +143,9 @@ class LogReader(object):
return self.reader.get_text(tag)
def embedding(self):
"""
Get the embedding reader.
"""
return self.reader.get_embedding(EMBEDDING_TAG)
def audio(self, tag):
......@@ -292,9 +295,19 @@ class LogWriter(object):
return self.writer.new_text(tag)
def embedding(self):
"""
Create an embedding writer that is used to write
embedding data.
:return: An embedding writer to record embedding data
:rtype: embeddingWriter
"""
return self.writer.new_embedding(EMBEDDING_TAG)
def save(self):
"""
Force VisualDL to sync with the file system.
"""
self.writer.save()
def __enter__(self):
......
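(Editor's note: a minimal end-to-end sketch of the writer/reader pair documented above. The log directory, mode name, and sample vectors are illustrative, and the argument order of `add_embeddings_with_word_list` is assumed from the pbdoc above:)

```python
import numpy as np
from visualdl import LogReader, LogWriter

# Write one embedding record under the "train" run.
log_writer = LogWriter("./log", sync_cycle=10)
with log_writer.mode("train") as writer:
    embedding_writer = writer.embedding()
    word_list = ["apple", "banana", "cherry"]      # illustrative labels
    hot_vectors = np.random.randn(3, 50).tolist()  # illustrative embeddings
    embedding_writer.add_embeddings_with_word_list(hot_vectors, word_list)

# Read the record back for the same run.
log_reader = LogReader("./log")
with log_reader.mode("train") as reader:
    embedding = reader.embedding()
    print(embedding.get_all_labels())
```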
......@@ -307,19 +307,18 @@ def get_embeddings(storage, mode, reduction, dimension=2, num_records=5000):
with storage.mode(mode) as reader:
embedding = reader.embedding()
labels = embedding.get_all_labels()
- high_dimensional_vectors = embedding.get_all_embeddings()
+ high_dimensional_vectors = np.array(embedding.get_all_embeddings())
- # TODO: Move away from sklearn
if reduction == 'tsne':
- from sklearn.manifold import TSNE
- tsne = TSNE(
-     perplexity=30, n_components=dimension, init='pca', n_iter=5000)
- low_dim_embs = tsne.fit_transform(high_dimensional_vectors)
+ import tsne
+ low_dim_embs = tsne.tsne(
+     high_dimensional_vectors,
+     dimension,
+     initial_dims=50,
+     perplexity=30.0)
elif reduction == 'pca':
- from sklearn.decomposition import PCA
- pca = PCA(n_components=3)
- low_dim_embs = pca.fit_transform(high_dimensional_vectors)
+ low_dim_embs = simple_pca(high_dimensional_vectors, dimension)
return {"embedding": low_dim_embs.tolist(), "labels": labels}
......@@ -393,3 +392,26 @@ def cache_get(cache):
return data
return _handler
def simple_pca(x, dimension):
"""
A simple PCA implementation to do the dimension reduction.
"""
# Center the data (without mutating the caller's array).
x = x - np.mean(x, axis=0)
# Compute the covariance matrix.
cov = np.cov(x, rowvar=False)
# Get eigenvalues and eigenvectors; eigh suits the symmetric covariance
# matrix and, unlike eig, guarantees real-valued output.
eigvals, eigvecs = np.linalg.eigh(cov)
# Sort the eigvals from high to low
order = np.argsort(eigvals)[::-1]
# Drop the eigenvectors with low eigenvalues
eigvecs = eigvecs[:, order[:dimension]]
return np.dot(x, eigvecs)
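(Editor's note: a quick sanity check of simple_pca on random data; the sizes are illustrative:)

```python
import numpy as np

x = np.random.randn(200, 50)
low = simple_pca(x, 2)
print(low.shape)  # (200, 2)
# PCA components are uncorrelated, so off-diagonal entries should be ~0:
print(np.round(np.corrcoef(low, rowvar=False), 3))
```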
#
# tsne.py
#
# Implementation of t-SNE in Python. The implementation was tested on Python
# 2.7.10, and it requires a working installation of NumPy. The implementation
# comes with an example on the MNIST dataset. In order to plot the
# results of this example, a working installation of matplotlib is required.
#
# The example can be run by executing: `ipython tsne.py`
#
#
# Created by Laurens van der Maaten on 20-12-08.
# Copyright (c) 2008 Tilburg University. All rights reserved.
# Editor's note: Thanks to Laurens van der Maaten (https://lvdmaaten.github.io/tsne/)
# for allowing this code to be freely used, modified, and redistributed for
# non-commercial purposes.
# This file has been modified to remove some print output.
import numpy as np
def Hbeta(D=np.array([]), beta=1.0):
"""
Compute the perplexity and the P-row for a specific value of the
precision of a Gaussian distribution.
"""
# Compute P-row and corresponding perplexity
P = np.exp(-D.copy() * beta)
sumP = sum(P)
H = np.log(sumP) + beta * np.sum(D * P) / sumP
P = P / sumP
return H, P
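(Editor's note: in math terms, for precision beta, Hbeta evaluates the conditional distribution over neighbors and its entropy in nats, which is why x2p later compares H against logU = ln(perplexity):)

```latex
p_j = \frac{e^{-\beta D_j}}{\sum_k e^{-\beta D_k}},
\qquad
H = -\sum_j p_j \ln p_j
  = \ln\!\Big(\sum_k e^{-\beta D_k}\Big)
  + \beta\,\frac{\sum_j D_j e^{-\beta D_j}}{\sum_k e^{-\beta D_k}},
\qquad
\mathrm{Perp} = e^{H}.
```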
def x2p(X=np.array([]), tol=1e-5, perplexity=30.0):
"""
Performs a binary search to get P-values in such a way that each
conditional Gaussian has the same perplexity.
"""
# Initialize some variables
print("Computing pairwise distances...")
(n, d) = X.shape
sum_X = np.sum(np.square(X), 1)
D = np.add(np.add(-2 * np.dot(X, X.T), sum_X).T, sum_X)
P = np.zeros((n, n))
beta = np.ones((n, 1))
logU = np.log(perplexity)
# Loop over all datapoints
for i in range(n):
# Print progress
# if i % 500 == 0:
# print("Computing P-values for point %d of %d..." % (i, n))
# Compute the Gaussian kernel and entropy for the current precision
betamin = -np.inf
betamax = np.inf
Di = D[i, np.concatenate((np.r_[0:i], np.r_[i + 1:n]))]
(H, thisP) = Hbeta(Di, beta[i])
# Evaluate whether the perplexity is within tolerance
Hdiff = H - logU
tries = 0
while np.abs(Hdiff) > tol and tries < 50:
# If not, increase or decrease precision
if Hdiff > 0:
betamin = beta[i].copy()
if betamax == np.inf or betamax == -np.inf:
beta[i] = beta[i] * 2.
else:
beta[i] = (beta[i] + betamax) / 2.
else:
betamax = beta[i].copy()
if betamin == np.inf or betamin == -np.inf:
beta[i] = beta[i] / 2.
else:
beta[i] = (beta[i] + betamin) / 2.
# Recompute the values
(H, thisP) = Hbeta(Di, beta[i])
Hdiff = H - logU
tries += 1
# Set the final row of P
P[i, np.concatenate((np.r_[0:i], np.r_[i + 1:n]))] = thisP
# Return final P-matrix
print("Mean value of sigma: %f" % np.mean(np.sqrt(1 / beta)))
return P
def pca(X=np.array([]), no_dims=50):
"""
Runs PCA on the NxD array X in order to reduce its dimensionality to
no_dims dimensions.
"""
(n, d) = X.shape
X = X - np.tile(np.mean(X, 0), (n, 1))
(l, M) = np.linalg.eig(np.dot(X.T, X))
Y = np.dot(X, M[:, 0:no_dims])
return Y
def tsne(X=np.array([]), no_dims=2, initial_dims=50, perplexity=30.0):
"""
Runs t-SNE on the dataset in the NxD array X to reduce its
dimensionality to no_dims dimensions. The syntax of the function is
`Y = tsne.tsne(X, no_dims, perplexity)`, where X is an NxD NumPy array.
"""
# Check inputs
if isinstance(no_dims, float):
print("Error: array X should have type float.")
return -1
if round(no_dims) != no_dims:
print("Error: number of dimensions should be an integer.")
return -1
# Initialize variables
X = pca(X, initial_dims).real
(n, d) = X.shape
max_iter = 1000
initial_momentum = 0.5
final_momentum = 0.8
eta = 500
min_gain = 0.01
Y = np.random.randn(n, no_dims)
dY = np.zeros((n, no_dims))
iY = np.zeros((n, no_dims))
gains = np.ones((n, no_dims))
# Compute P-values
P = x2p(X, 1e-5, perplexity)
P = P + np.transpose(P)
P = P / np.sum(P)
P = P * 4. # early exaggeration
P = np.maximum(P, 1e-12)
# Run iterations
for iter in range(max_iter):
# Compute pairwise affinities
sum_Y = np.sum(np.square(Y), 1)
num = -2. * np.dot(Y, Y.T)
num = 1. / (1. + np.add(np.add(num, sum_Y).T, sum_Y))
num[range(n), range(n)] = 0.
Q = num / np.sum(num)
Q = np.maximum(Q, 1e-12)
# Compute gradient
PQ = P - Q
for i in range(n):
dY[i, :] = np.sum(
np.tile(PQ[:, i] * num[:, i], (no_dims, 1)).T * (Y[i, :] - Y),
0)
# Perform the update
if iter < 20:
momentum = initial_momentum
else:
momentum = final_momentum
gains = (gains + 0.2) * ((dY > 0.) != (iY > 0.)) + \
(gains * 0.8) * ((dY > 0.) == (iY > 0.))
gains[gains < min_gain] = min_gain
iY = momentum * iY - eta * (gains * dY)
Y = Y + iY
Y = Y - np.tile(np.mean(Y, 0), (n, 1))
# Compute current value of cost function
# if (iter + 1) % 10 == 0:
# C = np.sum(P * np.log(P / Q))
# print("Iteration %d: error is %f" % (iter + 1, C))
# Stop lying about P-values
if iter == 100:
P = P / 4.
# Return solution
return Y
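(Editor's note: a hypothetical invocation of the function above, mirroring how get_embeddings calls it earlier in this commit:)

```python
import numpy as np

X = np.random.randn(100, 50)  # illustrative data: 100 points in 50 dimensions
Y = tsne(X, no_dims=2, initial_dims=50, perplexity=30.0)
print(Y.shape)  # (100, 2)
```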