提交 fa4ada32 编写于 作者: T Tomas Matousek

Refactor ServiceHub service connection

上级 9bb6f239
......@@ -17,17 +17,21 @@ internal sealed partial class ServiceHubRemoteHostClient : RemoteHostClient
{
internal static class Connections
{
private static readonly TimeSpan s_reportTimeout = TimeSpan.FromMinutes(10);
private static bool s_timeoutReported = false;
/// <summary>
/// Calls <paramref name="funcAsync"/> and retries for up to <paramref name="timeout"/> if the call throws
/// <typeparamref name="TException"/>. Any other exception from the call is not handled here.
/// Wrap <see cref="HubClient.RequestServiceAsync"/> since we can't control its internal timeout value ourselves.
/// See https://devdiv.visualstudio.com/DefaultCollection/DevDiv/Editor/_workitems?id=378757&amp;fullScreen=false&amp;_a=edit
/// </summary>
public static async Task<TResult> RetryRemoteCallAsync<TException, TResult>(
private static async Task<Stream> RequestServiceWithCancellationRetryAsync(
Workspace workspace,
Func<Task<TResult>> funcAsync,
HubClient client,
ServiceDescriptor descriptor,
TimeSpan timeout,
CancellationToken cancellationToken) where TException : Exception
CancellationToken cancellationToken)
{
const int retry_delayInMS = 50;
const int RetryDelayInMS = 50;
using (var pooledStopwatch = SharedPools.Default<Stopwatch>().GetPooledObject())
{
......@@ -40,18 +44,25 @@ internal static class Connections
try
{
return await funcAsync().ConfigureAwait(false);
return await client.RequestServiceAsync(descriptor, cancellationToken).ConfigureAwait(false);
}
catch (TException)
catch (OperationCanceledException e) when (e.CancellationToken != cancellationToken)
{
// throw cancellation token if operation is cancelled
cancellationToken.ThrowIfCancellationRequested();
// Retry on cancellation that is not sourced by our cancellation token,
// since HubClient will throw when it can't connect to the service hub service (e.g. timeout, disposal).
}
// wait for retry_delayInMS before next try
await Task.Delay(retry_delayInMS, cancellationToken).ConfigureAwait(false);
// wait before next try
await Task.Delay(RetryDelayInMS, cancellationToken).ConfigureAwait(false);
ReportTimeout(watch);
// if we tried for more than 10 mins and still couldn't connect, report non-fatal Watson
if (!s_timeoutReported && watch.Elapsed > s_reportTimeout)
{
s_timeoutReported = true;
// report service hub logs along with dump
new Exception("RequestServiceAsync Timeout").ReportServiceHubNFW("RequestServiceAsync Timeout");
}
}
}
......@@ -72,34 +83,27 @@ internal static class Connections
TimeSpan timeout,
CancellationToken cancellationToken)
{
const int max_retry = 10;
const int retry_delayInMS = 50;
const int MaxRetryAttempts = 10;
const int RetryDelayInMS = 50;
RemoteInvocationException lastException = null;
var descriptor = new ServiceDescriptor(serviceName) { HostGroup = hostGroup };
// call to get service can fail due to this bug - devdiv#288961 or more.
// The call to get the service can fail due to devdiv#288961, among other bugs.
// Until the root cause is fixed, we retry rather than fail right away.
for (var i = 0; i < max_retry; i++)
//
// There are two levels of retry here. They are kept separate because they address different problems:
// the first retry most likely deals with a real issue in ServiceHub, while the second retry (cancellation) deals with
// ServiceHub behavior we don't want to use.
for (var i = 0; i < MaxRetryAttempts; i++)
{
try
{
// we are wrapping HubClient.RequestServiceAsync since we can't control its internal timeout value ourselves.
// we have bug opened to track the issue.
// https://devdiv.visualstudio.com/DefaultCollection/DevDiv/Editor/_workitems?id=378757&fullScreen=false&_a=edit
// retry on cancellation token since HubClient will throw its own cancellation token
// when it couldn't connect to service hub service for some reasons
// (ex, OOP process GC blocked and not responding to request)
//
// we have double re-try here. we have these 2 separated since 2 retries are for different problems.
// as noted by 2 different issues above at the start of each 2 different retries.
// first retry most likely deal with real issue on servicehub, second retry (cancellation) is to deal with
// by design servicehub behavior we don't want to use.
return await RetryRemoteCallAsync<OperationCanceledException, Stream>(
return await RequestServiceWithCancellationRetryAsync(
workspace,
() => client.RequestServiceAsync(descriptor, cancellationToken),
client,
descriptor,
timeout,
cancellationToken).ConfigureAwait(false);
}
......@@ -116,8 +120,8 @@ internal static class Connections
}
}
// wait for retry_delayInMS before next try
await Task.Delay(retry_delayInMS, cancellationToken).ConfigureAwait(false);
// wait before next try
await Task.Delay(RetryDelayInMS, cancellationToken).ConfigureAwait(false);
}
RemoteHostCrashInfoBar.ShowInfoBar(workspace, lastException);
......@@ -126,24 +130,6 @@ internal static class Connections
// we had enough feedback from users not to crash VS on servicehub failure
throw new SoftCrashException("RequestServiceAsync Failed", lastException, cancellationToken);
}
#region code related to make diagnosis easier later
private static readonly TimeSpan s_reportTimeout = TimeSpan.FromMinutes(10);
private static bool s_timeoutReported = false;
/// <summary>
/// Files a non-fatal Watson (NFW) report, at most once, when <paramref name="watch"/> shows that
/// more time than <c>s_reportTimeout</c> has elapsed.
/// </summary>
/// <param name="watch">Stopwatch whose elapsed time is compared against <c>s_reportTimeout</c>.</param>
private static void ReportTimeout(Stopwatch watch)
{
    // Nothing to do if a report was already filed or the threshold has not been exceeded yet.
    if (s_timeoutReported || watch.Elapsed <= s_reportTimeout)
    {
        return;
    }

    // Set the flag before reporting so that later calls do not report again.
    s_timeoutReported = true;

    // report service hub logs along with dump
    var timeoutException = new Exception("RequestServiceAsync Timeout");
    timeoutException.ReportServiceHubNFW("RequestServiceAsync Timeout");
}
#endregion
}
}
}
......@@ -40,99 +40,82 @@ private enum GlobalNotificationState
/// stop prior to hearing about the relevant start).
/// </summary>
private readonly object _globalNotificationsGate = new object();
private Task<GlobalNotificationState> _globalNotificationsTask = Task.FromResult(GlobalNotificationState.NotStarted);
private readonly Task<GlobalNotificationState> _globalNotificationsTask = Task.FromResult(GlobalNotificationState.NotStarted);
public static async Task<RemoteHostClient?> CreateAsync(
Workspace workspace, CancellationToken cancellationToken)
public static async Task<ServiceHubRemoteHostClient?> CreateAsync(Workspace workspace, CancellationToken cancellationToken)
{
try
using (Logger.LogBlock(FunctionId.ServiceHubRemoteHostClient_CreateAsync, cancellationToken))
{
using (Logger.LogBlock(FunctionId.ServiceHubRemoteHostClient_CreateAsync, cancellationToken))
{
var primary = new HubClient("ManagedLanguage.IDE.RemoteHostClient");
var timeout = TimeSpan.FromMilliseconds(workspace.Options.GetOption(RemoteHostOptions.RequestServiceTimeoutInMS));
// Retry (with timeout) until we can connect to RemoteHost (service hub process).
// We are seeing cases where we fail to connect to the service hub process when a machine is under heavy load
// (see https://devdiv.visualstudio.com/DevDiv/_workitems/edit/481103 for one example).
var instance = await Connections.RetryRemoteCallAsync<IOException, ServiceHubRemoteHostClient>(
workspace, () => CreateWorkerAsync(workspace, primary, timeout, cancellationToken), timeout, cancellationToken).ConfigureAwait(false);
instance.Started();
// return instance
return instance;
}
}
catch (SoftCrashException)
{
// At this point, we should have shown the info bar (RemoteHostCrashInfoBar.ShowInfoBar) to users.
// Returning null here will disable OOP for this VS session.
// * Note * this is not an attempt to recover from the exception; it gives users time
// to clean up before restarting VS.
return null;
}
}
var timeout = TimeSpan.FromMilliseconds(workspace.Options.GetOption(RemoteHostOptions.RequestServiceTimeoutInMS));
var enableConnectionPool = workspace.Options.GetOption(RemoteHostOptions.EnableConnectionPool);
var maxConnection = workspace.Options.GetOption(RemoteHostOptions.MaxPoolConnection);
public static async Task<ServiceHubRemoteHostClient> CreateWorkerAsync(Workspace workspace, HubClient primary, TimeSpan timeout, CancellationToken cancellationToken)
{
ServiceHubRemoteHostClient? client = null;
try
{
// Give each client a unique id so that we can distinguish different clients when the service is restarted.
var current = CreateClientId(Process.GetCurrentProcess().Id.ToString());
var clientId = CreateClientId(Process.GetCurrentProcess().Id.ToString());
var hostGroup = new HostGroup(current);
var hostGroup = new HostGroup(clientId);
var primary = new HubClient("ManagedLanguage.IDE.RemoteHostClient");
// Create the RemotableDataJsonRpc before we create the remote host: this call implicitly sets up the remote IExperimentationService so that will be available for later calls
var remotableDataRpc = new RemotableDataJsonRpc(
workspace, primary.Logger,
await Connections.RequestServiceAsync(workspace, primary, WellKnownServiceHubServices.SnapshotService, hostGroup, timeout, cancellationToken).ConfigureAwait(false));
ServiceHubRemoteHostClient? client = null;
try
{
// Create the RemotableDataJsonRpc before we create the remote host: this call implicitly sets up the remote IExperimentationService so that will be available for later calls
var snapshotServiceStream = await Connections.RequestServiceAsync(workspace, primary, WellKnownServiceHubServices.SnapshotService, hostGroup, timeout, cancellationToken).ConfigureAwait(false);
var remoteHostStream = await Connections.RequestServiceAsync(workspace, primary, WellKnownRemoteHostServices.RemoteHostService, hostGroup, timeout, cancellationToken).ConfigureAwait(false);
var remoteHostStream = await Connections.RequestServiceAsync(workspace, primary, WellKnownRemoteHostServices.RemoteHostService, hostGroup, timeout, cancellationToken).ConfigureAwait(false);
var remotableDataRpc = new RemotableDataJsonRpc(workspace, primary.Logger, snapshotServiceStream);
var connectionManager = new ConnectionManager(primary, hostGroup, enableConnectionPool, maxConnection, timeout, new ReferenceCountedDisposable<RemotableDataJsonRpc>(remotableDataRpc));
var enableConnectionPool = workspace.Options.GetOption(RemoteHostOptions.EnableConnectionPool);
var maxConnection = workspace.Options.GetOption(RemoteHostOptions.MaxPoolConnection);
client = new ServiceHubRemoteHostClient(workspace, primary.Logger, connectionManager, remoteHostStream);
var connectionManager = new ConnectionManager(primary, hostGroup, enableConnectionPool, maxConnection, timeout, new ReferenceCountedDisposable<RemotableDataJsonRpc>(remotableDataRpc));
var uiCultureLCID = CultureInfo.CurrentUICulture.LCID;
var cultureLCID = CultureInfo.CurrentCulture.LCID;
client = new ServiceHubRemoteHostClient(workspace, primary.Logger, connectionManager, remoteHostStream);
// make sure connection is done right
var host = await client._rpc.InvokeWithCancellationAsync<string>(
nameof(IRemoteHostService.Connect), new object[] { clientId, uiCultureLCID, cultureLCID, TelemetryService.DefaultSession.SerializeSettings() }, cancellationToken).ConfigureAwait(false);
var uiCultureLCID = CultureInfo.CurrentUICulture.LCID;
var cultureLCID = CultureInfo.CurrentCulture.LCID;
client.Started();
// make sure connection is done right
var host = await client._rpc.InvokeWithCancellationAsync<string>(
nameof(IRemoteHostService.Connect), new object[] { current, uiCultureLCID, cultureLCID, TelemetryService.DefaultSession.SerializeSettings() }, cancellationToken).ConfigureAwait(false);
return client;
}
catch (ConnectionLostException ex)
{
RemoteHostCrashInfoBar.ShowInfoBar(workspace, ex);
return client;
}
catch (ConnectionLostException ex)
{
RemoteHostCrashInfoBar.ShowInfoBar(workspace, ex);
Shutdown(ex);
Shutdown(client, ex, cancellationToken);
// Don't crash VS because OOP failed to start. We show an info bar telling users to restart,
// but never physically crash VS.
return null;
}
catch (SoftCrashException ex)
{
Shutdown(ex);
// Don't crash VS because OOP failed to start. We show an info bar telling users to restart,
// but never physically crash VS.
throw new SoftCrashException("Connection Lost", ex, cancellationToken);
}
catch (Exception ex)
{
Shutdown(client, ex, cancellationToken);
throw;
}
// At this point, we should have shown the info bar (RemoteHostCrashInfoBar.ShowInfoBar) to users.
// Returning null here will disable OOP for this VS session.
// * Note * this is not an attempt to recover from the exception; it gives users time
// to clean up before restarting VS.
return null;
}
catch (Exception ex)
{
Shutdown(ex);
throw;
}
static void Shutdown(ServiceHubRemoteHostClient? client, Exception ex, CancellationToken cancellationToken)
{
// make sure we shutdown client if initializing client has failed.
client?.Shutdown();
void Shutdown(Exception ex)
{
// make sure we shutdown client if initializing client has failed.
client?.Shutdown();
// translate to our own cancellation if it is raised.
cancellationToken.ThrowIfCancellationRequested();
// translate to our own cancellation if it is raised.
cancellationToken.ThrowIfCancellationRequested();
// otherwise, report watson
ex.ReportServiceHubNFW("ServiceHub creation failed");
// otherwise, report watson
ex.ReportServiceHubNFW("ServiceHub creation failed");
}
}
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册